PyPI - warp-lang - Versions diffs - 1.4.2__py3-none-macosx_10_13_universal2.whl → 1.5.1__py3-none-macosx_10_13_universal2.whl - Mend

warp-lang 1.4.2__py3-none-macosx_10_13_universal2.whl → 1.5.1__py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (165)

warp/__init__.py +4 -0
warp/autograd.py +43 -8
warp/bin/libwarp.dylib +0 -0
warp/build.py +21 -2
warp/build_dll.py +23 -6
warp/builtins.py +1819 -7
warp/codegen.py +197 -61
warp/config.py +2 -2
warp/context.py +379 -107
warp/examples/assets/pixel.jpg +0 -0
warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
warp/examples/benchmarks/benchmark_gemm.py +121 -0
warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
warp/examples/benchmarks/benchmark_tile.py +179 -0
warp/examples/fem/example_adaptive_grid.py +37 -10
warp/examples/fem/example_apic_fluid.py +3 -2
warp/examples/fem/example_convection_diffusion_dg.py +4 -5
warp/examples/fem/example_deformed_geometry.py +1 -1
warp/examples/fem/example_diffusion_3d.py +47 -4
warp/examples/fem/example_distortion_energy.py +220 -0
warp/examples/fem/example_magnetostatics.py +127 -85
warp/examples/fem/example_nonconforming_contact.py +5 -5
warp/examples/fem/example_stokes.py +3 -1
warp/examples/fem/example_streamlines.py +12 -19
warp/examples/fem/utils.py +38 -15
warp/examples/sim/example_cloth.py +4 -25
warp/examples/sim/example_quadruped.py +2 -1
warp/examples/tile/example_tile_convolution.py +58 -0
warp/examples/tile/example_tile_fft.py +47 -0
warp/examples/tile/example_tile_filtering.py +105 -0
warp/examples/tile/example_tile_matmul.py +79 -0
warp/examples/tile/example_tile_mlp.py +375 -0
warp/fem/__init__.py +8 -0
warp/fem/cache.py +16 -12
warp/fem/dirichlet.py +1 -1
warp/fem/domain.py +44 -1
warp/fem/field/__init__.py +1 -2
warp/fem/field/field.py +31 -19
warp/fem/field/nodal_field.py +101 -49
warp/fem/field/virtual.py +794 -0
warp/fem/geometry/__init__.py +2 -2
warp/fem/geometry/deformed_geometry.py +3 -105
warp/fem/geometry/element.py +13 -0
warp/fem/geometry/geometry.py +165 -7
warp/fem/geometry/grid_2d.py +3 -6
warp/fem/geometry/grid_3d.py +31 -28
warp/fem/geometry/hexmesh.py +3 -46
warp/fem/geometry/nanogrid.py +3 -2
warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
warp/fem/geometry/tetmesh.py +2 -43
warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
warp/fem/integrate.py +683 -261
warp/fem/linalg.py +404 -0
warp/fem/operator.py +101 -18
warp/fem/polynomial.py +5 -5
warp/fem/quadrature/quadrature.py +45 -21
warp/fem/space/__init__.py +45 -11
warp/fem/space/basis_function_space.py +451 -0
warp/fem/space/basis_space.py +58 -11
warp/fem/space/function_space.py +146 -5
warp/fem/space/grid_2d_function_space.py +80 -66
warp/fem/space/grid_3d_function_space.py +113 -68
warp/fem/space/hexmesh_function_space.py +96 -108
warp/fem/space/nanogrid_function_space.py +62 -110
warp/fem/space/quadmesh_function_space.py +208 -0
warp/fem/space/shape/__init__.py +45 -7
warp/fem/space/shape/cube_shape_function.py +328 -54
warp/fem/space/shape/shape_function.py +10 -1
warp/fem/space/shape/square_shape_function.py +328 -60
warp/fem/space/shape/tet_shape_function.py +269 -19
warp/fem/space/shape/triangle_shape_function.py +238 -19
warp/fem/space/tetmesh_function_space.py +69 -37
warp/fem/space/topology.py +38 -0
warp/fem/space/trimesh_function_space.py +179 -0
warp/fem/utils.py +6 -331
warp/jax_experimental.py +3 -1
warp/native/array.h +15 -0
warp/native/builtin.h +66 -26
warp/native/bvh.h +4 -0
warp/native/coloring.cpp +604 -0
warp/native/cuda_util.cpp +68 -51
warp/native/cuda_util.h +2 -1
warp/native/fabric.h +8 -0
warp/native/hashgrid.h +4 -0
warp/native/marching.cu +8 -0
warp/native/mat.h +14 -3
warp/native/mathdx.cpp +59 -0
warp/native/mesh.h +4 -0
warp/native/range.h +13 -1
warp/native/reduce.cpp +9 -1
warp/native/reduce.cu +7 -0
warp/native/runlength_encode.cpp +9 -1
warp/native/runlength_encode.cu +7 -1
warp/native/scan.cpp +8 -0
warp/native/scan.cu +8 -0
warp/native/scan.h +8 -1
warp/native/sparse.cpp +8 -0
warp/native/sparse.cu +8 -0
warp/native/temp_buffer.h +7 -0
warp/native/tile.h +1854 -0
warp/native/tile_gemm.h +341 -0
warp/native/tile_reduce.h +210 -0
warp/native/volume_builder.cu +8 -0
warp/native/volume_builder.h +8 -0
warp/native/warp.cpp +10 -2
warp/native/warp.cu +369 -15
warp/native/warp.h +12 -2
warp/optim/adam.py +39 -4
warp/paddle.py +29 -12
warp/render/render_opengl.py +140 -67
warp/sim/graph_coloring.py +292 -0
warp/sim/import_urdf.py +8 -8
warp/sim/integrator_euler.py +4 -2
warp/sim/integrator_featherstone.py +115 -44
warp/sim/integrator_vbd.py +6 -0
warp/sim/model.py +109 -32
warp/sparse.py +1 -1
warp/stubs.py +569 -4
warp/tape.py +12 -7
warp/tests/assets/pixel.npy +0 -0
warp/tests/aux_test_instancing_gc.py +18 -0
warp/tests/test_array.py +39 -0
warp/tests/test_codegen.py +81 -1
warp/tests/test_codegen_instancing.py +30 -0
warp/tests/test_collision.py +110 -0
warp/tests/test_coloring.py +251 -0
warp/tests/test_context.py +34 -0
warp/tests/test_examples.py +21 -5
warp/tests/test_fem.py +453 -113
warp/tests/test_func.py +34 -4
warp/tests/test_generics.py +52 -0
warp/tests/test_iter.py +68 -0
warp/tests/test_lerp.py +13 -87
warp/tests/test_mat_scalar_ops.py +1 -1
warp/tests/test_matmul.py +6 -9
warp/tests/test_matmul_lite.py +6 -11
warp/tests/test_mesh_query_point.py +1 -1
warp/tests/test_module_hashing.py +23 -0
warp/tests/test_overwrite.py +45 -0
warp/tests/test_paddle.py +27 -87
warp/tests/test_print.py +56 -1
warp/tests/test_smoothstep.py +17 -83
warp/tests/test_spatial.py +1 -1
warp/tests/test_static.py +3 -3
warp/tests/test_tile.py +744 -0
warp/tests/test_tile_mathdx.py +144 -0
warp/tests/test_tile_mlp.py +383 -0
warp/tests/test_tile_reduce.py +374 -0
warp/tests/test_tile_shared_memory.py +190 -0
warp/tests/test_vbd.py +12 -20
warp/tests/test_volume.py +43 -0
warp/tests/unittest_suites.py +19 -2
warp/tests/unittest_utils.py +4 -2
warp/types.py +340 -74
warp/utils.py +23 -3
{warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/METADATA +32 -7
{warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/RECORD +160 -133
{warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/WHEEL +1 -1
warp/fem/field/test.py +0 -180
warp/fem/field/trial.py +0 -183
warp/fem/space/collocated_function_space.py +0 -102
warp/fem/space/quadmesh_2d_function_space.py +0 -261
warp/fem/space/trimesh_2d_function_space.py +0 -153
{warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/LICENSE.md +0 -0
{warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/top_level.txt +0 -0

warp/examples/fem/example_stokes.py CHANGED

@@ -200,4 +200,6 @@ if __name__ == "__main__":
         example.render()
         if not args.headless:
-            example.renderer.plot(options={"velocity": {"streamlines": {}}, "pressure": {"contours": {}}})
+            example.renderer.plot(
+                options={"velocity": {"streamlines": {}}, "pressure": {"contours": {}}}, backend="matplotlib"
+            )

warp/examples/fem/example_streamlines.py CHANGED

@@ -82,7 +82,7 @@ def mass_form(
     u: fem.Field,
     v: fem.Field,
 ):
-    return u(s) * v(s)
+    return wp.dot(u(s), v(s))
 @fem.integrand
@@ -199,7 +199,7 @@ class Example:
             domain=self._inflow, order=self._degree, family=fem.Polynomial.GAUSS_LEGENDRE
         )
         n_streamlines = streamline_spawn.total_point_count()
-        spawn_points = wp.array(dtype=wp.vec3, shape=n_streamlines)
+        spawn_points = wp.empty(dtype=wp.vec3, shape=n_streamlines)
         jitter_amount = self._streamline_dx / self._degree
         fem.interpolate(
@@ -212,8 +212,8 @@ class Example:
         # to populate the per-point data
         point_count = self._streamline_point_count
-        points = wp.array(dtype=wp.vec3, shape=(n_streamlines, point_count))
-        speed = wp.array(dtype=float, shape=(n_streamlines, point_count))
+        points = wp.empty(dtype=wp.vec3, shape=(n_streamlines, point_count))
+        speed = wp.empty(dtype=float, shape=(n_streamlines, point_count))
         fem.interpolate(
             gen_streamlines,
@@ -235,7 +235,7 @@ class Example:
     def render(self):
         # self.renderer.add_field("solution", self.pressure_field)
         self.plot.add_field("pressure", self.pressure_field)
-        self.plot.add_field("velocity", self.velocity_field)
+        # self.plot.add_field("velocity", self.velocity_field)
         if self.renderer is not None:
             streamline_count = self._points.shape[0]
@@ -259,10 +259,11 @@ class Example:
             self.renderer.end_frame()
     def _generate_incompressible_flow(self):
-        # Function spaces for velocity, scalars and pressure (Pk / Pk / Pk-1)
-        u_space = fem.make_polynomial_space(geo=self._geo, degree=self._degree, dtype=wp.vec3)
-        s_space = fem.make_polynomial_space(geo=self._geo, degree=self._degree, dtype=float)
-        p_space = fem.make_polynomial_space(geo=self._geo, degree=self._degree - 1, dtype=float)
+        # Function spaces for velocity and pressure (RT1 / P0)
+        u_space = fem.make_polynomial_space(
+            geo=self._geo, element_basis=fem.ElementBasis.RAVIART_THOMAS, degree=1, dtype=wp.vec3
+        )
+        p_space = fem.make_polynomial_space(geo=self._geo, degree=0, dtype=float)
         self.pressure_field = p_space.make_field()
         self.velocity_field = u_space.make_field()
@@ -288,8 +289,8 @@ class Example:
         fem.interpolate(inflow_velocity, dest=fem.make_restriction(self.velocity_field, domain=self._inflow))
         # (Diagonal) mass matrix
-        rho_test = fem.make_test(s_space)
-        rho_trial = fem.make_trial(s_space)
+        rho_test = fem.make_test(u_space)
+        rho_trial = fem.make_trial(u_space)
         inv_mass_matrix = fem.integrate(
             mass_form, fields={"u": rho_trial, "v": rho_test}, nodal=True, output_dtype=float
         )
@@ -341,11 +342,3 @@ if __name__ == "__main__":
         example.step()
         example.render()
-        if not args.headless:
-            example.plot.plot(
-                {
-                    "velocity": {"streamlines": {"density": 2}},
-                    "pressure": {"contours": {}},
-                }
-            )

warp/examples/fem/utils.py CHANGED

@@ -31,7 +31,7 @@ def gen_trimesh(res, bounds_lo: Optional[wp.vec2] = None, bounds_hi: Optional[wp
     Args:
         res: Resolution of the grid along each dimension
         bounds_lo: Position of the lower bound of the axis-aligned grid
-        bounds_up: Position of the upper bound of the axis-aligned grid
+        bounds_hi: Position of the upper bound of the axis-aligned grid
     Returns:
         Tuple of ndarrays: (Vertex positions, Triangle vertex indices)
@@ -62,7 +62,7 @@ def gen_tetmesh(res, bounds_lo: Optional[wp.vec3] = None, bounds_hi: Optional[wp
     Args:
         res: Resolution of the grid along each dimension
         bounds_lo: Position of the lower bound of the axis-aligned grid
-        bounds_up: Position of the upper bound of the axis-aligned grid
+        bounds_hi: Position of the upper bound of the axis-aligned grid
     Returns:
         Tuple of ndarrays: (Vertex positions, Tetrahedron vertex indices)
@@ -95,7 +95,7 @@ def gen_quadmesh(res, bounds_lo: Optional[wp.vec2] = None, bounds_hi: Optional[w
     Args:
         res: Resolution of the grid along each dimension
         bounds_lo: Position of the lower bound of the axis-aligned grid
-        bounds_up: Position of the upper bound of the axis-aligned grid
+        bounds_hi: Position of the upper bound of the axis-aligned grid
     Returns:
         Tuple of ndarrays: (Vertex positions, Triangle vertex indices)
@@ -125,7 +125,7 @@ def gen_hexmesh(res, bounds_lo: Optional[wp.vec3] = None, bounds_hi: Optional[wp
     Args:
         res: Resolution of the grid along each dimension
         bounds_lo: Position of the lower bound of the axis-aligned grid
-        bounds_up: Position of the upper bound of the axis-aligned grid
+        bounds_hi: Position of the upper bound of the axis-aligned grid
     Returns:
         Tuple of ndarrays: (Vertex positions, Triangle vertex indices)
@@ -158,7 +158,7 @@ def gen_volume(res, bounds_lo: Optional[wp.vec3] = None, bounds_hi: Optional[wp.
     Args:
         res: Resolution of the grid along each dimension
         bounds_lo: Position of the lower bound of the axis-aligned grid
-        bounds_up: Position of the upper bound of the axis-aligned grid
+        bounds_hi: Position of the upper bound of the axis-aligned grid
         device: Cuda device on which to allocate the grid
     """
@@ -575,6 +575,7 @@ class Plot:
     def _plot_pyvista(self, options: Dict[str, Any]):
         import pyvista
+        import pyvista.themes
         grids = {}
         scales = {}
@@ -702,7 +703,7 @@ class Plot:
         subplot_rows = options.get("rows", 1)
         subplot_shape = (subplot_rows, (len(grids) + subplot_rows - 1) // subplot_rows)
-        plotter = pyvista.Plotter(shape=subplot_shape)
+        plotter = pyvista.Plotter(shape=subplot_shape, theme=pyvista.themes.DocumentProTheme())
         plotter.link_views()
         plotter.add_camera_orientation_widget()
         for index, (name, grid) in enumerate(grids.items()):
@@ -717,7 +718,7 @@ class Plot:
                     plotter.view_xy()
                 else:
                     plotter.add_mesh(marker)
-            elif field.space.dimension == 3:
+            elif field.space.geometry.cell_dimension == 3:
                 plotter.add_mesh_clip_plane(grid, show_edges=True, clim=value_range, assign_to_axis="z")
             else:
                 plotter.add_mesh(grid, show_edges=True, clim=value_range)
@@ -809,6 +810,8 @@ class Plot:
                 if "arrows" in args or "streamlines" in args:
                     plot_opts["glyph_scale"] = args.get("arrows", {}).get("glyph_scale", 1.0)
                     plot_fn = _plot_quivers_3d
+                elif field.space.geometry.cell_dimension == 2:
+                    plot_fn = _plot_surface
                 else:
                     plot_fn = _plot_3d_scatter
                 plot_3d = True
@@ -856,23 +859,43 @@ def _field_triangulation(field):
 def _plot_surface(field, axes, **kwargs):
-    Z = _value_or_magnitude(field.dof_values.numpy())
+    from matplotlib.cm import get_cmap
+    from matplotlib.colors import Normalize
-    if "clim" in kwargs:
-        axes.set_zlim(*kwargs["clim"])
+    C = _value_or_magnitude(field.dof_values.numpy())
+    positions = field.space.node_positions().numpy().T
+    if field.space.dimension == 3:
+        X, Y, Z = positions
+    else:
+        X, Y = positions
+        Z = C
+        axes.set_zlim(kwargs["clim"])
     if hasattr(field.space, "node_grid"):
         X, Y = field.space.node_grid()
-        Z = Z.reshape(X.shape)
-        return axes.plot_surface(X, Y, Z, linewidth=0.1, antialiased=False, **kwargs)
+        C = C.reshape(X.shape)
+        return axes.plot_surface(X, Y, C, linewidth=0.1, antialiased=False, **kwargs)
     if hasattr(field.space, "node_triangulation"):
         triangulation = _field_triangulation(field)
-        return axes.plot_trisurf(triangulation, Z, linewidth=0.1, antialiased=False, **kwargs)
+        if field.space.dimension == 3:
+            plot = axes.plot_trisurf(triangulation, Z, linewidth=0.1, antialiased=False)
+            # change colors -- recompute color map manually
+            vmin, vmax = kwargs["clim"]
+            norm = Normalize(vmin=vmin, vmax=vmax)
+            values = np.mean(C[triangulation.triangles], axis=1)
+            colors = get_cmap(kwargs["cmap"])(norm(values))
+            plot.set_norm(norm)
+            plot.set_fc(colors)
+        else:
+            plot = axes.plot_trisurf(triangulation, C, linewidth=0.1, antialiased=False, **kwargs)
+        return plot
     # scatter
-    X, Y = field.space.node_positions().numpy().T
-    return axes.scatter(X, Y, Z, c=Z, **kwargs)
+    return axes.scatter(X, Y, Z, c=C, **kwargs)
 def _plot_displaced_tri_mesh(field, axes, **kwargs):

warp/examples/sim/example_cloth.py CHANGED

@@ -26,29 +26,6 @@ import warp.sim
 import warp.sim.render
-def color_lattice_grid(num_x, num_y):
-    colors = []
-    for _i in range(4):
-        colors.append([])
-    for xi in range(num_x + 1):
-        for yi in range(num_y + 1):
-            vId = xi * (num_y + 1) + yi
-            a = 1 if xi % 2 else 0
-            b = 1 if yi % 2 else 0
-            c = a * 2 + b
-            colors[c].append(vId)
-    colors_wp = []
-    for i_color in range(len(colors)):
-        colors_wp.append(wp.array(colors[i_color], dtype=wp.int32))
-    return colors_wp
 class IntegratorType(Enum):
     EULER = "euler"
     XPBD = "xpbd"
@@ -122,6 +99,7 @@ class Example:
                 tri_ke=1e4,
                 tri_ka=1e4,
                 tri_kd=1e-5,
+                edge_ke=100,
             )
         usd_stage = Usd.Stage.Open(os.path.join(warp.examples.get_asset_directory(), "bunny.usd"))
@@ -143,6 +121,9 @@ class Example:
             kf=1.0e1,
         )
+        if self.integrator_type == IntegratorType.VBD:
+            builder.color()
         self.model = builder.finalize()
         self.model.ground = True
         self.model.soft_contact_ke = 1.0e4
@@ -154,8 +135,6 @@ class Example:
             self.integrator = wp.sim.XPBDIntegrator(iterations=1)
         else:
             self.integrator = wp.sim.VBDIntegrator(self.model, iterations=1)
-            # we need to give VBD coloring information
-            self.model.particle_coloring = color_lattice_grid(width, height)
         self.state_0 = self.model.state()
         self.state_1 = self.model.state()

warp/examples/sim/example_quadruped.py CHANGED

@@ -115,10 +115,11 @@ class Example:
         self.model.joint_attach_ke = 16000.0
         self.model.joint_attach_kd = 200.0
+        self.use_tile_gemm = False
         # self.integrator = wp.sim.XPBDIntegrator()
         # self.integrator = wp.sim.SemiImplicitIntegrator()
-        self.integrator = wp.sim.FeatherstoneIntegrator(self.model)
+        self.integrator = wp.sim.FeatherstoneIntegrator(self.model, use_tile_gemm=self.use_tile_gemm)
         if stage_path:
             self.renderer = wp.sim.render.SimRenderer(self.model, stage_path)

warp/examples/tile/example_tile_convolution.py ADDED

@@ -0,0 +1,58 @@
+# Copyright (c) 2024 NVIDIA CORPORATION.  All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+###########################################################################
+# Example Tile Convolution
+#
+# Shows how to write a simple convolution kernel using Warp FFT tile
+# primitives.
+#
+###########################################################################
+import numpy as np
+import warp as wp
+wp.set_module_options({"enable_backward": False})
+BLOCK_DIM = 64
+TILE_M = 1
+TILE_N = 128
+scale = wp.vec2d(wp.float64(1 / TILE_N), wp.float64(1 / TILE_N))
+@wp.func
+def filter(x: wp.vec2d):
+    return wp.cw_mul(x, scale)
+@wp.kernel
+def conv_tiled(x: wp.array2d(dtype=wp.vec2d), y: wp.array2d(dtype=wp.vec2d)):
+    i, j, _ = wp.tid()
+    a = wp.tile_load(x, i, j, m=TILE_M, n=TILE_N)
+    wp.tile_fft(a)
+    b = wp.tile_map(filter, a)
+    wp.tile_ifft(b)
+    wp.tile_store(y, i, j, b)
+if __name__ == "__main__":
+    wp.set_device("cuda:0")
+    rng = np.random.default_rng(42)
+    x_h = rng.standard_normal((TILE_M, TILE_N, 2), dtype=np.float64)
+    y_h = np.zeros_like(x_h)
+    x_wp = wp.array2d(x_h, dtype=wp.vec2d)
+    y_wp = wp.array2d(y_h, dtype=wp.vec2d)
+    wp.launch_tiled(conv_tiled, dim=[1, 1], inputs=[x_wp], outputs=[y_wp], block_dim=BLOCK_DIM)
+    # Since filter is 1/N, conv_tiled is a ~no-op
+    assert np.allclose(x_h, y_wp.numpy())

warp/examples/tile/example_tile_fft.py ADDED

@@ -0,0 +1,47 @@
+# Copyright (c) 2024 NVIDIA CORPORATION.  All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+###########################################################################
+# Example Tile FFT
+#
+# Shows how to write a simple FFT kernel using Warp tile primitives.
+#
+###########################################################################
+import numpy as np
+import warp as wp
+wp.set_module_options({"enable_backward": False})
+BLOCK_DIM = 8
+TILE_M = 1
+TILE_N = 32
+@wp.kernel
+def fft_tiled(x: wp.array2d(dtype=wp.vec2d), y: wp.array2d(dtype=wp.vec2d)):
+    i, j, _ = wp.tid()
+    a = wp.tile_load(x, i, j, m=TILE_M, n=TILE_N)
+    wp.tile_fft(a)
+    wp.tile_ifft(a)
+    wp.tile_store(y, i, j, a)
+if __name__ == "__main__":
+    wp.set_device("cuda:0")
+    x_h = np.ones((TILE_M, TILE_N, 2), dtype=np.float64)
+    x_h[:, :, 1] = 0
+    y_h = 3 * np.ones((TILE_M, TILE_N, 2), dtype=np.float64)
+    x_wp = wp.array2d(x_h, dtype=wp.vec2d)
+    y_wp = wp.array2d(y_h, dtype=wp.vec2d)
+    wp.launch_tiled(fft_tiled, dim=[1, 1], inputs=[x_wp], outputs=[y_wp], block_dim=BLOCK_DIM)
+    print("Inputs:\n", x_wp)  # [1+0i, 1+0i, 1+0i, ...]
+    print("Output:\n", y_wp)  # [32+0i, 0, 0, ...]

warp/examples/tile/example_tile_filtering.py ADDED

@@ -0,0 +1,105 @@
+# Copyright (c) 2024 NVIDIA CORPORATION.  All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+###########################################################################
+# Example Tile Filtering
+#
+# Shows how to write a simple filtering kernel using Warp FFT tile
+# primitives.
+#
+###########################################################################
+import numpy as np
+import warp as wp
+wp.set_module_options({"enable_backward": False})
+BLOCK_DIM = 128
+TILE_M = 1
+TILE_N = 512
+scale = wp.vec2d(wp.float64(1 / TILE_N), wp.float64(1 / TILE_N))
+def cplx(array):
+    return array[..., 0] + 1j * array[..., 1]
+@wp.func
+def cplx_prod(x: wp.vec2d, y: wp.vec2d):
+    return wp.cw_mul(wp.vec2d(x[0] * y[0] - x[1] * y[1], x[0] * y[1] + x[1] * y[0]), scale)
+@wp.kernel
+def conv_tiled(x: wp.array2d(dtype=wp.vec2d), y: wp.array2d(dtype=wp.vec2d), z: wp.array2d(dtype=wp.vec2d)):
+    i, j, _ = wp.tid()
+    a = wp.tile_load(x, i, j, m=TILE_M, n=TILE_N)
+    b = wp.tile_load(y, i, j, m=TILE_M, n=TILE_N)
+    wp.tile_fft(a)
+    c = wp.tile_map(cplx_prod, a, b)
+    wp.tile_ifft(c)
+    wp.tile_store(z, i, j, c)
+if __name__ == "__main__":
+    rng = np.random.default_rng(42)
+    # Create noisy input signal
+    t = np.linspace(0, 2 * np.pi, TILE_N, dtype=np.float64)
+    x = np.sin(t) + 0.5 * rng.random(TILE_N, dtype=np.float64)
+    # Create filter. This filter keeps only ~10% of the frequencies at the center
+    # of the spectrum.
+    f = np.ones_like(x)
+    freq = np.fft.fftfreq(TILE_N)
+    f[np.abs(freq) > 0.05] = 0.0
+    f[np.abs(freq) <= 0.05] = 1.0
+    # Create Warp input data
+    # We use vec2d to hold complex numbers
+    x_h = np.zeros((TILE_M, TILE_N, 2), dtype=np.float64)
+    f_h = np.zeros_like(x_h)
+    y_h = np.zeros_like(f_h)
+    x_h[:, :, 0] = x
+    f_h[:, :, 0] = f
+    x_wp = wp.array2d(x_h, dtype=wp.vec2d)
+    f_wp = wp.array2d(f_h, dtype=wp.vec2d)
+    y_wp = wp.array2d(y_h, dtype=wp.vec2d)
+    wp.launch_tiled(conv_tiled, dim=[1, 1], inputs=[x_wp, f_wp], outputs=[y_wp], block_dim=BLOCK_DIM)
+    # Extract output and compare with numpy
+    x_np = cplx(x_h)
+    f_np = cplx(f_h)
+    y_test = cplx(y_wp.numpy())
+    y_ref = np.fft.ifft(f_np * np.fft.fft(x_np))
+    assert np.allclose(y_ref, y_test)
+try:
+    import matplotlib.pyplot as plt
+    fig, ax = plt.subplots(figsize=(10, 5))
+    ax.plot(
+        x,
+        color="#DDDDDD",
+        linewidth=2,
+        label="Original",
+    )
+    ax.plot(y_test[0, :].real, color="#76B900", linewidth=3, label="Smoothed")
+    ax.legend()
+    ax.grid(True)
+    plt.tight_layout()
+    plt.show()
+except ModuleNotFoundError:
+    print("Matplotlib not available; skipping figure")

warp/examples/tile/example_tile_matmul.py ADDED

@@ -0,0 +1,79 @@
+# Copyright (c) 2024 NVIDIA CORPORATION.  All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+###########################################################################
+# Example Tile MatMul
+#
+# Shows how to write a simple GEMM kernel using Warp tile primitives.
+#
+###########################################################################
+import numpy as np
+import warp as wp
+# tile size
+TILE_M = wp.constant(8)
+TILE_N = wp.constant(4)
+TILE_K = wp.constant(8)
+# num threads per-tile
+TILE_THREADS = 64
+@wp.kernel
+def tile_gemm(A: wp.array2d(dtype=wp.float32), B: wp.array2d(dtype=wp.float16), C: wp.array2d(dtype=wp.float64)):
+    # output tile index
+    i, j = wp.tid()
+    sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float64)
+    _M = A.shape[0]
+    _N = B.shape[1]
+    K = A.shape[1]
+    count = int(K / TILE_K)
+    for k in range(0, count):
+        a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)
+        b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)
+        # sum += a*b
+        wp.tile_matmul(a, b, sum)
+    wp.tile_store(C, i, j, sum)
+if __name__ == "__main__":
+    wp.set_device("cuda:0")
+    # generate some tile aligned matrix dimensions
+    M = TILE_M * 7
+    K = TILE_K * 6
+    N = TILE_N * 5
+    rng = np.random.default_rng(42)
+    A = rng.random((M, K), dtype=np.float32)
+    B = rng.random((K, N), dtype=np.float32).astype(np.float16)
+    C = np.zeros((M, N), dtype=np.float64)
+    A_wp = wp.array(A, requires_grad=True)
+    B_wp = wp.array(B, requires_grad=True)
+    C_wp = wp.array(C, requires_grad=True)
+    with wp.Tape() as tape:
+        wp.launch_tiled(
+            tile_gemm,
+            dim=(int(M / TILE_M), int(N / TILE_N)),
+            inputs=[A_wp, B_wp],
+            outputs=[C_wp],
+            block_dim=TILE_THREADS,
+        )
+    assert np.allclose(C_wp.numpy(), A @ B)
+    print("Example matrix multiplication passed")