warp-lang 1.0.0b5-py3-none-manylinux2014_x86_64.whl → 1.0.0b6-py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. docs/conf.py +3 -4
  2. examples/env/env_ant.py +1 -1
  3. examples/env/env_cartpole.py +1 -1
  4. examples/env/env_humanoid.py +1 -1
  5. examples/example_dem.py +28 -26
  6. examples/example_diffray.py +37 -30
  7. examples/example_fluid.py +7 -3
  8. examples/example_jacobian_ik.py +1 -1
  9. examples/example_mesh_intersect.py +10 -7
  10. examples/example_nvdb.py +3 -3
  11. examples/example_render_opengl.py +19 -10
  12. examples/example_sim_cartpole.py +9 -5
  13. examples/example_sim_cloth.py +29 -25
  14. examples/example_sim_fk_grad.py +2 -2
  15. examples/example_sim_fk_grad_torch.py +3 -3
  16. examples/example_sim_grad_bounce.py +11 -8
  17. examples/example_sim_grad_cloth.py +12 -9
  18. examples/example_sim_granular.py +2 -2
  19. examples/example_sim_granular_collision_sdf.py +13 -13
  20. examples/example_sim_neo_hookean.py +3 -3
  21. examples/example_sim_particle_chain.py +2 -2
  22. examples/example_sim_quadruped.py +8 -5
  23. examples/example_sim_rigid_chain.py +8 -5
  24. examples/example_sim_rigid_contact.py +13 -10
  25. examples/example_sim_rigid_fem.py +2 -2
  26. examples/example_sim_rigid_gyroscopic.py +2 -2
  27. examples/example_sim_rigid_kinematics.py +1 -1
  28. examples/example_sim_trajopt.py +3 -2
  29. examples/fem/example_apic_fluid.py +5 -7
  30. examples/fem/example_diffusion_mgpu.py +18 -16
  31. warp/__init__.py +3 -2
  32. warp/bin/warp.so +0 -0
  33. warp/build_dll.py +29 -9
  34. warp/builtins.py +206 -7
  35. warp/codegen.py +58 -38
  36. warp/config.py +3 -1
  37. warp/context.py +234 -128
  38. warp/fem/__init__.py +2 -2
  39. warp/fem/cache.py +2 -1
  40. warp/fem/field/nodal_field.py +18 -17
  41. warp/fem/geometry/hexmesh.py +11 -6
  42. warp/fem/geometry/quadmesh_2d.py +16 -12
  43. warp/fem/geometry/tetmesh.py +19 -8
  44. warp/fem/geometry/trimesh_2d.py +18 -7
  45. warp/fem/integrate.py +341 -196
  46. warp/fem/quadrature/__init__.py +1 -1
  47. warp/fem/quadrature/pic_quadrature.py +138 -53
  48. warp/fem/quadrature/quadrature.py +81 -9
  49. warp/fem/space/__init__.py +1 -1
  50. warp/fem/space/basis_space.py +169 -51
  51. warp/fem/space/grid_2d_function_space.py +2 -2
  52. warp/fem/space/grid_3d_function_space.py +2 -2
  53. warp/fem/space/hexmesh_function_space.py +2 -2
  54. warp/fem/space/partition.py +9 -6
  55. warp/fem/space/quadmesh_2d_function_space.py +2 -2
  56. warp/fem/space/shape/cube_shape_function.py +27 -15
  57. warp/fem/space/shape/square_shape_function.py +29 -18
  58. warp/fem/space/tetmesh_function_space.py +2 -2
  59. warp/fem/space/topology.py +10 -0
  60. warp/fem/space/trimesh_2d_function_space.py +2 -2
  61. warp/fem/utils.py +10 -5
  62. warp/native/array.h +49 -8
  63. warp/native/builtin.h +31 -14
  64. warp/native/cuda_util.cpp +8 -3
  65. warp/native/cuda_util.h +1 -0
  66. warp/native/exports.h +1177 -1108
  67. warp/native/intersect.h +4 -4
  68. warp/native/intersect_adj.h +8 -8
  69. warp/native/mat.h +65 -6
  70. warp/native/mesh.h +126 -5
  71. warp/native/quat.h +28 -4
  72. warp/native/vec.h +76 -14
  73. warp/native/warp.cu +1 -6
  74. warp/render/render_opengl.py +261 -109
  75. warp/sim/import_mjcf.py +13 -7
  76. warp/sim/import_urdf.py +14 -14
  77. warp/sim/inertia.py +17 -18
  78. warp/sim/model.py +67 -67
  79. warp/sim/render.py +1 -1
  80. warp/sparse.py +6 -6
  81. warp/stubs.py +19 -81
  82. warp/tape.py +1 -1
  83. warp/tests/__main__.py +3 -6
  84. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  85. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  86. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  87. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  88. warp/tests/aux_test_unresolved_func.py +14 -0
  89. warp/tests/aux_test_unresolved_symbol.py +14 -0
  90. warp/tests/{test_kinematics.py → disabled_kinematics.py} +10 -12
  91. warp/tests/run_coverage_serial.py +31 -0
  92. warp/tests/test_adam.py +102 -106
  93. warp/tests/test_arithmetic.py +39 -40
  94. warp/tests/test_array.py +46 -48
  95. warp/tests/test_array_reduce.py +25 -19
  96. warp/tests/test_atomic.py +62 -26
  97. warp/tests/test_bool.py +16 -11
  98. warp/tests/test_builtins_resolution.py +1292 -0
  99. warp/tests/test_bvh.py +9 -12
  100. warp/tests/test_closest_point_edge_edge.py +53 -57
  101. warp/tests/test_codegen.py +164 -134
  102. warp/tests/test_compile_consts.py +13 -19
  103. warp/tests/test_conditional.py +30 -32
  104. warp/tests/test_copy.py +9 -12
  105. warp/tests/test_ctypes.py +90 -98
  106. warp/tests/test_dense.py +20 -14
  107. warp/tests/test_devices.py +34 -35
  108. warp/tests/test_dlpack.py +74 -75
  109. warp/tests/test_examples.py +215 -97
  110. warp/tests/test_fabricarray.py +15 -21
  111. warp/tests/test_fast_math.py +14 -11
  112. warp/tests/test_fem.py +280 -97
  113. warp/tests/test_fp16.py +19 -15
  114. warp/tests/test_func.py +177 -194
  115. warp/tests/test_generics.py +71 -77
  116. warp/tests/test_grad.py +83 -32
  117. warp/tests/test_grad_customs.py +7 -9
  118. warp/tests/test_hash_grid.py +6 -10
  119. warp/tests/test_import.py +9 -23
  120. warp/tests/test_indexedarray.py +19 -21
  121. warp/tests/test_intersect.py +15 -9
  122. warp/tests/test_large.py +17 -19
  123. warp/tests/test_launch.py +14 -17
  124. warp/tests/test_lerp.py +63 -63
  125. warp/tests/test_lvalue.py +84 -35
  126. warp/tests/test_marching_cubes.py +9 -13
  127. warp/tests/test_mat.py +388 -3004
  128. warp/tests/test_mat_lite.py +9 -12
  129. warp/tests/test_mat_scalar_ops.py +2889 -0
  130. warp/tests/test_math.py +10 -11
  131. warp/tests/test_matmul.py +104 -100
  132. warp/tests/test_matmul_lite.py +72 -98
  133. warp/tests/test_mesh.py +35 -32
  134. warp/tests/test_mesh_query_aabb.py +18 -25
  135. warp/tests/test_mesh_query_point.py +39 -23
  136. warp/tests/test_mesh_query_ray.py +9 -21
  137. warp/tests/test_mlp.py +8 -9
  138. warp/tests/test_model.py +89 -93
  139. warp/tests/test_modules_lite.py +15 -25
  140. warp/tests/test_multigpu.py +87 -114
  141. warp/tests/test_noise.py +10 -12
  142. warp/tests/test_operators.py +14 -21
  143. warp/tests/test_options.py +10 -11
  144. warp/tests/test_pinned.py +16 -18
  145. warp/tests/test_print.py +16 -20
  146. warp/tests/test_quat.py +121 -88
  147. warp/tests/test_rand.py +12 -13
  148. warp/tests/test_reload.py +27 -32
  149. warp/tests/test_rounding.py +7 -10
  150. warp/tests/test_runlength_encode.py +105 -106
  151. warp/tests/test_smoothstep.py +8 -9
  152. warp/tests/test_snippet.py +13 -22
  153. warp/tests/test_sparse.py +30 -29
  154. warp/tests/test_spatial.py +179 -174
  155. warp/tests/test_streams.py +100 -107
  156. warp/tests/test_struct.py +98 -67
  157. warp/tests/test_tape.py +11 -17
  158. warp/tests/test_torch.py +89 -86
  159. warp/tests/test_transient_module.py +9 -12
  160. warp/tests/test_types.py +328 -50
  161. warp/tests/test_utils.py +217 -218
  162. warp/tests/test_vec.py +133 -2133
  163. warp/tests/test_vec_lite.py +8 -11
  164. warp/tests/test_vec_scalar_ops.py +2099 -0
  165. warp/tests/test_volume.py +391 -382
  166. warp/tests/test_volume_write.py +122 -135
  167. warp/tests/unittest_serial.py +35 -0
  168. warp/tests/unittest_suites.py +291 -0
  169. warp/tests/{test_base.py → unittest_utils.py} +138 -25
  170. warp/tests/{test_misc.py → unused_test_misc.py} +13 -5
  171. warp/tests/{test_debug.py → walkthough_debug.py} +2 -15
  172. warp/thirdparty/unittest_parallel.py +257 -54
  173. warp/types.py +119 -98
  174. warp/utils.py +14 -0
  175. {warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/METADATA +2 -1
  176. {warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/RECORD +182 -178
  177. {warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/WHEEL +1 -1
  178. warp/tests/test_all.py +0 -239
  179. warp/tests/test_conditional_unequal_types_kernels.py +0 -14
  180. warp/tests/test_coverage.py +0 -38
  181. warp/tests/test_unresolved_func.py +0 -7
  182. warp/tests/test_unresolved_symbol.py +0 -7
  183. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  184. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  185. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  186. {warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/LICENSE.md +0 -0
  187. {warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/top_level.txt +0 -0
warp/types.py CHANGED
@@ -178,13 +178,13 @@ def vector(length, dtype):
             return warp.add(self, y)
 
         def __radd__(self, y):
-            return warp.add(self, y)
+            return warp.add(y, self)
 
         def __sub__(self, y):
             return warp.sub(self, y)
 
-        def __rsub__(self, x):
-            return warp.sub(x, self)
+        def __rsub__(self, y):
+            return warp.sub(y, self)
 
         def __mul__(self, y):
             return warp.mul(self, y)
@@ -195,7 +195,7 @@ def vector(length, dtype):
         def __truediv__(self, y):
             return warp.div(self, y)
 
-        def __rdiv__(self, x):
+        def __rtruediv__(self, x):
             return warp.div(x, self)
 
         def __pos__(self):
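
Note on the two hunks above: `__radd__` and `__rsub__` now pass the reflected operand in the correct order, and the Python 2 era `__rdiv__` hook is renamed to `__rtruediv__`, which is the name Python 3 actually dispatches to for `scalar / vec`. A minimal, Warp-independent sketch of why the rename matters (class names here are illustrative):

    class Old:
        def __rdiv__(self, x):          # Python 2 name; Python 3 never calls this
            return "reflected divide"

    class New:
        def __rtruediv__(self, x):      # Python 3 hook for the reflected / operator
            return "reflected divide"

    try:
        1.0 / Old()
    except TypeError as exc:
        print("Old:", exc)              # unsupported operand type(s) for /
    print("New:", 1.0 / New())          # prints: New: reflected divide
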
@@ -294,13 +294,13 @@ def matrix(shape, dtype):
             return warp.add(self, y)
 
         def __radd__(self, y):
-            return warp.add(self, y)
+            return warp.add(y, self)
 
         def __sub__(self, y):
             return warp.sub(self, y)
 
-        def __rsub__(self, x):
-            return warp.sub(x, self)
+        def __rsub__(self, y):
+            return warp.sub(y, self)
 
         def __mul__(self, y):
             return warp.mul(self, y)
@@ -317,7 +317,7 @@ def matrix(shape, dtype):
         def __truediv__(self, y):
             return warp.div(self, y)
 
-        def __rdiv__(self, x):
+        def __rtruediv__(self, x):
             return warp.div(x, self)
 
         def __pos__(self):
@@ -582,11 +582,11 @@ def transformation(dtype=Any):
 
         @property
         def p(self):
-            return self[0:3]
+            return vec3(self[0:3])
 
         @property
        def q(self):
-            return self[3:7]
+            return quat(self[3:7])
 
     return transform_t
 
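
The hunk above makes the `p` and `q` properties return proper `vec3` / `quat` values instead of raw component slices. A small sketch of the effect, assuming the usual Python-scope constructors (the exact printed type names may vary with the dtype):

    import warp as wp

    wp.init()

    t = wp.transform(wp.vec3(1.0, 2.0, 3.0), wp.quat(0.0, 0.0, 0.0, 1.0))

    # The properties now return typed values that can be passed straight into
    # other vec3/quat helpers rather than a bare slice of components.
    print(type(t.p))   # a vec3 instance
    print(type(t.q))   # a quat instance
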
@@ -910,18 +910,21 @@ class range_t:
 
 # definition just for kernel type (cannot be a parameter), see bvh.h
 class bvh_query_t:
+    """Object used to track state during BVH traversal."""
     def __init__(self):
         pass
 
 
 # definition just for kernel type (cannot be a parameter), see mesh.h
 class mesh_query_aabb_t:
+    """Object used to track state during mesh traversal."""
     def __init__(self):
         pass
 
 
 # definition just for kernel type (cannot be a parameter), see hash_grid.h
 class hash_grid_query_t:
+    """Object used to track state during neighbor traversal."""
     def __init__(self):
         pass
 
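
These handle types are never constructed from Python; they exist only so that kernel code can name the opaque query state. For reference, a minimal kernel sketch that exercises `hash_grid_query_t` through the standard query loop (the kernel and argument names are illustrative):

    import warp as wp

    @wp.kernel
    def count_neighbors(grid: wp.uint64,
                        points: wp.array(dtype=wp.vec3),
                        radius: float,
                        counts: wp.array(dtype=int)):
        tid = wp.tid()

        # wp.hash_grid_query() returns a hash_grid_query_t that lives only
        # inside the kernel and is advanced with wp.hash_grid_query_next().
        query = wp.hash_grid_query(grid, points[tid], radius)
        index = int(0)
        n = int(0)

        while wp.hash_grid_query_next(query, index):
            if wp.length(points[index] - points[tid]) < radius:
                n += 1

        counts[tid] = n
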
@@ -2979,6 +2982,67 @@ class Volume:
         return volume
 
 
+# definition just for kernel type (cannot be a parameter), see mesh.h
+# NOTE: its layout must match the corresponding struct defined in C.
+# NOTE: it needs to be defined after `indexedarray` to workaround a circular import issue.
+class mesh_query_point_t:
+    """Output for the mesh query point functions.
+
+    Attributes:
+        result (bool): Whether a point is found within the given constraints.
+        sign (float32): A value < 0 if query point is inside the mesh, >=0 otherwise.
+                        Note that mesh must be watertight for this to be robust
+        face (int32): Index of the closest face.
+        u (float32): Barycentric u coordinate of the closest point.
+        v (float32): Barycentric v coordinate of the closest point.
+
+    See Also:
+        :func:`mesh_query_point`, :func:`mesh_query_point_no_sign`,
+        :func:`mesh_query_furthest_point_no_sign`,
+        :func:`mesh_query_point_sign_normal`,
+        and :func:`mesh_query_point_sign_winding_number`.
+    """
+    from warp.codegen import Var
+
+    vars = {
+        "result": Var("result", bool),
+        "sign": Var("sign", float32),
+        "face": Var("face", int32),
+        "u": Var("u", float32),
+        "v": Var("v", float32),
+    }
+
+
+# definition just for kernel type (cannot be a parameter), see mesh.h
+# NOTE: its layout must match the corresponding struct defined in C.
+class mesh_query_ray_t:
+    """Output for the mesh query ray functions.
+
+    Attributes:
+        result (bool): Whether a hit is found within the given constraints.
+        sign (float32): A value > 0 if the ray hit in front of the face, returns < 0 otherwise.
+        face (int32): Index of the closest face.
+        t (float32): Distance of the closest hit along the ray.
+        u (float32): Barycentric u coordinate of the closest hit.
+        v (float32): Barycentric v coordinate of the closest hit.
+        normal (vec3f): Face normal.
+
+    See Also:
+        :func:`mesh_query_ray`.
+    """
+    from warp.codegen import Var
+
+    vars = {
+        "result": Var("result", bool),
+        "sign": Var("sign", float32),
+        "face": Var("face", int32),
+        "t": Var("t", float32),
+        "u": Var("u", float32),
+        "v": Var("v", float32),
+        "normal": Var("normal", vec3),
+    }
+
+
 def matmul(
     a: array2d,
     b: array2d,
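
The two structs above back the struct-returning mesh query builtins added in this release (see the `warp/builtins.py` and `warp/native/mesh.h` entries in the file list). A minimal kernel sketch under that assumption; the kernel and argument names are illustrative, and the fields read from `q` match the attribute list above:

    import warp as wp

    @wp.kernel
    def closest_points(mesh: wp.uint64,
                       points: wp.array(dtype=wp.vec3),
                       out: wp.array(dtype=wp.vec3)):
        tid = wp.tid()

        # q is a mesh_query_point_t carrying result, sign, face, u, v
        q = wp.mesh_query_point(mesh, points[tid], 1.0e6)
        if q.result:
            out[tid] = wp.mesh_eval_position(mesh, q.face, q.u, q.v)
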
@@ -3157,9 +3221,9 @@ def adj_matmul(
 
     # cpu fallback if no cuda devices found
     if device == "cpu":
-        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose()))
-        adj_b.assign(alpha * (a.numpy().transpose() @ adj_d.numpy()))
-        adj_c.assign(beta * adj_d.numpy())
+        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose()) + adj_a.numpy())
+        adj_b.assign(alpha * (a.numpy().transpose() @ adj_d.numpy()) + adj_b.numpy())
+        adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
         return
 
     cc = device.arch
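
In the hunks below the adjoint buffers are passed as the C operand of `cutlass_gemm` with beta raised from 0.0 to 1.0 (and the CPU fallback above adds the previous adjoint values back in), so `adj_matmul` now accumulates gradients instead of overwriting them. In NumPy terms the backward pass behaves roughly like this sketch (variable names follow the diff):

    import numpy as np

    def adj_matmul_reference(a, b, adj_a, adj_b, adj_c, adj_d, alpha=1.0, beta=0.0):
        # forward: d = alpha * (a @ b) + beta * c
        # backward: accumulate into the existing adjoints (+=) rather than overwrite
        adj_a += alpha * adj_d @ b.T
        adj_b += alpha * a.T @ adj_d
        adj_c += beta * adj_d
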
@@ -3174,10 +3238,10 @@ def adj_matmul(
         type_typestr(a.dtype).encode(),
         ctypes.c_void_p(adj_d.ptr),
         ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(a.ptr),
+        ctypes.c_void_p(adj_a.ptr),
         ctypes.c_void_p(adj_a.ptr),
         alpha,
-        0.0,
+        1.0,
         True,
         b.is_transposed,
         allow_tf32x3_arith,
@@ -3194,10 +3258,10 @@ def adj_matmul(
         type_typestr(a.dtype).encode(),
         ctypes.c_void_p(b.ptr),
         ctypes.c_void_p(adj_d.ptr),
-        ctypes.c_void_p(a.ptr),
+        ctypes.c_void_p(adj_a.ptr),
         ctypes.c_void_p(adj_a.ptr),
         alpha,
-        0.0,
+        1.0,
         not b.is_transposed,
         False,
         allow_tf32x3_arith,
@@ -3216,10 +3280,10 @@ def adj_matmul(
         type_typestr(a.dtype).encode(),
         ctypes.c_void_p(a.ptr),
         ctypes.c_void_p(adj_d.ptr),
-        ctypes.c_void_p(b.ptr),
+        ctypes.c_void_p(adj_b.ptr),
         ctypes.c_void_p(adj_b.ptr),
         alpha,
-        0.0,
+        1.0,
         a.is_transposed,
         True,
         allow_tf32x3_arith,
@@ -3236,10 +3300,10 @@ def adj_matmul(
         type_typestr(a.dtype).encode(),
         ctypes.c_void_p(adj_d.ptr),
         ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(b.ptr),
+        ctypes.c_void_p(adj_b.ptr),
         ctypes.c_void_p(adj_b.ptr),
         alpha,
-        0.0,
+        1.0,
         False,
         not a.is_transposed,
         allow_tf32x3_arith,
@@ -3249,25 +3313,13 @@ def adj_matmul(
         raise RuntimeError("adj_matmul failed.")
 
     # adj_c
-    ret = runtime.core.cutlass_gemm(
-        cc,
-        m,
-        n,
-        k,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(adj_d.ptr),
-        ctypes.c_void_p(adj_c.ptr),
-        0.0,
-        beta,
-        not a.is_transposed,
-        not b.is_transposed,
-        allow_tf32x3_arith,
-        1,
+    warp.launch(
+        kernel=warp.utils.add_kernel_2d,
+        dim=adj_c.shape,
+        inputs=[adj_c, adj_d, adj_d.dtype(beta)],
+        device=device,
+        record_tape=False
     )
-    if not ret:
-        raise RuntimeError("adj_matmul failed.")
 
 
 def batched_matmul(
@@ -3476,9 +3528,9 @@ def adj_batched_matmul(
 
     # cpu fallback if no cuda devices found
     if device == "cpu":
-        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose((0, 2, 1))))
-        adj_b.assign(alpha * np.matmul(a.numpy().transpose((0, 2, 1)), adj_d.numpy()))
-        adj_c.assign(beta * adj_d.numpy())
+        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose((0, 2, 1))) + adj_a.numpy())
+        adj_b.assign(alpha * np.matmul(a.numpy().transpose((0, 2, 1)), adj_d.numpy()) + adj_b.numpy())
+        adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
         return
 
     # handle case in which batch_count exceeds max_batch_count, which is a CUDA array size maximum
@@ -3502,10 +3554,10 @@ def adj_batched_matmul(
             type_typestr(a.dtype).encode(),
             ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
             ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
-            ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
+            ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
             ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
             alpha,
-            0.0,
+            1.0,
             True,
             b.is_transposed,
             allow_tf32x3_arith,
@@ -3522,10 +3574,10 @@ def adj_batched_matmul(
             type_typestr(a.dtype).encode(),
             ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
             ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
-            ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
+            ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
             ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
             alpha,
-            0.0,
+            1.0,
             not b.is_transposed,
             False,
             allow_tf32x3_arith,
@@ -3544,10 +3596,10 @@ def adj_batched_matmul(
             type_typestr(a.dtype).encode(),
             ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
             ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
-            ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
+            ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
             ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
             alpha,
-            0.0,
+            1.0,
             a.is_transposed,
             True,
             allow_tf32x3_arith,
@@ -3564,10 +3616,10 @@ def adj_batched_matmul(
             type_typestr(a.dtype).encode(),
             ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
             ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
-            ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
+            ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
             ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
             alpha,
-            0.0,
+            1.0,
             False,
             not a.is_transposed,
             allow_tf32x3_arith,
@@ -3575,27 +3627,6 @@ def adj_batched_matmul(
         )
         if not ret:
             raise RuntimeError("adj_matmul failed.")
-
-        # adj_c
-        ret = runtime.core.cutlass_gemm(
-            cc,
-            m,
-            n,
-            k,
-            type_typestr(a.dtype).encode(),
-            ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
-            ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
-            ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
-            ctypes.c_void_p(adj_c[idx_start:idx_end,:,:].ptr),
-            0.0,
-            beta,
-            not a.is_transposed,
-            not b.is_transposed,
-            allow_tf32x3_arith,
-            max_batch_count,
-        )
-        if not ret:
-            raise RuntimeError("adj_batched_matmul failed.")
 
     idx_start = iters * max_batch_count
 
@@ -3609,10 +3640,10 @@ def adj_batched_matmul(
         type_typestr(a.dtype).encode(),
         ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
         ctypes.c_void_p(b[idx_start:,:,:].ptr),
-        ctypes.c_void_p(a[idx_start:,:,:].ptr),
+        ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
         ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
         alpha,
-        0.0,
+        1.0,
         True,
         b.is_transposed,
         allow_tf32x3_arith,
@@ -3629,10 +3660,10 @@ def adj_batched_matmul(
         type_typestr(a.dtype).encode(),
         ctypes.c_void_p(b[idx_start:,:,:].ptr),
         ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
-        ctypes.c_void_p(a[idx_start:,:,:].ptr),
+        ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
         ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
         alpha,
-        0.0,
+        1.0,
         not b.is_transposed,
         False,
         allow_tf32x3_arith,
@@ -3651,10 +3682,10 @@ def adj_batched_matmul(
         type_typestr(a.dtype).encode(),
         ctypes.c_void_p(a[idx_start:,:,:].ptr),
         ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
-        ctypes.c_void_p(b[idx_start:,:,:].ptr),
+        ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
         ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
         alpha,
-        0.0,
+        1.0,
         a.is_transposed,
         True,
         allow_tf32x3_arith,
@@ -3671,10 +3702,10 @@ def adj_batched_matmul(
         type_typestr(a.dtype).encode(),
         ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
         ctypes.c_void_p(a[idx_start:,:,:].ptr),
-        ctypes.c_void_p(b[idx_start:,:,:].ptr),
+        ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
         ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
         alpha,
-        0.0,
+        1.0,
         False,
         not a.is_transposed,
         allow_tf32x3_arith,
@@ -3684,25 +3715,13 @@ def adj_batched_matmul(
         raise RuntimeError("adj_matmul failed.")
 
     # adj_c
-    ret = runtime.core.cutlass_gemm(
-        cc,
-        m,
-        n,
-        k,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(a[idx_start:,:,:].ptr),
-        ctypes.c_void_p(b[idx_start:,:,:].ptr),
-        ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
-        ctypes.c_void_p(adj_c[idx_start:,:,:].ptr),
-        0.0,
-        beta,
-        not a.is_transposed,
-        not b.is_transposed,
-        allow_tf32x3_arith,
-        remainder,
+    warp.launch(
+        kernel=warp.utils.add_kernel_3d,
+        dim=adj_c.shape,
+        inputs=[adj_c, adj_d, adj_d.dtype(beta)],
+        device=device,
+        record_tape=False
     )
-    if not ret:
-        raise RuntimeError("adj_batched_matmul failed.")
 
 class HashGrid:
     def __init__(self, dim_x, dim_y, dim_z, device=None):
@@ -3957,7 +3976,7 @@ def infer_argument_types(args, template_types, arg_names=None):
             arg_types.append(arg._cls)
         # elif arg_type in [warp.types.launch_bounds_t, warp.types.shape_t, warp.types.range_t]:
         #     arg_types.append(arg_type)
-        # elif arg_type in [warp.hash_grid_query_t, warp.mesh_query_aabb_t, warp.bvh_query_t]:
+        # elif arg_type in [warp.hash_grid_query_t, warp.mesh_query_aabb_t, warp.mesh_query_point_t, warp.mesh_query_ray_t, warp.bvh_query_t]:
         #     arg_types.append(arg_type)
         elif arg is None:
             # allow passing None for arrays
@@ -3995,6 +4014,8 @@ simple_type_codes = {
     launch_bounds_t: "lb",
     hash_grid_query_t: "hgq",
     mesh_query_aabb_t: "mqa",
+    mesh_query_point_t: "mqp",
+    mesh_query_ray_t: "mqr",
     bvh_query_t: "bvhq",
 }
 
warp/utils.py CHANGED
@@ -666,3 +666,17 @@ class ScopedTimer:
                 print("{}{} took {:.2f} ms".format(indent, self.name, self.elapsed))
 
             ScopedTimer.indent -= 1
+
+
+# helper kernels for adj_matmul
+@wp.kernel
+def add_kernel_2d(x: wp.array2d(dtype=Any), acc: wp.array2d(dtype=Any), beta: Any):
+    i, j = wp.tid()
+
+    x[i,j] = x[i,j] + beta * acc[i,j]
+
+@wp.kernel
+def add_kernel_3d(x: wp.array3d(dtype=Any), acc: wp.array3d(dtype=Any), beta: Any):
+    i, j, k = wp.tid()
+
+    x[i,j,k] = x[i,j,k] + beta * acc[i,j,k]
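
These generic kernels implement the `x += beta * acc` update that `adj_matmul` and `adj_batched_matmul` now launch for `adj_c` instead of issuing another `cutlass_gemm` call. A minimal standalone usage sketch, assuming the helper stays reachable as `wp.utils.add_kernel_2d` and that the scalar argument's type selects the generic overload:

    import numpy as np
    import warp as wp

    wp.init()

    x = wp.array(np.ones((2, 3), dtype=np.float32))
    acc = wp.array(np.full((2, 3), 2.0, dtype=np.float32))

    # x[i, j] += 0.5 * acc[i, j]  ->  every element of x becomes 2.0
    wp.launch(wp.utils.add_kernel_2d, dim=x.shape, inputs=[x, acc, wp.float32(0.5)])
    print(x.numpy())
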
{warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: warp-lang
-Version: 1.0.0b5
+Version: 1.0.0b6
 Summary: A Python framework for high-performance simulation and graphics programming
 Author-email: NVIDIA <mmacklin@nvidia.com>
 License: NVSCL
@@ -25,6 +25,7 @@ Requires-Dist: isort ; extra == 'dev'
 Requires-Dist: nvtx ; extra == 'dev'
 Requires-Dist: furo ; extra == 'dev'
 Requires-Dist: sphinx-copybutton ; extra == 'dev'
+Requires-Dist: coverage[toml] ; extra == 'dev'
 
 # NVIDIA Warp