warp_lang-1.0.0b5-py3-none-manylinux2014_x86_64.whl → warp_lang-1.0.0b6-py3-none-manylinux2014_x86_64.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (187)
  1. docs/conf.py +3 -4
  2. examples/env/env_ant.py +1 -1
  3. examples/env/env_cartpole.py +1 -1
  4. examples/env/env_humanoid.py +1 -1
  5. examples/example_dem.py +28 -26
  6. examples/example_diffray.py +37 -30
  7. examples/example_fluid.py +7 -3
  8. examples/example_jacobian_ik.py +1 -1
  9. examples/example_mesh_intersect.py +10 -7
  10. examples/example_nvdb.py +3 -3
  11. examples/example_render_opengl.py +19 -10
  12. examples/example_sim_cartpole.py +9 -5
  13. examples/example_sim_cloth.py +29 -25
  14. examples/example_sim_fk_grad.py +2 -2
  15. examples/example_sim_fk_grad_torch.py +3 -3
  16. examples/example_sim_grad_bounce.py +11 -8
  17. examples/example_sim_grad_cloth.py +12 -9
  18. examples/example_sim_granular.py +2 -2
  19. examples/example_sim_granular_collision_sdf.py +13 -13
  20. examples/example_sim_neo_hookean.py +3 -3
  21. examples/example_sim_particle_chain.py +2 -2
  22. examples/example_sim_quadruped.py +8 -5
  23. examples/example_sim_rigid_chain.py +8 -5
  24. examples/example_sim_rigid_contact.py +13 -10
  25. examples/example_sim_rigid_fem.py +2 -2
  26. examples/example_sim_rigid_gyroscopic.py +2 -2
  27. examples/example_sim_rigid_kinematics.py +1 -1
  28. examples/example_sim_trajopt.py +3 -2
  29. examples/fem/example_apic_fluid.py +5 -7
  30. examples/fem/example_diffusion_mgpu.py +18 -16
  31. warp/__init__.py +3 -2
  32. warp/bin/warp.so +0 -0
  33. warp/build_dll.py +29 -9
  34. warp/builtins.py +206 -7
  35. warp/codegen.py +58 -38
  36. warp/config.py +3 -1
  37. warp/context.py +234 -128
  38. warp/fem/__init__.py +2 -2
  39. warp/fem/cache.py +2 -1
  40. warp/fem/field/nodal_field.py +18 -17
  41. warp/fem/geometry/hexmesh.py +11 -6
  42. warp/fem/geometry/quadmesh_2d.py +16 -12
  43. warp/fem/geometry/tetmesh.py +19 -8
  44. warp/fem/geometry/trimesh_2d.py +18 -7
  45. warp/fem/integrate.py +341 -196
  46. warp/fem/quadrature/__init__.py +1 -1
  47. warp/fem/quadrature/pic_quadrature.py +138 -53
  48. warp/fem/quadrature/quadrature.py +81 -9
  49. warp/fem/space/__init__.py +1 -1
  50. warp/fem/space/basis_space.py +169 -51
  51. warp/fem/space/grid_2d_function_space.py +2 -2
  52. warp/fem/space/grid_3d_function_space.py +2 -2
  53. warp/fem/space/hexmesh_function_space.py +2 -2
  54. warp/fem/space/partition.py +9 -6
  55. warp/fem/space/quadmesh_2d_function_space.py +2 -2
  56. warp/fem/space/shape/cube_shape_function.py +27 -15
  57. warp/fem/space/shape/square_shape_function.py +29 -18
  58. warp/fem/space/tetmesh_function_space.py +2 -2
  59. warp/fem/space/topology.py +10 -0
  60. warp/fem/space/trimesh_2d_function_space.py +2 -2
  61. warp/fem/utils.py +10 -5
  62. warp/native/array.h +49 -8
  63. warp/native/builtin.h +31 -14
  64. warp/native/cuda_util.cpp +8 -3
  65. warp/native/cuda_util.h +1 -0
  66. warp/native/exports.h +1177 -1108
  67. warp/native/intersect.h +4 -4
  68. warp/native/intersect_adj.h +8 -8
  69. warp/native/mat.h +65 -6
  70. warp/native/mesh.h +126 -5
  71. warp/native/quat.h +28 -4
  72. warp/native/vec.h +76 -14
  73. warp/native/warp.cu +1 -6
  74. warp/render/render_opengl.py +261 -109
  75. warp/sim/import_mjcf.py +13 -7
  76. warp/sim/import_urdf.py +14 -14
  77. warp/sim/inertia.py +17 -18
  78. warp/sim/model.py +67 -67
  79. warp/sim/render.py +1 -1
  80. warp/sparse.py +6 -6
  81. warp/stubs.py +19 -81
  82. warp/tape.py +1 -1
  83. warp/tests/__main__.py +3 -6
  84. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  85. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  86. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  87. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  88. warp/tests/aux_test_unresolved_func.py +14 -0
  89. warp/tests/aux_test_unresolved_symbol.py +14 -0
  90. warp/tests/{test_kinematics.py → disabled_kinematics.py} +10 -12
  91. warp/tests/run_coverage_serial.py +31 -0
  92. warp/tests/test_adam.py +102 -106
  93. warp/tests/test_arithmetic.py +39 -40
  94. warp/tests/test_array.py +46 -48
  95. warp/tests/test_array_reduce.py +25 -19
  96. warp/tests/test_atomic.py +62 -26
  97. warp/tests/test_bool.py +16 -11
  98. warp/tests/test_builtins_resolution.py +1292 -0
  99. warp/tests/test_bvh.py +9 -12
  100. warp/tests/test_closest_point_edge_edge.py +53 -57
  101. warp/tests/test_codegen.py +164 -134
  102. warp/tests/test_compile_consts.py +13 -19
  103. warp/tests/test_conditional.py +30 -32
  104. warp/tests/test_copy.py +9 -12
  105. warp/tests/test_ctypes.py +90 -98
  106. warp/tests/test_dense.py +20 -14
  107. warp/tests/test_devices.py +34 -35
  108. warp/tests/test_dlpack.py +74 -75
  109. warp/tests/test_examples.py +215 -97
  110. warp/tests/test_fabricarray.py +15 -21
  111. warp/tests/test_fast_math.py +14 -11
  112. warp/tests/test_fem.py +280 -97
  113. warp/tests/test_fp16.py +19 -15
  114. warp/tests/test_func.py +177 -194
  115. warp/tests/test_generics.py +71 -77
  116. warp/tests/test_grad.py +83 -32
  117. warp/tests/test_grad_customs.py +7 -9
  118. warp/tests/test_hash_grid.py +6 -10
  119. warp/tests/test_import.py +9 -23
  120. warp/tests/test_indexedarray.py +19 -21
  121. warp/tests/test_intersect.py +15 -9
  122. warp/tests/test_large.py +17 -19
  123. warp/tests/test_launch.py +14 -17
  124. warp/tests/test_lerp.py +63 -63
  125. warp/tests/test_lvalue.py +84 -35
  126. warp/tests/test_marching_cubes.py +9 -13
  127. warp/tests/test_mat.py +388 -3004
  128. warp/tests/test_mat_lite.py +9 -12
  129. warp/tests/test_mat_scalar_ops.py +2889 -0
  130. warp/tests/test_math.py +10 -11
  131. warp/tests/test_matmul.py +104 -100
  132. warp/tests/test_matmul_lite.py +72 -98
  133. warp/tests/test_mesh.py +35 -32
  134. warp/tests/test_mesh_query_aabb.py +18 -25
  135. warp/tests/test_mesh_query_point.py +39 -23
  136. warp/tests/test_mesh_query_ray.py +9 -21
  137. warp/tests/test_mlp.py +8 -9
  138. warp/tests/test_model.py +89 -93
  139. warp/tests/test_modules_lite.py +15 -25
  140. warp/tests/test_multigpu.py +87 -114
  141. warp/tests/test_noise.py +10 -12
  142. warp/tests/test_operators.py +14 -21
  143. warp/tests/test_options.py +10 -11
  144. warp/tests/test_pinned.py +16 -18
  145. warp/tests/test_print.py +16 -20
  146. warp/tests/test_quat.py +121 -88
  147. warp/tests/test_rand.py +12 -13
  148. warp/tests/test_reload.py +27 -32
  149. warp/tests/test_rounding.py +7 -10
  150. warp/tests/test_runlength_encode.py +105 -106
  151. warp/tests/test_smoothstep.py +8 -9
  152. warp/tests/test_snippet.py +13 -22
  153. warp/tests/test_sparse.py +30 -29
  154. warp/tests/test_spatial.py +179 -174
  155. warp/tests/test_streams.py +100 -107
  156. warp/tests/test_struct.py +98 -67
  157. warp/tests/test_tape.py +11 -17
  158. warp/tests/test_torch.py +89 -86
  159. warp/tests/test_transient_module.py +9 -12
  160. warp/tests/test_types.py +328 -50
  161. warp/tests/test_utils.py +217 -218
  162. warp/tests/test_vec.py +133 -2133
  163. warp/tests/test_vec_lite.py +8 -11
  164. warp/tests/test_vec_scalar_ops.py +2099 -0
  165. warp/tests/test_volume.py +391 -382
  166. warp/tests/test_volume_write.py +122 -135
  167. warp/tests/unittest_serial.py +35 -0
  168. warp/tests/unittest_suites.py +291 -0
  169. warp/tests/{test_base.py → unittest_utils.py} +138 -25
  170. warp/tests/{test_misc.py → unused_test_misc.py} +13 -5
  171. warp/tests/{test_debug.py → walkthough_debug.py} +2 -15
  172. warp/thirdparty/unittest_parallel.py +257 -54
  173. warp/types.py +119 -98
  174. warp/utils.py +14 -0
  175. {warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/METADATA +2 -1
  176. {warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/RECORD +182 -178
  177. {warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/WHEEL +1 -1
  178. warp/tests/test_all.py +0 -239
  179. warp/tests/test_conditional_unequal_types_kernels.py +0 -14
  180. warp/tests/test_coverage.py +0 -38
  181. warp/tests/test_unresolved_func.py +0 -7
  182. warp/tests/test_unresolved_symbol.py +0 -7
  183. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  184. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  185. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  186. {warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/LICENSE.md +0 -0
  187. {warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/top_level.txt +0 -0
warp/tests/test_math.py CHANGED
@@ -5,13 +5,13 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
-from typing import NamedTuple
 import unittest
+from typing import NamedTuple
 
 import numpy as np
 
 import warp as wp
-from warp.tests.test_base import *
+from warp.tests.unittest_utils import *
 
 wp.init()
 
@@ -176,19 +176,18 @@ def test_mat_type(test, device):
         raise ValueError("mat to string error")
 
 
-def register(parent):
-    devices = get_test_devices()
+devices = get_test_devices()
+
+
+class TestMath(unittest.TestCase):
+    pass
 
-    class TestMath(parent):
-        pass
 
-    add_function_test(TestMath, "test_scalar_math", test_scalar_math, devices=devices)
-    add_function_test(TestMath, "test_vec_type", test_vec_type, devices=devices)
-    add_function_test(TestMath, "test_mat_type", test_mat_type, devices=devices)
-    return TestMath
+add_function_test(TestMath, "test_scalar_math", test_scalar_math, devices=devices)
+add_function_test(TestMath, "test_vec_type", test_vec_type, devices=devices)
+add_function_test(TestMath, "test_mat_type", test_mat_type, devices=devices)
 
 
 if __name__ == "__main__":
     wp.build.clear_kernel_cache()
-    _ = register(unittest.TestCase)
     unittest.main(verbosity=2)
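
A note on the registration change above: the old register(parent) factory is gone, and each test module now builds a plain module-level unittest.TestCase at import time, with add_function_test attaching one test per device (the helpers come from the renamed warp.tests.unittest_utils module). A minimal standalone sketch of the new convention; the trivial test body here is illustrative only, not part of the diff:

import unittest

import warp as wp
from warp.tests.unittest_utils import add_function_test, get_test_devices

wp.init()


def test_device_alias(test, device):
    # test functions keep the (test, device) signature: `test` is the
    # TestCase instance, `device` a Warp device alias; this body is a
    # placeholder for a real check
    test.assertIsNotNone(wp.get_device(device))


devices = get_test_devices()


class TestExample(unittest.TestCase):
    pass


# tests are registered at module scope, so standard unittest discovery
# finds them without calling a register() helper first
add_function_test(TestExample, "test_device_alias", test_device_alias, devices=devices)


if __name__ == "__main__":
    wp.build.clear_kernel_cache()
    unittest.main(verbosity=2)
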
warp/tests/test_matmul.py CHANGED
@@ -1,11 +1,21 @@
-import numpy as np
+# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
 import unittest
 
+import numpy as np
+
 import warp as wp
-from warp.tests.test_base import *
+from warp.tests.unittest_utils import *
 
 wp.init()
 
+from warp.context import runtime  # noqa: E402
+
 
 class gemm_test_bed_runner:
     def __init__(self, dtype, device):
@@ -21,63 +31,54 @@ class gemm_test_bed_runner:
                 np.ceil(rng.uniform(low=low, high=high, size=(m, k))),
                 dtype=self.dtype,
                 device=self.device,
-                requires_grad=True
+                requires_grad=True,
             )
             B = wp.array2d(
                 np.ceil(rng.uniform(low=low, high=high, size=(k, n))),
                 dtype=self.dtype,
                 device=self.device,
-                requires_grad=True
+                requires_grad=True,
             )
             C = wp.array2d(
                 np.ceil(rng.uniform(low=low, high=high, size=(m, n))),
                 dtype=self.dtype,
                 device=self.device,
-                requires_grad=True
+                requires_grad=True,
             )
-            D = wp.array2d(
-                np.zeros((m, n)),
-                dtype=self.dtype,
-                device=self.device,
-                requires_grad=True)
+            D = wp.array2d(np.zeros((m, n)), dtype=self.dtype, device=self.device, requires_grad=True)
         else:
             A = wp.array3d(
                 np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, k))),
                 dtype=self.dtype,
                 device=self.device,
-                requires_grad=True
+                requires_grad=True,
             )
             B = wp.array3d(
                 np.ceil(rng.uniform(low=low, high=high, size=(batch_count, k, n))),
                 dtype=self.dtype,
                 device=self.device,
-                requires_grad=True
+                requires_grad=True,
             )
             C = wp.array3d(
                 np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, n))),
                 dtype=self.dtype,
                 device=self.device,
-                requires_grad=True
-            )
-            D = wp.array3d(
-                np.zeros((batch_count, m, n)),
-                dtype=self.dtype,
-                device=self.device,
-                requires_grad=True
+                requires_grad=True,
             )
+            D = wp.array3d(np.zeros((batch_count, m, n)), dtype=self.dtype, device=self.device, requires_grad=True)
         return A, B, C, D
 
     def run_and_verify(self, m, n, k, batch_count, alpha, beta):
         A, B, C, D = self.alloc(m, n, k, batch_count)
         ones = wp.zeros_like(D)
         ones.fill_(1.0)
-
+
         if batch_count == 1:
             tape = wp.Tape()
             with tape:
                 wp.matmul(A, B, C, D, alpha, beta, False, self.device)
-            tape.backward(grads={D : ones})
-
+            tape.backward(grads={D: ones})
+
             D_np = alpha * (A.numpy() @ B.numpy()) + beta * C.numpy()
             assert np.array_equal(D_np, D.numpy())
@@ -89,8 +90,8 @@ class gemm_test_bed_runner:
             tape = wp.Tape()
             with tape:
                 wp.batched_matmul(A, B, C, D, alpha, beta, False, self.device)
-            tape.backward(grads={D : ones})
-
+            tape.backward(grads={D: ones})
+
             D_np = alpha * np.matmul(A.numpy(), B.numpy()) + beta * C.numpy()
             assert np.array_equal(D_np, D.numpy())
 
@@ -132,75 +133,45 @@ class gemm_test_bed_runner_transpose:
                 np.ceil(rng.uniform(low=low, high=high, size=(m, k))),
                 dtype=self.dtype,
                 device=self.device,
-                requires_grad=True
+                requires_grad=True,
             )
             B = wp.array2d(
                 np.ceil(rng.uniform(low=low, high=high, size=(k, n))),
                 dtype=self.dtype,
                 device=self.device,
-                requires_grad=True
+                requires_grad=True,
             )
             C = wp.array2d(
                 np.ceil(rng.uniform(low=low, high=high, size=(m, n))),
                 dtype=self.dtype,
                 device=self.device,
-                requires_grad=True
-            )
-            D = wp.array2d(
-                np.zeros((m, n)),
-                dtype=self.dtype,
-                device=self.device,
-                requires_grad=True
-            )
-            AT = wp.array2d(
-                A.numpy().transpose([1, 0]),
-                dtype=self.dtype,
-                device=self.device,
-                requires_grad=True
-            )
-            BT = wp.array2d(
-                B.numpy().transpose([1, 0]),
-                dtype=self.dtype,
-                device=self.device,
-                requires_grad=True
+                requires_grad=True,
             )
+            D = wp.array2d(np.zeros((m, n)), dtype=self.dtype, device=self.device, requires_grad=True)
+            AT = wp.array2d(A.numpy().transpose([1, 0]), dtype=self.dtype, device=self.device, requires_grad=True)
+            BT = wp.array2d(B.numpy().transpose([1, 0]), dtype=self.dtype, device=self.device, requires_grad=True)
         else:
             A = wp.array3d(
                 np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, k))),
                 dtype=self.dtype,
                 device=self.device,
-                requires_grad=True
+                requires_grad=True,
            )
             B = wp.array3d(
                 np.ceil(rng.uniform(low=low, high=high, size=(batch_count, k, n))),
                 dtype=self.dtype,
                 device=self.device,
-                requires_grad=True
+                requires_grad=True,
             )
             C = wp.array3d(
                 np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, n))),
                 dtype=self.dtype,
                 device=self.device,
-                requires_grad=True
-            )
-            D = wp.array3d(
-                np.zeros((batch_count, m, n)),
-                dtype=self.dtype,
-                device=self.device,
-                requires_grad=True
-            )
-            AT = wp.array3d(
-                A.numpy().transpose([0, 2, 1]),
-                dtype=self.dtype,
-                device=self.device,
-                requires_grad=True
-            )
-            BT = wp.array3d(
-                B.numpy().transpose([0, 2, 1]),
-                dtype=self.dtype,
-                device=self.device,
-                requires_grad=True
+                requires_grad=True,
             )
+            D = wp.array3d(np.zeros((batch_count, m, n)), dtype=self.dtype, device=self.device, requires_grad=True)
+            AT = wp.array3d(A.numpy().transpose([0, 2, 1]), dtype=self.dtype, device=self.device, requires_grad=True)
+            BT = wp.array3d(B.numpy().transpose([0, 2, 1]), dtype=self.dtype, device=self.device, requires_grad=True)
         return A, B, C, D, AT, BT
 
     def run_and_verify(self, m, n, k, batch_count, alpha, beta):
@@ -219,17 +190,17 @@
             ones3.fill_(1.0)
 
         if batch_count == 1:
-            ATT1 = AT1.transpose([1, 0]) 
+            ATT1 = AT1.transpose([1, 0])
             BTT1 = BT1.transpose([1, 0])
-            ATT2 = AT2.transpose([1, 0]) 
+            ATT2 = AT2.transpose([1, 0])
             BTT2 = BT2.transpose([1, 0])
             tape = wp.Tape()
             with tape:
                 wp.matmul(A, BTT1, C1, D1, alpha, beta, False, self.device)
                 wp.matmul(ATT1, B, C2, D2, alpha, beta, False, self.device)
                 wp.matmul(ATT2, BTT2, C3, D3, alpha, beta, False, self.device)
-            tape.backward(grads={D1 : ones1, D2 : ones2, D3 : ones3})
-
+            tape.backward(grads={D1: ones1, D2: ones2, D3: ones3})
+
             D_np = alpha * (A.numpy() @ B.numpy()) + beta * C1.numpy()
             assert np.array_equal(D_np, D1.numpy())
             assert np.array_equal(D_np, D2.numpy())
@@ -240,7 +211,7 @@
             adj_C_np = beta * ones1.numpy()
 
         else:
-            ATT1 = AT1.transpose([0, 2, 1]) 
+            ATT1 = AT1.transpose([0, 2, 1])
             BTT1 = BT1.transpose([0, 2, 1])
             ATT2 = AT2.transpose([0, 2, 1])
             BTT2 = BT2.transpose([0, 2, 1])
@@ -249,8 +220,8 @@
                 wp.batched_matmul(A, BTT1, C1, D1, alpha, beta, False, self.device)
                 wp.batched_matmul(ATT1, B, C2, D2, alpha, beta, False, self.device)
                 wp.batched_matmul(ATT2, BTT2, C3, D3, alpha, beta, False, self.device)
-            tape.backward(grads={D1 : ones1, D2 : ones2, D3 : ones3})
-
+            tape.backward(grads={D1: ones1, D2: ones2, D3: ones3})
+
             D_np = alpha * np.matmul(A.numpy(), B.numpy()) + beta * C1.numpy()
             assert np.array_equal(D_np, D1.numpy())
             assert np.array_equal(D_np, D2.numpy())
@@ -288,11 +259,13 @@ def test_f16(test, device):
     gemm_test_bed_runner_transpose(wp.float16, device).run()
 
 
+@unittest.skipUnless(runtime.core.is_cutlass_enabled(), "Warp was not built with CUTLASS support")
 def test_f32(test, device):
     gemm_test_bed_runner(wp.float32, device).run()
     gemm_test_bed_runner_transpose(wp.float32, device).run()
 
 
+@unittest.skipUnless(runtime.core.is_cutlass_enabled(), "Warp was not built with CUTLASS support")
 def test_f64(test, device):
     gemm_test_bed_runner(wp.float64, device).run()
     gemm_test_bed_runner_transpose(wp.float64, device).run()
@@ -304,6 +277,7 @@ def matrix_sum_kernel(arr: wp.array2d(dtype=float), loss: wp.array(dtype=float)):
     wp.atomic_add(loss, 0, arr[i, j])
 
 
+@unittest.skipUnless(runtime.core.is_cutlass_enabled(), "Warp was not built with CUTLASS support")
 def test_tape(test, device):
     rng = np.random.default_rng(42)
     low = -4.5
@@ -331,6 +305,7 @@ def test_tape(test, device):
 
     tape.backward(loss=loss)
     A_grad = A.grad.numpy()
+    tape.reset()
 
     # test adjoint
     D.grad = wp.array2d(np.ones((m, n)), dtype=float, device=device)
@@ -342,6 +317,7 @@ def test_tape(test, device):
     assert_array_equal(A.grad, wp.zeros_like(A))
 
 
+@unittest.skipUnless(runtime.core.is_cutlass_enabled(), "Warp was not built with CUTLASS support")
 def test_operator(test, device):
     rng = np.random.default_rng(42)
     low = -4.5
@@ -377,6 +353,7 @@ def test_operator(test, device):
     assert_array_equal(A.grad, wp.zeros_like(A))
 
 
+@unittest.skipUnless(runtime.core.is_cutlass_enabled(), "Warp was not built with CUTLASS support")
 def test_large_batch_count(test, device):
     rng = np.random.default_rng(42)
     low = -4.5
@@ -386,31 +363,38 @@ def test_large_batch_count(test, device):
     k = 4
     batch_count = 65535 * 2 + int(65535 / 2)
     A = wp.array3d(
-        np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, k))), dtype=float, device=device, requires_grad=True
+        np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, k))),
+        dtype=float,
+        device=device,
+        requires_grad=True,
     )
     B = wp.array3d(
-        np.ceil(rng.uniform(low=low, high=high, size=(batch_count, k, n))), dtype=float, device=device, requires_grad=True
+        np.ceil(rng.uniform(low=low, high=high, size=(batch_count, k, n))),
+        dtype=float,
+        device=device,
+        requires_grad=True,
    )
     C = wp.array3d(
-        np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, n))), dtype=float, device=device, requires_grad=True
-    )
-    D = wp.array3d(
-        np.zeros((batch_count, m, n)), dtype=float, device=device, requires_grad=True
+        np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, n))),
+        dtype=float,
+        device=device,
+        requires_grad=True,
    )
+    D = wp.array3d(np.zeros((batch_count, m, n)), dtype=float, device=device, requires_grad=True)
     ones = wp.zeros_like(D)
     ones.fill_(1.0)
 
     alpha = 1.0
     beta = 1.0
-
+
     tape = wp.Tape()
     with tape:
        wp.batched_matmul(A, B, C, D, alpha=alpha, beta=beta, allow_tf32x3_arith=False, device=device)
-    tape.backward(grads={D : ones})
+    tape.backward(grads={D: ones})
 
     D_np = alpha * np.matmul(A.numpy(), B.numpy()) + beta * C.numpy()
     assert np.array_equal(D_np, D.numpy())
-
+
     adj_A_np = alpha * np.matmul(ones.numpy(), B.numpy().transpose((0, 2, 1)))
     adj_B_np = alpha * np.matmul(A.numpy().transpose((0, 2, 1)), ones.numpy())
     adj_C_np = beta * ones.numpy()
@@ -420,30 +404,50 @@ def test_large_batch_count(test, device):
     assert np.array_equal(adj_C_np, C.grad.numpy())
 
 
-def register(parent):
-    devices = [d for d in get_test_devices()]
+def test_adjoint_accumulation(test, device):
+    a_np = np.ones(shape=(2,3))
+    b_np = np.ones(shape=(3,2))
+    c_np = np.zeros(shape=(2,2))
+    d_np = np.zeros(shape=(2,2))
 
-    class TestMatmul(parent):
-        pass
+    a_wp = wp.from_numpy(a_np, dtype=float, requires_grad=True)
+    b_wp = wp.from_numpy(b_np, dtype=float, requires_grad=True)
+    c_wp = wp.from_numpy(c_np, dtype=float, requires_grad=True)
+    d1_wp = wp.from_numpy(d_np, dtype=float, requires_grad=True)
+    d2_wp = wp.from_numpy(d_np, dtype=float, requires_grad=True)
 
-    if devices:
-        # check if CUTLASS is available
-        from warp.context import runtime
+    tape = wp.Tape()
+
+    with tape:
+        wp.matmul(a_wp, b_wp, c_wp, d1_wp, alpha=1.0, beta=1.0)
+        wp.matmul(a_wp, b_wp, d1_wp, d2_wp, alpha=1.0, beta=1.0)
+
+    d_grad = wp.zeros_like(d2_wp)
+    d_grad.fill_(1.)
+    grads = {d2_wp : d_grad}
+    tape.backward(grads=grads)
+
+    assert np.array_equal(a_wp.grad.numpy(), 4.0 * np.ones(shape=(2,3)))
+    assert np.array_equal(b_wp.grad.numpy(), 4.0 * np.ones(shape=(3,2)))
+    assert np.array_equal(c_wp.grad.numpy(), np.ones(shape=(2,2)))
+
+
+devices = get_test_devices()
+
+
+class TestMatmul(unittest.TestCase):
+    pass
 
-        if runtime.core.is_cutlass_enabled():
-            # add_function_test(TestMatmul, "test_f16", test_f16, devices=devices)
-            add_function_test(TestMatmul, "test_f32", test_f32, devices=devices)
-            add_function_test(TestMatmul, "test_f64", test_f64, devices=devices)
-            add_function_test(TestMatmul, "test_tape", test_tape, devices=devices)
-            add_function_test(TestMatmul, "test_operator", test_operator, devices=devices)
-            add_function_test(TestMatmul, "test_large_batch_count", test_large_batch_count, devices=devices)
-        else:
-            print("Skipping matmul tests because CUTLASS is not supported in this build")
 
-    return TestMatmul
+# add_function_test(TestMatmul, "test_f16", test_f16, devices=devices)
+add_function_test(TestMatmul, "test_f32", test_f32, devices=devices)
+add_function_test(TestMatmul, "test_f64", test_f64, devices=devices)
+add_function_test(TestMatmul, "test_tape", test_tape, devices=devices)
+add_function_test(TestMatmul, "test_operator", test_operator, devices=devices)
+add_function_test(TestMatmul, "test_large_batch_count", test_large_batch_count, devices=devices)
+add_function_test(TestMatmul, "test_adjoint_accumulation", test_adjoint_accumulation, devices=devices)
 
 
 if __name__ == "__main__":
     wp.build.clear_kernel_cache()
-    _ = register(unittest.TestCase)
     unittest.main(verbosity=2, failfast=False)
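
Two behavioral changes are worth noting in this file beyond the formatting cleanup: CUTLASS gating moves from conditional registration inside register() to per-test @unittest.skipUnless(runtime.core.is_cutlass_enabled(), ...) decorators, so unsupported builds now report skipped tests instead of silently dropping them, and the new test_adjoint_accumulation case checks that adjoints accumulate when one matmul feeds another. A minimal standalone sketch of that check, assuming a build and default device where wp.matmul is usable:

import numpy as np

import warp as wp

wp.init()

# chain two GEMMs through d1: d1 = a @ b + c, then d2 = a @ b + d1
a = wp.from_numpy(np.ones((2, 3)), dtype=float, requires_grad=True)
b = wp.from_numpy(np.ones((3, 2)), dtype=float, requires_grad=True)
c = wp.from_numpy(np.zeros((2, 2)), dtype=float, requires_grad=True)
d1 = wp.from_numpy(np.zeros((2, 2)), dtype=float, requires_grad=True)
d2 = wp.from_numpy(np.zeros((2, 2)), dtype=float, requires_grad=True)

tape = wp.Tape()
with tape:
    wp.matmul(a, b, c, d1, alpha=1.0, beta=1.0)
    wp.matmul(a, b, d1, d2, alpha=1.0, beta=1.0)

ones = wp.zeros_like(d2)
ones.fill_(1.0)
tape.backward(grads={d2: ones})

# a and b feed d2 twice (directly and through d1), and each entry of
# ones @ b.T sums two unit products, so their adjoints accumulate to 4
assert np.array_equal(a.grad.numpy(), 4.0 * np.ones((2, 3)))
assert np.array_equal(b.grad.numpy(), 4.0 * np.ones((3, 2)))
assert np.array_equal(c.grad.numpy(), np.ones((2, 2)))
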