warp-lang 1.0.0b2-py3-none-win_amd64.whl → 1.0.0b6-py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of warp-lang might be problematic.

Files changed (271)
  1. docs/conf.py +17 -5
  2. examples/env/env_ant.py +1 -1
  3. examples/env/env_cartpole.py +1 -1
  4. examples/env/env_humanoid.py +1 -1
  5. examples/env/env_usd.py +4 -1
  6. examples/env/environment.py +8 -9
  7. examples/example_dem.py +34 -33
  8. examples/example_diffray.py +364 -337
  9. examples/example_fluid.py +32 -23
  10. examples/example_jacobian_ik.py +97 -93
  11. examples/example_marching_cubes.py +6 -16
  12. examples/example_mesh.py +6 -16
  13. examples/example_mesh_intersect.py +16 -14
  14. examples/example_nvdb.py +14 -16
  15. examples/example_raycast.py +14 -13
  16. examples/example_raymarch.py +16 -23
  17. examples/example_render_opengl.py +19 -10
  18. examples/example_sim_cartpole.py +82 -78
  19. examples/example_sim_cloth.py +45 -48
  20. examples/example_sim_fk_grad.py +51 -44
  21. examples/example_sim_fk_grad_torch.py +47 -40
  22. examples/example_sim_grad_bounce.py +108 -133
  23. examples/example_sim_grad_cloth.py +99 -113
  24. examples/example_sim_granular.py +5 -6
  25. examples/{example_sim_sdf_shape.py → example_sim_granular_collision_sdf.py} +37 -26
  26. examples/example_sim_neo_hookean.py +51 -55
  27. examples/example_sim_particle_chain.py +4 -4
  28. examples/example_sim_quadruped.py +126 -81
  29. examples/example_sim_rigid_chain.py +54 -61
  30. examples/example_sim_rigid_contact.py +66 -70
  31. examples/example_sim_rigid_fem.py +3 -3
  32. examples/example_sim_rigid_force.py +1 -1
  33. examples/example_sim_rigid_gyroscopic.py +3 -4
  34. examples/example_sim_rigid_kinematics.py +28 -39
  35. examples/example_sim_trajopt.py +112 -110
  36. examples/example_sph.py +9 -8
  37. examples/example_wave.py +7 -7
  38. examples/fem/bsr_utils.py +30 -17
  39. examples/fem/example_apic_fluid.py +85 -69
  40. examples/fem/example_convection_diffusion.py +97 -93
  41. examples/fem/example_convection_diffusion_dg.py +142 -149
  42. examples/fem/example_convection_diffusion_dg0.py +141 -136
  43. examples/fem/example_deformed_geometry.py +146 -0
  44. examples/fem/example_diffusion.py +115 -84
  45. examples/fem/example_diffusion_3d.py +116 -86
  46. examples/fem/example_diffusion_mgpu.py +102 -79
  47. examples/fem/example_mixed_elasticity.py +139 -100
  48. examples/fem/example_navier_stokes.py +175 -162
  49. examples/fem/example_stokes.py +143 -111
  50. examples/fem/example_stokes_transfer.py +186 -157
  51. examples/fem/mesh_utils.py +59 -97
  52. examples/fem/plot_utils.py +138 -17
  53. tools/ci/publishing/build_nodes_info.py +54 -0
  54. warp/__init__.py +4 -3
  55. warp/__init__.pyi +1 -0
  56. warp/bin/warp-clang.dll +0 -0
  57. warp/bin/warp.dll +0 -0
  58. warp/build.py +5 -3
  59. warp/build_dll.py +29 -9
  60. warp/builtins.py +836 -492
  61. warp/codegen.py +864 -553
  62. warp/config.py +3 -1
  63. warp/context.py +389 -172
  64. warp/fem/__init__.py +24 -6
  65. warp/fem/cache.py +318 -25
  66. warp/fem/dirichlet.py +7 -3
  67. warp/fem/domain.py +14 -0
  68. warp/fem/field/__init__.py +30 -38
  69. warp/fem/field/field.py +149 -0
  70. warp/fem/field/nodal_field.py +244 -138
  71. warp/fem/field/restriction.py +8 -6
  72. warp/fem/field/test.py +127 -59
  73. warp/fem/field/trial.py +117 -60
  74. warp/fem/geometry/__init__.py +5 -1
  75. warp/fem/geometry/deformed_geometry.py +271 -0
  76. warp/fem/geometry/element.py +24 -1
  77. warp/fem/geometry/geometry.py +86 -14
  78. warp/fem/geometry/grid_2d.py +112 -54
  79. warp/fem/geometry/grid_3d.py +134 -65
  80. warp/fem/geometry/hexmesh.py +953 -0
  81. warp/fem/geometry/partition.py +85 -33
  82. warp/fem/geometry/quadmesh_2d.py +532 -0
  83. warp/fem/geometry/tetmesh.py +451 -115
  84. warp/fem/geometry/trimesh_2d.py +197 -92
  85. warp/fem/integrate.py +534 -268
  86. warp/fem/operator.py +58 -31
  87. warp/fem/polynomial.py +11 -0
  88. warp/fem/quadrature/__init__.py +1 -1
  89. warp/fem/quadrature/pic_quadrature.py +150 -58
  90. warp/fem/quadrature/quadrature.py +209 -57
  91. warp/fem/space/__init__.py +230 -53
  92. warp/fem/space/basis_space.py +489 -0
  93. warp/fem/space/collocated_function_space.py +105 -0
  94. warp/fem/space/dof_mapper.py +49 -2
  95. warp/fem/space/function_space.py +90 -39
  96. warp/fem/space/grid_2d_function_space.py +149 -496
  97. warp/fem/space/grid_3d_function_space.py +173 -538
  98. warp/fem/space/hexmesh_function_space.py +352 -0
  99. warp/fem/space/partition.py +129 -76
  100. warp/fem/space/quadmesh_2d_function_space.py +369 -0
  101. warp/fem/space/restriction.py +46 -34
  102. warp/fem/space/shape/__init__.py +15 -0
  103. warp/fem/space/shape/cube_shape_function.py +738 -0
  104. warp/fem/space/shape/shape_function.py +103 -0
  105. warp/fem/space/shape/square_shape_function.py +611 -0
  106. warp/fem/space/shape/tet_shape_function.py +567 -0
  107. warp/fem/space/shape/triangle_shape_function.py +429 -0
  108. warp/fem/space/tetmesh_function_space.py +132 -1039
  109. warp/fem/space/topology.py +295 -0
  110. warp/fem/space/trimesh_2d_function_space.py +104 -742
  111. warp/fem/types.py +13 -11
  112. warp/fem/utils.py +335 -60
  113. warp/native/array.h +120 -34
  114. warp/native/builtin.h +101 -72
  115. warp/native/bvh.cpp +73 -325
  116. warp/native/bvh.cu +406 -23
  117. warp/native/bvh.h +22 -40
  118. warp/native/clang/clang.cpp +1 -0
  119. warp/native/crt.h +2 -0
  120. warp/native/cuda_util.cpp +8 -3
  121. warp/native/cuda_util.h +1 -0
  122. warp/native/exports.h +1522 -1243
  123. warp/native/intersect.h +19 -4
  124. warp/native/intersect_adj.h +8 -8
  125. warp/native/mat.h +76 -17
  126. warp/native/mesh.cpp +33 -108
  127. warp/native/mesh.cu +114 -18
  128. warp/native/mesh.h +395 -40
  129. warp/native/noise.h +272 -329
  130. warp/native/quat.h +51 -8
  131. warp/native/rand.h +44 -34
  132. warp/native/reduce.cpp +1 -1
  133. warp/native/sparse.cpp +4 -4
  134. warp/native/sparse.cu +163 -155
  135. warp/native/spatial.h +2 -2
  136. warp/native/temp_buffer.h +18 -14
  137. warp/native/vec.h +103 -21
  138. warp/native/warp.cpp +2 -1
  139. warp/native/warp.cu +28 -3
  140. warp/native/warp.h +4 -3
  141. warp/render/render_opengl.py +261 -109
  142. warp/sim/__init__.py +1 -2
  143. warp/sim/articulation.py +385 -185
  144. warp/sim/import_mjcf.py +59 -48
  145. warp/sim/import_urdf.py +15 -15
  146. warp/sim/import_usd.py +174 -102
  147. warp/sim/inertia.py +17 -18
  148. warp/sim/integrator_xpbd.py +4 -3
  149. warp/sim/model.py +330 -250
  150. warp/sim/render.py +1 -1
  151. warp/sparse.py +625 -152
  152. warp/stubs.py +341 -309
  153. warp/tape.py +9 -6
  154. warp/tests/__main__.py +3 -6
  155. warp/tests/assets/curlnoise_golden.npy +0 -0
  156. warp/tests/assets/pnoise_golden.npy +0 -0
  157. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  158. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  159. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  160. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  161. warp/tests/aux_test_unresolved_func.py +14 -0
  162. warp/tests/aux_test_unresolved_symbol.py +14 -0
  163. warp/tests/disabled_kinematics.py +239 -0
  164. warp/tests/run_coverage_serial.py +31 -0
  165. warp/tests/test_adam.py +103 -106
  166. warp/tests/test_arithmetic.py +94 -74
  167. warp/tests/test_array.py +82 -101
  168. warp/tests/test_array_reduce.py +57 -23
  169. warp/tests/test_atomic.py +64 -28
  170. warp/tests/test_bool.py +22 -12
  171. warp/tests/test_builtins_resolution.py +1292 -0
  172. warp/tests/test_bvh.py +18 -18
  173. warp/tests/test_closest_point_edge_edge.py +54 -57
  174. warp/tests/test_codegen.py +165 -134
  175. warp/tests/test_compile_consts.py +28 -20
  176. warp/tests/test_conditional.py +108 -24
  177. warp/tests/test_copy.py +10 -12
  178. warp/tests/test_ctypes.py +112 -88
  179. warp/tests/test_dense.py +21 -14
  180. warp/tests/test_devices.py +98 -0
  181. warp/tests/test_dlpack.py +75 -75
  182. warp/tests/test_examples.py +237 -0
  183. warp/tests/test_fabricarray.py +22 -24
  184. warp/tests/test_fast_math.py +15 -11
  185. warp/tests/test_fem.py +1034 -124
  186. warp/tests/test_fp16.py +23 -16
  187. warp/tests/test_func.py +187 -86
  188. warp/tests/test_generics.py +194 -49
  189. warp/tests/test_grad.py +123 -181
  190. warp/tests/test_grad_customs.py +176 -0
  191. warp/tests/test_hash_grid.py +35 -34
  192. warp/tests/test_import.py +10 -23
  193. warp/tests/test_indexedarray.py +24 -25
  194. warp/tests/test_intersect.py +18 -9
  195. warp/tests/test_large.py +141 -0
  196. warp/tests/test_launch.py +14 -41
  197. warp/tests/test_lerp.py +64 -65
  198. warp/tests/test_lvalue.py +493 -0
  199. warp/tests/test_marching_cubes.py +12 -13
  200. warp/tests/test_mat.py +517 -2898
  201. warp/tests/test_mat_lite.py +115 -0
  202. warp/tests/test_mat_scalar_ops.py +2889 -0
  203. warp/tests/test_math.py +103 -9
  204. warp/tests/test_matmul.py +304 -69
  205. warp/tests/test_matmul_lite.py +410 -0
  206. warp/tests/test_mesh.py +60 -22
  207. warp/tests/test_mesh_query_aabb.py +21 -25
  208. warp/tests/test_mesh_query_point.py +111 -22
  209. warp/tests/test_mesh_query_ray.py +12 -24
  210. warp/tests/test_mlp.py +30 -22
  211. warp/tests/test_model.py +92 -89
  212. warp/tests/test_modules_lite.py +39 -0
  213. warp/tests/test_multigpu.py +88 -114
  214. warp/tests/test_noise.py +12 -11
  215. warp/tests/test_operators.py +16 -20
  216. warp/tests/test_options.py +11 -11
  217. warp/tests/test_pinned.py +17 -18
  218. warp/tests/test_print.py +32 -11
  219. warp/tests/test_quat.py +275 -129
  220. warp/tests/test_rand.py +18 -16
  221. warp/tests/test_reload.py +38 -34
  222. warp/tests/test_rounding.py +50 -43
  223. warp/tests/test_runlength_encode.py +168 -20
  224. warp/tests/test_smoothstep.py +9 -11
  225. warp/tests/test_snippet.py +143 -0
  226. warp/tests/test_sparse.py +261 -63
  227. warp/tests/test_spatial.py +276 -243
  228. warp/tests/test_streams.py +110 -85
  229. warp/tests/test_struct.py +268 -63
  230. warp/tests/test_tape.py +39 -21
  231. warp/tests/test_torch.py +90 -86
  232. warp/tests/test_transient_module.py +10 -12
  233. warp/tests/test_types.py +363 -0
  234. warp/tests/test_utils.py +451 -0
  235. warp/tests/test_vec.py +354 -2050
  236. warp/tests/test_vec_lite.py +73 -0
  237. warp/tests/test_vec_scalar_ops.py +2099 -0
  238. warp/tests/test_volume.py +418 -376
  239. warp/tests/test_volume_write.py +124 -134
  240. warp/tests/unittest_serial.py +35 -0
  241. warp/tests/unittest_suites.py +291 -0
  242. warp/tests/unittest_utils.py +342 -0
  243. warp/tests/{test_misc.py → unused_test_misc.py} +13 -5
  244. warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
  245. warp/thirdparty/appdirs.py +36 -45
  246. warp/thirdparty/unittest_parallel.py +589 -0
  247. warp/types.py +622 -211
  248. warp/utils.py +54 -393
  249. warp_lang-1.0.0b6.dist-info/METADATA +238 -0
  250. warp_lang-1.0.0b6.dist-info/RECORD +409 -0
  251. {warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/WHEEL +1 -1
  252. examples/example_cache_management.py +0 -40
  253. examples/example_multigpu.py +0 -54
  254. examples/example_struct.py +0 -65
  255. examples/fem/example_stokes_transfer_3d.py +0 -210
  256. warp/bin/warp-clang.so +0 -0
  257. warp/bin/warp.so +0 -0
  258. warp/fem/field/discrete_field.py +0 -80
  259. warp/fem/space/nodal_function_space.py +0 -233
  260. warp/tests/test_all.py +0 -223
  261. warp/tests/test_array_scan.py +0 -60
  262. warp/tests/test_base.py +0 -208
  263. warp/tests/test_unresolved_func.py +0 -7
  264. warp/tests/test_unresolved_symbol.py +0 -7
  265. warp_lang-1.0.0b2.dist-info/METADATA +0 -26
  266. warp_lang-1.0.0b2.dist-info/RECORD +0 -380
  267. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  268. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  269. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  270. {warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/LICENSE.md +0 -0
  271. {warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/top_level.txt +0 -0
warp/tests/test_math.py CHANGED
@@ -5,13 +5,13 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
-from typing import NamedTuple
 import unittest
+from typing import NamedTuple
 
 import numpy as np
 
 import warp as wp
-from warp.tests.test_base import *
+from warp.tests.unittest_utils import *
 
 wp.init()
 
@@ -84,16 +84,110 @@ def test_scalar_math(test, device):
     )
 
 
-def register(parent):
-    devices = get_test_devices()
+def test_vec_type(test, device):
+    vec5 = wp.vec(length=5, dtype=float)
+    v = vec5()
+    w = vec5()
+    a = vec5(1.0)
+    b = vec5(0.0, 0.0, 0.0, 0.0, 0.0)
+    c = vec5(0.0)
+
+    v[0] = 1.0
+    v.x = 0.0
+    v[1:] = [1.0, 1.0, 1.0, 1.0]
+
+    w[0] = 1.0
+    w[1:] = [0.0, 0.0, 0.0, 0.0]
+
+    if v[0] != w[1] or v.x != w.y:
+        raise ValueError("vec setter error")
+
+    for x in v[1:]:
+        if x != 1.0:
+            raise ValueError("vec slicing error")
+
+    if b != c:
+        raise ValueError("vec equality error")
+
+    if str(v) != "[0.0, 1.0, 1.0, 1.0, 1.0]":
+        raise ValueError("vec to string error")
+
+
+def test_mat_type(test, device):
+    mat55 = wp.mat(shape=(5, 5), dtype=float)
+    m1 = mat55()
+    m2 = mat55()
+
+    for i in range(5):
+        for j in range(5):
+            if i == j:
+                m1[i, j] = 1.0
+            else:
+                m1[i, j] = 0.0
+
+    for i in range(5):
+        m2[i] = [1.0, 1.0, 1.0, 1.0, 1.0]
+
+    a = mat55(1.0)
+    b = mat55(
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+    )
+
+    if m1 != b:
+        raise ValueError("mat element setting error")
+
+    if m2 != a:
+        raise ValueError("mat row setting error")
+
+    if m1[0, 0] != 1.0:
+        raise ValueError("mat element getting error")
+
+    if m2[0] != [1.0, 1.0, 1.0, 1.0, 1.0]:
+        raise ValueError("mat row getting error")
+
+    if (
+        str(b)
+        != "[[1.0, 0.0, 0.0, 0.0, 0.0],\n [0.0, 1.0, 0.0, 0.0, 0.0],\n [0.0, 0.0, 1.0, 0.0, 0.0],\n [0.0, 0.0, 0.0, 1.0, 0.0],\n [0.0, 0.0, 0.0, 0.0, 1.0]]"
+    ):
+        raise ValueError("mat to string error")
+
+
+devices = get_test_devices()
+
+
+class TestMath(unittest.TestCase):
+    pass
 
-    class TestMath(parent):
-        pass
 
-    add_function_test(TestMath, "test_scalar_math", test_scalar_math, devices=devices)
-    return TestMath
+add_function_test(TestMath, "test_scalar_math", test_scalar_math, devices=devices)
+add_function_test(TestMath, "test_vec_type", test_vec_type, devices=devices)
+add_function_test(TestMath, "test_mat_type", test_mat_type, devices=devices)
 
 
 if __name__ == "__main__":
-    _ = register(unittest.TestCase)
+    wp.build.clear_kernel_cache()
     unittest.main(verbosity=2)
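
A note on the pattern above, which recurs across most test modules in this release: the per-module register(parent) factory is gone, and test cases are now declared at module scope and populated with add_function_test, with each module clearing the kernel cache before running standalone. A minimal sketch of the new layout, assuming add_function_test and get_test_devices keep the call shapes seen in the diff (the TestExample/test_example names are illustrative only):

    import unittest

    import warp as wp
    from warp.tests.unittest_utils import *  # add_function_test, get_test_devices

    wp.init()


    def test_example(test, device):
        # test functions receive the TestCase instance and a device string
        vec5 = wp.vec(length=5, dtype=float)
        v = vec5(1.0)
        test.assertEqual(v[0], 1.0)


    devices = get_test_devices()


    class TestExample(unittest.TestCase):
        pass


    # presumably generates one test method per device on TestExample
    add_function_test(TestExample, "test_example", test_example, devices=devices)

    if __name__ == "__main__":
        wp.build.clear_kernel_cache()
        unittest.main(verbosity=2)
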
warp/tests/test_matmul.py CHANGED
@@ -1,88 +1,107 @@
-import numpy as np
+# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
 import unittest
 
-import warp as wp
-from warp.tests.test_base import *
+import numpy as np
 
-np.random.seed(0)
+import warp as wp
+from warp.tests.unittest_utils import *
 
 wp.init()
-wp.config.mode = "debug"
+
+from warp.context import runtime  # noqa: E402
 
 
-class GemmTestbedRunner:
+class gemm_test_bed_runner:
     def __init__(self, dtype, device):
        self.dtype = dtype
         self.device = device
 
     def alloc(self, m, n, k, batch_count):
+        rng = np.random.default_rng(42)
         low = -4.5
         high = 3.5
         if batch_count == 1:
             A = wp.array2d(
-                np.ceil(np.random.uniform(low=low, high=high, size=(m, k))), dtype=self.dtype, device=self.device
+                np.ceil(rng.uniform(low=low, high=high, size=(m, k))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
             )
             B = wp.array2d(
-                np.ceil(np.random.uniform(low=low, high=high, size=(k, n))), dtype=self.dtype, device=self.device
+                np.ceil(rng.uniform(low=low, high=high, size=(k, n))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
             )
             C = wp.array2d(
-                np.ceil(np.random.uniform(low=low, high=high, size=(m, n))), dtype=self.dtype, device=self.device
+                np.ceil(rng.uniform(low=low, high=high, size=(m, n))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
             )
-            D = wp.array2d(np.zeros((m, n)), dtype=self.dtype, device=self.device)
-            adj_A = wp.array2d(np.zeros((m, k)), dtype=self.dtype, device=self.device)
-            adj_B = wp.array2d(np.zeros((k, n)), dtype=self.dtype, device=self.device)
-            adj_C = wp.array2d(np.zeros((m, n)), dtype=self.dtype, device=self.device)
-            adj_D = wp.array2d(np.ones((m, n)), dtype=self.dtype, device=self.device)
+            D = wp.array2d(np.zeros((m, n)), dtype=self.dtype, device=self.device, requires_grad=True)
         else:
-            A = wp.array2d(
-                np.ceil(np.random.uniform(low=low, high=high, size=(batch_count, m, k))),
+            A = wp.array3d(
+                np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, k))),
                 dtype=self.dtype,
                 device=self.device,
+                requires_grad=True,
             )
-            B = wp.array2d(
-                np.ceil(np.random.uniform(low=low, high=high, size=(batch_count, k, n))),
+            B = wp.array3d(
+                np.ceil(rng.uniform(low=low, high=high, size=(batch_count, k, n))),
                 dtype=self.dtype,
                 device=self.device,
+                requires_grad=True,
             )
-            C = wp.array2d(
-                np.ceil(np.random.uniform(low=low, high=high, size=(batch_count, m, n))),
+            C = wp.array3d(
+                np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, n))),
                 dtype=self.dtype,
                 device=self.device,
+                requires_grad=True,
             )
-            D = wp.array2d(np.zeros((batch_count, m, n)), dtype=self.dtype, device=self.device)
-            adj_A = wp.array2d(np.zeros((batch_count, m, k)), dtype=self.dtype, device=self.device)
-            adj_B = wp.array2d(np.zeros((batch_count, k, n)), dtype=self.dtype, device=self.device)
-            adj_C = wp.array2d(np.zeros((batch_count, m, n)), dtype=self.dtype, device=self.device)
-            adj_D = wp.array2d(np.ones((batch_count, m, n)), dtype=self.dtype, device=self.device)
-        return A, B, C, D, adj_A, adj_B, adj_C, adj_D
+            D = wp.array3d(np.zeros((batch_count, m, n)), dtype=self.dtype, device=self.device, requires_grad=True)
+        return A, B, C, D
 
     def run_and_verify(self, m, n, k, batch_count, alpha, beta):
-        A, B, C, D, adj_A, adj_B, adj_C, adj_D = self.alloc(m, n, k, batch_count)
+        A, B, C, D = self.alloc(m, n, k, batch_count)
+        ones = wp.zeros_like(D)
+        ones.fill_(1.0)
+
         if batch_count == 1:
-            wp.matmul(A, B, C, D, alpha, beta, False, self.device)
+            tape = wp.Tape()
+            with tape:
+                wp.matmul(A, B, C, D, alpha, beta, False, self.device)
+            tape.backward(grads={D: ones})
+
             D_np = alpha * (A.numpy() @ B.numpy()) + beta * C.numpy()
             assert np.array_equal(D_np, D.numpy())
 
-            wp.adj_matmul(A, B, C, adj_A, adj_B, adj_C, adj_D, alpha, beta, False, self.device)
-            adj_A_np = alpha * np.matmul(adj_D.numpy(), B.numpy().transpose())
-            adj_B_np = alpha * (A.numpy().transpose() @ adj_D.numpy())
-            adj_C_np = beta * adj_D.numpy()
+            adj_A_np = alpha * np.matmul(ones.numpy(), B.numpy().transpose())
+            adj_B_np = alpha * (A.numpy().transpose() @ ones.numpy())
+            adj_C_np = beta * ones.numpy()
 
-            assert np.array_equal(adj_A_np, adj_A.numpy())
-            assert np.array_equal(adj_B_np, adj_B.numpy())
-            assert np.array_equal(adj_C_np, adj_C.numpy())
         else:
-            wp.batched_matmul(A, B, C, D, alpha, beta, False, self.device)
+            tape = wp.Tape()
+            with tape:
+                wp.batched_matmul(A, B, C, D, alpha, beta, False, self.device)
+            tape.backward(grads={D: ones})
+
             D_np = alpha * np.matmul(A.numpy(), B.numpy()) + beta * C.numpy()
             assert np.array_equal(D_np, D.numpy())
 
-            wp.adj_batched_matmul(A, B, C, adj_A, adj_B, adj_C, adj_D, alpha, beta, False, self.device)
-            adj_A_np = alpha * np.matmul(adj_D.numpy(), B.numpy().transpose((0, 2, 1)))
-            adj_B_np = alpha * np.matmul(A.numpy().transpose((0, 2, 1)), adj_D.numpy())
-            adj_C_np = beta * adj_D.numpy()
-            assert np.array_equal(adj_A_np, adj_A.numpy())
-            assert np.array_equal(adj_B_np, adj_B.numpy())
-            assert np.array_equal(adj_C_np, adj_C.numpy())
+            adj_A_np = alpha * np.matmul(ones.numpy(), B.numpy().transpose((0, 2, 1)))
+            adj_B_np = alpha * np.matmul(A.numpy().transpose((0, 2, 1)), ones.numpy())
+            adj_C_np = beta * ones.numpy()
+
+        assert np.array_equal(adj_A_np, A.grad.numpy())
+        assert np.array_equal(adj_B_np, B.grad.numpy())
+        assert np.array_equal(adj_C_np, C.grad.numpy())
 
     def run(self):
         Ms = [64, 128, 512]
@@ -100,17 +119,156 @@ class GemmTestbedRunner:
                             self.run_and_verify(m, n, k, batch_count, alpha, beta)
 
 
+class gemm_test_bed_runner_transpose:
+    def __init__(self, dtype, device):
+        self.dtype = dtype
+        self.device = device
+
+    def alloc(self, m, n, k, batch_count):
+        rng = np.random.default_rng(42)
+        low = -4.5
+        high = 3.5
+        if batch_count == 1:
+            A = wp.array2d(
+                np.ceil(rng.uniform(low=low, high=high, size=(m, k))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            B = wp.array2d(
+                np.ceil(rng.uniform(low=low, high=high, size=(k, n))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            C = wp.array2d(
+                np.ceil(rng.uniform(low=low, high=high, size=(m, n))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            D = wp.array2d(np.zeros((m, n)), dtype=self.dtype, device=self.device, requires_grad=True)
+            AT = wp.array2d(A.numpy().transpose([1, 0]), dtype=self.dtype, device=self.device, requires_grad=True)
+            BT = wp.array2d(B.numpy().transpose([1, 0]), dtype=self.dtype, device=self.device, requires_grad=True)
+        else:
+            A = wp.array3d(
+                np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, k))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            B = wp.array3d(
+                np.ceil(rng.uniform(low=low, high=high, size=(batch_count, k, n))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            C = wp.array3d(
+                np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, n))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            D = wp.array3d(np.zeros((batch_count, m, n)), dtype=self.dtype, device=self.device, requires_grad=True)
+            AT = wp.array3d(A.numpy().transpose([0, 2, 1]), dtype=self.dtype, device=self.device, requires_grad=True)
+            BT = wp.array3d(B.numpy().transpose([0, 2, 1]), dtype=self.dtype, device=self.device, requires_grad=True)
+        return A, B, C, D, AT, BT
+
+    def run_and_verify(self, m, n, k, batch_count, alpha, beta):
+        A, B, C1, D1, AT1, BT1 = self.alloc(m, n, k, batch_count)
+        C2 = wp.clone(C1)
+        C3 = wp.clone(C1)
+        D2 = wp.clone(D1)
+        D3 = wp.clone(D1)
+        AT2 = wp.clone(AT1)
+        BT2 = wp.clone(BT1)
+        ones1 = wp.zeros_like(D1)
+        ones1.fill_(1.0)
+        ones2 = wp.zeros_like(D2)
+        ones2.fill_(1.0)
+        ones3 = wp.zeros_like(D3)
+        ones3.fill_(1.0)
+
+        if batch_count == 1:
+            ATT1 = AT1.transpose([1, 0])
+            BTT1 = BT1.transpose([1, 0])
+            ATT2 = AT2.transpose([1, 0])
+            BTT2 = BT2.transpose([1, 0])
+            tape = wp.Tape()
+            with tape:
+                wp.matmul(A, BTT1, C1, D1, alpha, beta, False, self.device)
+                wp.matmul(ATT1, B, C2, D2, alpha, beta, False, self.device)
+                wp.matmul(ATT2, BTT2, C3, D3, alpha, beta, False, self.device)
+            tape.backward(grads={D1: ones1, D2: ones2, D3: ones3})
+
+            D_np = alpha * (A.numpy() @ B.numpy()) + beta * C1.numpy()
+            assert np.array_equal(D_np, D1.numpy())
+            assert np.array_equal(D_np, D2.numpy())
+            assert np.array_equal(D_np, D3.numpy())
+
+            adj_A_np = alpha * (ones1.numpy() @ B.numpy().transpose())
+            adj_B_np = alpha * (A.numpy().transpose() @ ones1.numpy())
+            adj_C_np = beta * ones1.numpy()
+
+        else:
+            ATT1 = AT1.transpose([0, 2, 1])
+            BTT1 = BT1.transpose([0, 2, 1])
+            ATT2 = AT2.transpose([0, 2, 1])
+            BTT2 = BT2.transpose([0, 2, 1])
+            tape = wp.Tape()
+            with tape:
+                wp.batched_matmul(A, BTT1, C1, D1, alpha, beta, False, self.device)
+                wp.batched_matmul(ATT1, B, C2, D2, alpha, beta, False, self.device)
+                wp.batched_matmul(ATT2, BTT2, C3, D3, alpha, beta, False, self.device)
+            tape.backward(grads={D1: ones1, D2: ones2, D3: ones3})
+
+            D_np = alpha * np.matmul(A.numpy(), B.numpy()) + beta * C1.numpy()
+            assert np.array_equal(D_np, D1.numpy())
+            assert np.array_equal(D_np, D2.numpy())
+            assert np.array_equal(D_np, D3.numpy())
+
+            adj_A_np = alpha * np.matmul(ones1.numpy(), B.numpy().transpose((0, 2, 1)))
+            adj_B_np = alpha * np.matmul(A.numpy().transpose((0, 2, 1)), ones1.numpy())
+            adj_C_np = beta * ones1.numpy()
+
+        assert np.array_equal(adj_A_np, A.grad.numpy())
+        assert np.array_equal(adj_A_np, ATT1.grad.numpy())
+        assert np.array_equal(adj_A_np, ATT2.grad.numpy())
+        assert np.array_equal(adj_B_np, B.grad.numpy())
+        assert np.array_equal(adj_B_np, BTT1.grad.numpy())
+        assert np.array_equal(adj_B_np, BTT2.grad.numpy())
+        assert np.array_equal(adj_C_np, C1.grad.numpy())
+        assert np.array_equal(adj_C_np, C2.grad.numpy())
+        assert np.array_equal(adj_C_np, C3.grad.numpy())
+
+    def run(self):
+        m = 16
+        n = 32
+        k = 64
+        batch_counts = [1, 4]
+        beta = 1.0
+        alpha = 1.0
+
+        for batch_count in batch_counts:
+            self.run_and_verify(m, n, k, batch_count, alpha, beta)
+
+
 # NOTE: F16 tests are slow due to the performance of the reference numpy F16 matmuls performed on CPU.
 def test_f16(test, device):
-    GemmTestbedRunner(wp.float16, device).run()
+    gemm_test_bed_runner(wp.float16, device).run()
+    gemm_test_bed_runner_transpose(wp.float16, device).run()
 
 
+@unittest.skipUnless(runtime.core.is_cutlass_enabled(), "Warp was not built with CUTLASS support")
 def test_f32(test, device):
-    GemmTestbedRunner(wp.float32, device).run()
+    gemm_test_bed_runner(wp.float32, device).run()
+    gemm_test_bed_runner_transpose(wp.float32, device).run()
 
 
+@unittest.skipUnless(runtime.core.is_cutlass_enabled(), "Warp was not built with CUTLASS support")
 def test_f64(test, device):
-    GemmTestbedRunner(wp.float64, device).run()
+    gemm_test_bed_runner(wp.float64, device).run()
+    gemm_test_bed_runner_transpose(wp.float64, device).run()
 
 
 @wp.kernel
@@ -119,20 +277,22 @@ def matrix_sum_kernel(arr: wp.array2d(dtype=float), loss: wp.array(dtype=float))
     wp.atomic_add(loss, 0, arr[i, j])
 
 
+@unittest.skipUnless(runtime.core.is_cutlass_enabled(), "Warp was not built with CUTLASS support")
 def test_tape(test, device):
+    rng = np.random.default_rng(42)
     low = -4.5
     high = 3.5
     m = 64
     n = 128
     k = 256
     A = wp.array2d(
-        np.ceil(np.random.uniform(low=low, high=high, size=(m, k))), dtype=float, device=device, requires_grad=True
+        np.ceil(rng.uniform(low=low, high=high, size=(m, k))), dtype=float, device=device, requires_grad=True
     )
     B = wp.array2d(
-        np.ceil(np.random.uniform(low=low, high=high, size=(k, n))), dtype=float, device=device, requires_grad=True
+        np.ceil(rng.uniform(low=low, high=high, size=(k, n))), dtype=float, device=device, requires_grad=True
     )
     C = wp.array2d(
-        np.ceil(np.random.uniform(low=low, high=high, size=(m, n))), dtype=float, device=device, requires_grad=True
+        np.ceil(rng.uniform(low=low, high=high, size=(m, n))), dtype=float, device=device, requires_grad=True
    )
     D = wp.array2d(np.zeros((m, n)), dtype=float, device=device, requires_grad=True)
     loss = wp.zeros(1, dtype=float, device=device, requires_grad=True)
@@ -145,6 +305,7 @@ def test_tape(test, device):
 
     tape.backward(loss=loss)
     A_grad = A.grad.numpy()
+    tape.reset()
 
     # test adjoint
     D.grad = wp.array2d(np.ones((m, n)), dtype=float, device=device)
@@ -156,17 +317,19 @@ def test_tape(test, device):
     assert_array_equal(A.grad, wp.zeros_like(A))
 
 
+@unittest.skipUnless(runtime.core.is_cutlass_enabled(), "Warp was not built with CUTLASS support")
 def test_operator(test, device):
+    rng = np.random.default_rng(42)
     low = -4.5
     high = 3.5
     m = 64
     n = 128
     k = 256
     A = wp.array2d(
-        np.ceil(np.random.uniform(low=low, high=high, size=(m, k))), dtype=float, device=device, requires_grad=True
+        np.ceil(rng.uniform(low=low, high=high, size=(m, k))), dtype=float, device=device, requires_grad=True
     )
     B = wp.array2d(
-        np.ceil(np.random.uniform(low=low, high=high, size=(k, n))), dtype=float, device=device, requires_grad=True
+        np.ceil(rng.uniform(low=low, high=high, size=(k, n))), dtype=float, device=device, requires_grad=True
     )
     loss = wp.zeros(1, dtype=float, device=device, requires_grad=True)
 
@@ -180,7 +343,6 @@ def test_operator(test, device):
 
     # test adjoint
     D.grad = wp.array2d(np.ones((m, n)), dtype=float, device=device)
-    # deep copy, needed because transpose data is not contiguous
    B_transpose = wp.array2d(B.transpose().numpy(), dtype=float, device=device)
 
     adj_A = D.grad @ B_transpose
@@ -191,28 +353,101 @@
     assert_array_equal(A.grad, wp.zeros_like(A))
 
 
-def register(parent):
-    devices = [d for d in get_test_devices()]
+@unittest.skipUnless(runtime.core.is_cutlass_enabled(), "Warp was not built with CUTLASS support")
+def test_large_batch_count(test, device):
+    rng = np.random.default_rng(42)
+    low = -4.5
+    high = 3.5
+    m = 2
+    n = 3
+    k = 4
+    batch_count = 65535 * 2 + int(65535 / 2)
+    A = wp.array3d(
+        np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, k))),
+        dtype=float,
+        device=device,
+        requires_grad=True,
+    )
+    B = wp.array3d(
+        np.ceil(rng.uniform(low=low, high=high, size=(batch_count, k, n))),
+        dtype=float,
+        device=device,
+        requires_grad=True,
+    )
+    C = wp.array3d(
+        np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, n))),
+        dtype=float,
+        device=device,
+        requires_grad=True,
+    )
+    D = wp.array3d(np.zeros((batch_count, m, n)), dtype=float, device=device, requires_grad=True)
+    ones = wp.zeros_like(D)
+    ones.fill_(1.0)
+
+    alpha = 1.0
+    beta = 1.0
 
-    class TestMatmul(parent):
-        pass
+    tape = wp.Tape()
+    with tape:
+        wp.batched_matmul(A, B, C, D, alpha=alpha, beta=beta, allow_tf32x3_arith=False, device=device)
+    tape.backward(grads={D: ones})
 
-    if devices:
-        # check if CUTLASS is available
-        from warp.context import runtime
+    D_np = alpha * np.matmul(A.numpy(), B.numpy()) + beta * C.numpy()
+    assert np.array_equal(D_np, D.numpy())
+
+    adj_A_np = alpha * np.matmul(ones.numpy(), B.numpy().transpose((0, 2, 1)))
+    adj_B_np = alpha * np.matmul(A.numpy().transpose((0, 2, 1)), ones.numpy())
+    adj_C_np = beta * ones.numpy()
+
+    assert np.array_equal(adj_A_np, A.grad.numpy())
+    assert np.array_equal(adj_B_np, B.grad.numpy())
+    assert np.array_equal(adj_C_np, C.grad.numpy())
+
+
+def test_adjoint_accumulation(test, device):
+    a_np = np.ones(shape=(2, 3))
+    b_np = np.ones(shape=(3, 2))
+    c_np = np.zeros(shape=(2, 2))
+    d_np = np.zeros(shape=(2, 2))
+
+    a_wp = wp.from_numpy(a_np, dtype=float, requires_grad=True)
+    b_wp = wp.from_numpy(b_np, dtype=float, requires_grad=True)
+    c_wp = wp.from_numpy(c_np, dtype=float, requires_grad=True)
+    d1_wp = wp.from_numpy(d_np, dtype=float, requires_grad=True)
+    d2_wp = wp.from_numpy(d_np, dtype=float, requires_grad=True)
+
+    tape = wp.Tape()
+
+    with tape:
+        wp.matmul(a_wp, b_wp, c_wp, d1_wp, alpha=1.0, beta=1.0)
+        wp.matmul(a_wp, b_wp, d1_wp, d2_wp, alpha=1.0, beta=1.0)
+
+    d_grad = wp.zeros_like(d2_wp)
+    d_grad.fill_(1.0)
+    grads = {d2_wp: d_grad}
+    tape.backward(grads=grads)
+
+    assert np.array_equal(a_wp.grad.numpy(), 4.0 * np.ones(shape=(2, 3)))
+    assert np.array_equal(b_wp.grad.numpy(), 4.0 * np.ones(shape=(3, 2)))
+    assert np.array_equal(c_wp.grad.numpy(), np.ones(shape=(2, 2)))
+
+
+devices = get_test_devices()
+
+
+class TestMatmul(unittest.TestCase):
+    pass
 
-        if runtime.core.is_cutlass_enabled():
-            # add_function_test(TestMatmul, "test_f16", test_f16, devices=devices)
-            add_function_test(TestMatmul, "test_f32", test_f32, devices=devices)
-            add_function_test(TestMatmul, "test_f64", test_f64, devices=devices)
-            add_function_test(TestMatmul, "test_tape", test_tape, devices=devices)
-            add_function_test(TestMatmul, "test_operator", test_operator, devices=devices)
-        else:
-            print("Skipping matmul tests because CUTLASS is not supported in this build")
 
-    return TestMatmul
+# add_function_test(TestMatmul, "test_f16", test_f16, devices=devices)
+add_function_test(TestMatmul, "test_f32", test_f32, devices=devices)
add_function_test(TestMatmul, "test_f64", test_f64, devices=devices)
+add_function_test(TestMatmul, "test_tape", test_tape, devices=devices)
+add_function_test(TestMatmul, "test_operator", test_operator, devices=devices)
+add_function_test(TestMatmul, "test_large_batch_count", test_large_batch_count, devices=devices)
+add_function_test(TestMatmul, "test_adjoint_accumulation", test_adjoint_accumulation, devices=devices)
 
 
 if __name__ == "__main__":
-    c = register(unittest.TestCase)
+    wp.build.clear_kernel_cache()
     unittest.main(verbosity=2, failfast=False)