warp-lang 1.0.0b2__py3-none-win_amd64.whl → 1.0.0b6__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of warp-lang might be problematic.

Files changed (271)
  1. docs/conf.py +17 -5
  2. examples/env/env_ant.py +1 -1
  3. examples/env/env_cartpole.py +1 -1
  4. examples/env/env_humanoid.py +1 -1
  5. examples/env/env_usd.py +4 -1
  6. examples/env/environment.py +8 -9
  7. examples/example_dem.py +34 -33
  8. examples/example_diffray.py +364 -337
  9. examples/example_fluid.py +32 -23
  10. examples/example_jacobian_ik.py +97 -93
  11. examples/example_marching_cubes.py +6 -16
  12. examples/example_mesh.py +6 -16
  13. examples/example_mesh_intersect.py +16 -14
  14. examples/example_nvdb.py +14 -16
  15. examples/example_raycast.py +14 -13
  16. examples/example_raymarch.py +16 -23
  17. examples/example_render_opengl.py +19 -10
  18. examples/example_sim_cartpole.py +82 -78
  19. examples/example_sim_cloth.py +45 -48
  20. examples/example_sim_fk_grad.py +51 -44
  21. examples/example_sim_fk_grad_torch.py +47 -40
  22. examples/example_sim_grad_bounce.py +108 -133
  23. examples/example_sim_grad_cloth.py +99 -113
  24. examples/example_sim_granular.py +5 -6
  25. examples/{example_sim_sdf_shape.py → example_sim_granular_collision_sdf.py} +37 -26
  26. examples/example_sim_neo_hookean.py +51 -55
  27. examples/example_sim_particle_chain.py +4 -4
  28. examples/example_sim_quadruped.py +126 -81
  29. examples/example_sim_rigid_chain.py +54 -61
  30. examples/example_sim_rigid_contact.py +66 -70
  31. examples/example_sim_rigid_fem.py +3 -3
  32. examples/example_sim_rigid_force.py +1 -1
  33. examples/example_sim_rigid_gyroscopic.py +3 -4
  34. examples/example_sim_rigid_kinematics.py +28 -39
  35. examples/example_sim_trajopt.py +112 -110
  36. examples/example_sph.py +9 -8
  37. examples/example_wave.py +7 -7
  38. examples/fem/bsr_utils.py +30 -17
  39. examples/fem/example_apic_fluid.py +85 -69
  40. examples/fem/example_convection_diffusion.py +97 -93
  41. examples/fem/example_convection_diffusion_dg.py +142 -149
  42. examples/fem/example_convection_diffusion_dg0.py +141 -136
  43. examples/fem/example_deformed_geometry.py +146 -0
  44. examples/fem/example_diffusion.py +115 -84
  45. examples/fem/example_diffusion_3d.py +116 -86
  46. examples/fem/example_diffusion_mgpu.py +102 -79
  47. examples/fem/example_mixed_elasticity.py +139 -100
  48. examples/fem/example_navier_stokes.py +175 -162
  49. examples/fem/example_stokes.py +143 -111
  50. examples/fem/example_stokes_transfer.py +186 -157
  51. examples/fem/mesh_utils.py +59 -97
  52. examples/fem/plot_utils.py +138 -17
  53. tools/ci/publishing/build_nodes_info.py +54 -0
  54. warp/__init__.py +4 -3
  55. warp/__init__.pyi +1 -0
  56. warp/bin/warp-clang.dll +0 -0
  57. warp/bin/warp.dll +0 -0
  58. warp/build.py +5 -3
  59. warp/build_dll.py +29 -9
  60. warp/builtins.py +836 -492
  61. warp/codegen.py +864 -553
  62. warp/config.py +3 -1
  63. warp/context.py +389 -172
  64. warp/fem/__init__.py +24 -6
  65. warp/fem/cache.py +318 -25
  66. warp/fem/dirichlet.py +7 -3
  67. warp/fem/domain.py +14 -0
  68. warp/fem/field/__init__.py +30 -38
  69. warp/fem/field/field.py +149 -0
  70. warp/fem/field/nodal_field.py +244 -138
  71. warp/fem/field/restriction.py +8 -6
  72. warp/fem/field/test.py +127 -59
  73. warp/fem/field/trial.py +117 -60
  74. warp/fem/geometry/__init__.py +5 -1
  75. warp/fem/geometry/deformed_geometry.py +271 -0
  76. warp/fem/geometry/element.py +24 -1
  77. warp/fem/geometry/geometry.py +86 -14
  78. warp/fem/geometry/grid_2d.py +112 -54
  79. warp/fem/geometry/grid_3d.py +134 -65
  80. warp/fem/geometry/hexmesh.py +953 -0
  81. warp/fem/geometry/partition.py +85 -33
  82. warp/fem/geometry/quadmesh_2d.py +532 -0
  83. warp/fem/geometry/tetmesh.py +451 -115
  84. warp/fem/geometry/trimesh_2d.py +197 -92
  85. warp/fem/integrate.py +534 -268
  86. warp/fem/operator.py +58 -31
  87. warp/fem/polynomial.py +11 -0
  88. warp/fem/quadrature/__init__.py +1 -1
  89. warp/fem/quadrature/pic_quadrature.py +150 -58
  90. warp/fem/quadrature/quadrature.py +209 -57
  91. warp/fem/space/__init__.py +230 -53
  92. warp/fem/space/basis_space.py +489 -0
  93. warp/fem/space/collocated_function_space.py +105 -0
  94. warp/fem/space/dof_mapper.py +49 -2
  95. warp/fem/space/function_space.py +90 -39
  96. warp/fem/space/grid_2d_function_space.py +149 -496
  97. warp/fem/space/grid_3d_function_space.py +173 -538
  98. warp/fem/space/hexmesh_function_space.py +352 -0
  99. warp/fem/space/partition.py +129 -76
  100. warp/fem/space/quadmesh_2d_function_space.py +369 -0
  101. warp/fem/space/restriction.py +46 -34
  102. warp/fem/space/shape/__init__.py +15 -0
  103. warp/fem/space/shape/cube_shape_function.py +738 -0
  104. warp/fem/space/shape/shape_function.py +103 -0
  105. warp/fem/space/shape/square_shape_function.py +611 -0
  106. warp/fem/space/shape/tet_shape_function.py +567 -0
  107. warp/fem/space/shape/triangle_shape_function.py +429 -0
  108. warp/fem/space/tetmesh_function_space.py +132 -1039
  109. warp/fem/space/topology.py +295 -0
  110. warp/fem/space/trimesh_2d_function_space.py +104 -742
  111. warp/fem/types.py +13 -11
  112. warp/fem/utils.py +335 -60
  113. warp/native/array.h +120 -34
  114. warp/native/builtin.h +101 -72
  115. warp/native/bvh.cpp +73 -325
  116. warp/native/bvh.cu +406 -23
  117. warp/native/bvh.h +22 -40
  118. warp/native/clang/clang.cpp +1 -0
  119. warp/native/crt.h +2 -0
  120. warp/native/cuda_util.cpp +8 -3
  121. warp/native/cuda_util.h +1 -0
  122. warp/native/exports.h +1522 -1243
  123. warp/native/intersect.h +19 -4
  124. warp/native/intersect_adj.h +8 -8
  125. warp/native/mat.h +76 -17
  126. warp/native/mesh.cpp +33 -108
  127. warp/native/mesh.cu +114 -18
  128. warp/native/mesh.h +395 -40
  129. warp/native/noise.h +272 -329
  130. warp/native/quat.h +51 -8
  131. warp/native/rand.h +44 -34
  132. warp/native/reduce.cpp +1 -1
  133. warp/native/sparse.cpp +4 -4
  134. warp/native/sparse.cu +163 -155
  135. warp/native/spatial.h +2 -2
  136. warp/native/temp_buffer.h +18 -14
  137. warp/native/vec.h +103 -21
  138. warp/native/warp.cpp +2 -1
  139. warp/native/warp.cu +28 -3
  140. warp/native/warp.h +4 -3
  141. warp/render/render_opengl.py +261 -109
  142. warp/sim/__init__.py +1 -2
  143. warp/sim/articulation.py +385 -185
  144. warp/sim/import_mjcf.py +59 -48
  145. warp/sim/import_urdf.py +15 -15
  146. warp/sim/import_usd.py +174 -102
  147. warp/sim/inertia.py +17 -18
  148. warp/sim/integrator_xpbd.py +4 -3
  149. warp/sim/model.py +330 -250
  150. warp/sim/render.py +1 -1
  151. warp/sparse.py +625 -152
  152. warp/stubs.py +341 -309
  153. warp/tape.py +9 -6
  154. warp/tests/__main__.py +3 -6
  155. warp/tests/assets/curlnoise_golden.npy +0 -0
  156. warp/tests/assets/pnoise_golden.npy +0 -0
  157. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  158. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  159. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  160. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  161. warp/tests/aux_test_unresolved_func.py +14 -0
  162. warp/tests/aux_test_unresolved_symbol.py +14 -0
  163. warp/tests/disabled_kinematics.py +239 -0
  164. warp/tests/run_coverage_serial.py +31 -0
  165. warp/tests/test_adam.py +103 -106
  166. warp/tests/test_arithmetic.py +94 -74
  167. warp/tests/test_array.py +82 -101
  168. warp/tests/test_array_reduce.py +57 -23
  169. warp/tests/test_atomic.py +64 -28
  170. warp/tests/test_bool.py +22 -12
  171. warp/tests/test_builtins_resolution.py +1292 -0
  172. warp/tests/test_bvh.py +18 -18
  173. warp/tests/test_closest_point_edge_edge.py +54 -57
  174. warp/tests/test_codegen.py +165 -134
  175. warp/tests/test_compile_consts.py +28 -20
  176. warp/tests/test_conditional.py +108 -24
  177. warp/tests/test_copy.py +10 -12
  178. warp/tests/test_ctypes.py +112 -88
  179. warp/tests/test_dense.py +21 -14
  180. warp/tests/test_devices.py +98 -0
  181. warp/tests/test_dlpack.py +75 -75
  182. warp/tests/test_examples.py +237 -0
  183. warp/tests/test_fabricarray.py +22 -24
  184. warp/tests/test_fast_math.py +15 -11
  185. warp/tests/test_fem.py +1034 -124
  186. warp/tests/test_fp16.py +23 -16
  187. warp/tests/test_func.py +187 -86
  188. warp/tests/test_generics.py +194 -49
  189. warp/tests/test_grad.py +123 -181
  190. warp/tests/test_grad_customs.py +176 -0
  191. warp/tests/test_hash_grid.py +35 -34
  192. warp/tests/test_import.py +10 -23
  193. warp/tests/test_indexedarray.py +24 -25
  194. warp/tests/test_intersect.py +18 -9
  195. warp/tests/test_large.py +141 -0
  196. warp/tests/test_launch.py +14 -41
  197. warp/tests/test_lerp.py +64 -65
  198. warp/tests/test_lvalue.py +493 -0
  199. warp/tests/test_marching_cubes.py +12 -13
  200. warp/tests/test_mat.py +517 -2898
  201. warp/tests/test_mat_lite.py +115 -0
  202. warp/tests/test_mat_scalar_ops.py +2889 -0
  203. warp/tests/test_math.py +103 -9
  204. warp/tests/test_matmul.py +304 -69
  205. warp/tests/test_matmul_lite.py +410 -0
  206. warp/tests/test_mesh.py +60 -22
  207. warp/tests/test_mesh_query_aabb.py +21 -25
  208. warp/tests/test_mesh_query_point.py +111 -22
  209. warp/tests/test_mesh_query_ray.py +12 -24
  210. warp/tests/test_mlp.py +30 -22
  211. warp/tests/test_model.py +92 -89
  212. warp/tests/test_modules_lite.py +39 -0
  213. warp/tests/test_multigpu.py +88 -114
  214. warp/tests/test_noise.py +12 -11
  215. warp/tests/test_operators.py +16 -20
  216. warp/tests/test_options.py +11 -11
  217. warp/tests/test_pinned.py +17 -18
  218. warp/tests/test_print.py +32 -11
  219. warp/tests/test_quat.py +275 -129
  220. warp/tests/test_rand.py +18 -16
  221. warp/tests/test_reload.py +38 -34
  222. warp/tests/test_rounding.py +50 -43
  223. warp/tests/test_runlength_encode.py +168 -20
  224. warp/tests/test_smoothstep.py +9 -11
  225. warp/tests/test_snippet.py +143 -0
  226. warp/tests/test_sparse.py +261 -63
  227. warp/tests/test_spatial.py +276 -243
  228. warp/tests/test_streams.py +110 -85
  229. warp/tests/test_struct.py +268 -63
  230. warp/tests/test_tape.py +39 -21
  231. warp/tests/test_torch.py +90 -86
  232. warp/tests/test_transient_module.py +10 -12
  233. warp/tests/test_types.py +363 -0
  234. warp/tests/test_utils.py +451 -0
  235. warp/tests/test_vec.py +354 -2050
  236. warp/tests/test_vec_lite.py +73 -0
  237. warp/tests/test_vec_scalar_ops.py +2099 -0
  238. warp/tests/test_volume.py +418 -376
  239. warp/tests/test_volume_write.py +124 -134
  240. warp/tests/unittest_serial.py +35 -0
  241. warp/tests/unittest_suites.py +291 -0
  242. warp/tests/unittest_utils.py +342 -0
  243. warp/tests/{test_misc.py → unused_test_misc.py} +13 -5
  244. warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
  245. warp/thirdparty/appdirs.py +36 -45
  246. warp/thirdparty/unittest_parallel.py +589 -0
  247. warp/types.py +622 -211
  248. warp/utils.py +54 -393
  249. warp_lang-1.0.0b6.dist-info/METADATA +238 -0
  250. warp_lang-1.0.0b6.dist-info/RECORD +409 -0
  251. {warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/WHEEL +1 -1
  252. examples/example_cache_management.py +0 -40
  253. examples/example_multigpu.py +0 -54
  254. examples/example_struct.py +0 -65
  255. examples/fem/example_stokes_transfer_3d.py +0 -210
  256. warp/bin/warp-clang.so +0 -0
  257. warp/bin/warp.so +0 -0
  258. warp/fem/field/discrete_field.py +0 -80
  259. warp/fem/space/nodal_function_space.py +0 -233
  260. warp/tests/test_all.py +0 -223
  261. warp/tests/test_array_scan.py +0 -60
  262. warp/tests/test_base.py +0 -208
  263. warp/tests/test_unresolved_func.py +0 -7
  264. warp/tests/test_unresolved_symbol.py +0 -7
  265. warp_lang-1.0.0b2.dist-info/METADATA +0 -26
  266. warp_lang-1.0.0b2.dist-info/RECORD +0 -380
  267. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  268. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  269. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  270. {warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/LICENSE.md +0 -0
  271. {warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/top_level.txt +0 -0
warp/types.py CHANGED
@@ -5,9 +5,12 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
+from __future__ import annotations
+
 import builtins
 import ctypes
 import hashlib
+import inspect
 import struct
 import zlib
 from typing import Any, Callable, Generic, List, Tuple, TypeVar, Union
@@ -49,12 +52,14 @@ def constant(x):
     global _constant_hash
 
     # hash the constant value
-    if isinstance(x, int):
+    if isinstance(x, builtins.bool):
+        # This needs to come before the check for `int` since all boolean
+        # values are also instances of `int`.
+        _constant_hash.update(struct.pack("?", x))
+    elif isinstance(x, int):
         _constant_hash.update(struct.pack("<q", x))
     elif isinstance(x, float):
         _constant_hash.update(struct.pack("<d", x))
-    elif isinstance(x, builtins.bool):
-        _constant_hash.update(struct.pack("?", x))
     elif isinstance(x, float16):
         # float16 is a special case
        p = ctypes.pointer(ctypes.c_float(x.value))
@@ -155,17 +160,31 @@ def vector(length, dtype):
             else:
                 raise KeyError(f"Invalid key {key}, expected int or slice")
 
+        def __getattr__(self, name):
+            idx = "xyzw".find(name)
+            if idx != -1:
+                return self.__getitem__(idx)
+
+            return self.__getattribute__(name)
+
+        def __setattr__(self, name, value):
+            idx = "xyzw".find(name)
+            if idx != -1:
+                return self.__setitem__(idx, value)
+
+            return super().__setattr__(name, value)
+
         def __add__(self, y):
             return warp.add(self, y)
 
         def __radd__(self, y):
-            return warp.add(self, y)
+            return warp.add(y, self)
 
         def __sub__(self, y):
             return warp.sub(self, y)
 
-        def __rsub__(self, x):
-            return warp.sub(x, self)
+        def __rsub__(self, y):
+            return warp.sub(y, self)
 
         def __mul__(self, y):
             return warp.mul(self, y)
@@ -173,17 +192,17 @@ def vector(length, dtype):
         def __rmul__(self, x):
             return warp.mul(x, self)
 
-        def __div__(self, y):
+        def __truediv__(self, y):
             return warp.div(self, y)
 
-        def __rdiv__(self, x):
+        def __rtruediv__(self, x):
             return warp.div(x, self)
 
-        def __pos__(self, y):
-            return warp.pos(self, y)
+        def __pos__(self):
+            return warp.pos(self)
 
-        def __neg__(self, y):
-            return warp.neg(self, y)
+        def __neg__(self):
+            return warp.neg(self)
 
         def __str__(self):
             return f"[{', '.join(map(str, self))}]"
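Note: the hunk above adds attribute-style component access and fixes several operator dunders (the reflected `__radd__`/`__rsub__` argument order, the Python 3 `__truediv__`/`__rtruediv__` names, and the arity of the unary `__pos__`/`__neg__`). A minimal sketch of the resulting behavior, assuming the usual `wp.vec3` constructor:

    import warp as wp

    wp.init()

    v = wp.vec3(1.0, 2.0, 3.0)
    print(v.x, v.y, v.z)  # attribute access now routed through the "xyzw" lookup
    v.y = 5.0             # __setattr__ forwards component writes to __setitem__
    half = v / 2.0        # __truediv__ replaces the Python 2-era __div__
    flipped = -v          # __neg__ is now a proper unary operator
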
@@ -275,13 +294,13 @@ def matrix(shape, dtype):
             return warp.add(self, y)
 
         def __radd__(self, y):
-            return warp.add(self, y)
+            return warp.add(y, self)
 
         def __sub__(self, y):
             return warp.sub(self, y)
 
-        def __rsub__(self, x):
-            return warp.sub(x, self)
+        def __rsub__(self, y):
+            return warp.sub(y, self)
 
         def __mul__(self, y):
             return warp.mul(self, y)
@@ -295,17 +314,17 @@ def matrix(shape, dtype):
         def __rmatmul__(self, x):
             return warp.mul(x, self)
 
-        def __div__(self, y):
+        def __truediv__(self, y):
             return warp.div(self, y)
 
-        def __rdiv__(self, x):
+        def __rtruediv__(self, x):
             return warp.div(x, self)
 
-        def __pos__(self, y):
-            return warp.pos(self, y)
+        def __pos__(self):
+            return warp.pos(self)
 
-        def __neg__(self, y):
-            return warp.neg(self, y)
+        def __neg__(self):
+            return warp.neg(self)
 
         def __str__(self):
             row_str = []
@@ -511,23 +530,63 @@ class quatd(quaternion(dtype=float64)):
 
 def transformation(dtype=Any):
     class transform_t(vector(length=7, dtype=dtype)):
+        _wp_init_from_components_sig_ = inspect.Signature(
+            (
+                inspect.Parameter(
+                    "p",
+                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                    default=(0.0, 0.0, 0.0),
+                ),
+                inspect.Parameter(
+                    "q",
+                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                    default=(0.0, 0.0, 0.0, 1.0),
+                ),
+            ),
+        )
         _wp_type_params_ = [dtype]
         _wp_generic_type_str_ = "transform_t"
         _wp_constructor_ = "transformation"
 
-        def __init__(self, p=(0.0, 0.0, 0.0), q=(0.0, 0.0, 0.0, 1.0)):
-            super().__init__()
+        def __init__(self, *args, **kwargs):
+            if len(args) == 1 and len(kwargs) == 0:
+                if getattr(args[0], "_wp_generic_type_str_") == self._wp_generic_type_str_:
+                    # Copy constructor.
+                    super().__init__(*args[0])
+                    return
 
-            self[0:3] = vector(length=3, dtype=dtype)(*p)
-            self[3:7] = quaternion(dtype=dtype)(*q)
+            try:
+                # For backward compatibility, try to check if the arguments
+                # match the original signature that'd allow initializing
+                # the `p` and `q` components separately.
+                bound_args = self._wp_init_from_components_sig_.bind(*args, **kwargs)
+                bound_args.apply_defaults()
+                p, q = bound_args.args
+            except (TypeError, ValueError):
+                # Fallback to the vector's constructor.
+                super().__init__(*args)
+                return
+
+            # Even if the arguments match the original “from components”
+            # signature, we still need to make sure that they represent
+            # sequences that can be unpacked.
+            if hasattr(p, "__len__") and hasattr(q, "__len__"):
+                # Initialize from the `p` and `q` components.
+                super().__init__()
+                self[0:3] = vector(length=3, dtype=dtype)(*p)
+                self[3:7] = quaternion(dtype=dtype)(*q)
+                return
+
+            # Fallback to the vector's constructor.
+            super().__init__(*args)
 
         @property
         def p(self):
-            return self[0:3]
+            return vec3(self[0:3])
 
         @property
         def q(self):
-            return self[3:7]
+            return quat(self[3:7])
 
     return transform_t
 
592
 
@@ -851,18 +910,21 @@ class range_t:
851
910
 
852
911
  # definition just for kernel type (cannot be a parameter), see bvh.h
853
912
  class bvh_query_t:
913
+ """Object used to track state during BVH traversal."""
854
914
  def __init__(self):
855
915
  pass
856
916
 
857
917
 
858
918
  # definition just for kernel type (cannot be a parameter), see mesh.h
859
919
  class mesh_query_aabb_t:
920
+ """Object used to track state during mesh traversal."""
860
921
  def __init__(self):
861
922
  pass
862
923
 
863
924
 
864
925
  # definition just for kernel type (cannot be a parameter), see hash_grid.h
865
926
  class hash_grid_query_t:
927
+ """Object used to track state during neighbor traversal."""
866
928
  def __init__(self):
867
929
  pass
868
930
 
@@ -999,7 +1061,7 @@ def type_scalar_type(dtype):
999
1061
  def type_size_in_bytes(dtype):
1000
1062
  if dtype.__module__ == "ctypes":
1001
1063
  return ctypes.sizeof(dtype)
1002
- elif type_is_struct(dtype):
1064
+ elif isinstance(dtype, warp.codegen.Struct):
1003
1065
  return ctypes.sizeof(dtype.ctype)
1004
1066
  elif dtype == float or dtype == int:
1005
1067
  return 4
@@ -1020,8 +1082,6 @@ def type_to_warp(dtype):
1020
1082
 
1021
1083
 
1022
1084
  def type_typestr(dtype):
1023
- from warp.codegen import Struct
1024
-
1025
1085
  if dtype == bool:
1026
1086
  return "?"
1027
1087
  elif dtype == float16:
@@ -1046,7 +1106,7 @@ def type_typestr(dtype):
1046
1106
  return "<i8"
1047
1107
  elif dtype == uint64:
1048
1108
  return "<u8"
1049
- elif isinstance(dtype, Struct):
1109
+ elif isinstance(dtype, warp.codegen.Struct):
1050
1110
  return f"|V{ctypes.sizeof(dtype.ctype)}"
1051
1111
  elif issubclass(dtype, ctypes.Array):
1052
1112
  return type_typestr(dtype._wp_scalar_type_)
@@ -1060,9 +1120,16 @@ def type_repr(t):
1060
1120
  return str(f"array(ndim={t.ndim}, dtype={t.dtype})")
1061
1121
  if type_is_vector(t):
1062
1122
  return str(f"vector(length={t._shape_[0]}, dtype={t._wp_scalar_type_})")
1063
- elif type_is_matrix(t):
1123
+ if type_is_matrix(t):
1064
1124
  return str(f"matrix(shape=({t._shape_[0]}, {t._shape_[1]}), dtype={t._wp_scalar_type_})")
1065
- else:
1125
+ if isinstance(t, warp.codegen.Struct):
1126
+ return type_repr(t.cls)
1127
+ if t in scalar_types:
1128
+ return t.__name__
1129
+
1130
+ try:
1131
+ return t.__module__ + "." + t.__qualname__
1132
+ except AttributeError:
1066
1133
  return str(t)
1067
1134
 
1068
1135
 
@@ -1080,15 +1147,6 @@ def type_is_float(t):
1080
1147
  return t in float_types
1081
1148
 
1082
1149
 
1083
- def type_is_struct(dtype):
1084
- from warp.codegen import Struct
1085
-
1086
- if isinstance(dtype, Struct):
1087
- return True
1088
- else:
1089
- return False
1090
-
1091
-
1092
1150
  # returns True if the passed *type* is a vector
1093
1151
  def type_is_vector(t):
1094
1152
  if hasattr(t, "_wp_generic_type_str_") and t._wp_generic_type_str_ == "vec_t":
@@ -1162,6 +1220,17 @@ def types_equal(a, b, match_generic=False):
1162
1220
  if p1 == Float and p2 == Float:
1163
1221
  return True
1164
1222
 
1223
+ # convert to canonical types
1224
+ if p1 == float:
1225
+ p1 = float32
1226
+ elif p1 == int:
1227
+ p1 = int32
1228
+
1229
+ if p2 == float:
1230
+ p2 = float32
1231
+ elif b == int:
1232
+ p2 = int32
1233
+
1165
1234
  if p1 in compatible_bool_types and p2 in compatible_bool_types:
1166
1235
  return True
1167
1236
  else:
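Note: the canonicalization above lets Python's `float` and `int` compare equal to Warp's 32-bit scalars when matching type parameters. As written, the second branch tests `b == int` (the whole right-hand type) rather than `p2 == int`, so the `int` canonicalization of the right-hand parameter only fires when the outer argument is itself `int`. A sketch of the intended effect:

    from warp.types import types_equal, float32, int32

    # Python scalars now match their canonical 32-bit Warp counterparts
    assert types_equal(float, float32)
    assert types_equal(int, int32)
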
@@ -1173,7 +1242,7 @@ def types_equal(a, b, match_generic=False):
         and a._wp_generic_type_str_ == b._wp_generic_type_str_
     ):
         return all([are_equal(p1, p2) for p1, p2 in zip(a._wp_type_params_, b._wp_type_params_)])
-    if is_array(a) and type(a) == type(b):
+    if is_array(a) and type(a) is type(b):
         return True
     else:
         return are_equal(a, b)
@@ -1257,6 +1326,7 @@ class array(Array):
         self._grad = None
         # __array_interface__ or __cuda_array_interface__, evaluated lazily and cached
         self._array_interface = None
+        self.is_transposed = False
 
         # canonicalize dtype
         if dtype == int:
@@ -1801,6 +1871,7 @@ class array(Array):
         return array._vars
 
     def zero_(self):
+        """Zeroes-out the array entires."""
         if self.is_contiguous:
             # simple memset is usually faster than generic fill
             self.device.memset(self.ptr, 0, self.size * type_size_in_bytes(self.dtype))
@@ -1808,6 +1879,32 @@ class array(Array):
             self.fill_(0)
 
     def fill_(self, value):
+        """Set all array entries to `value`
+
+        args:
+            value: The value to set every array entry to. Must be convertible to the array's ``dtype``.
+
+        Raises:
+            ValueError: If `value` cannot be converted to the array's ``dtype``.
+
+        Examples:
+            ``fill_()`` can take lists or other sequences when filling arrays of vectors or matrices.
+
+            >>> arr = wp.zeros(2, dtype=wp.mat22)
+            >>> arr.numpy()
+            array([[[0., 0.],
+                    [0., 0.]],
+            <BLANKLINE>
+                   [[0., 0.],
+                    [0., 0.]]], dtype=float32)
+            >>> arr.fill_([[1, 2], [3, 4]])
+            >>> arr.numpy()
+            array([[[1., 2.],
+                    [3., 4.]],
+            <BLANKLINE>
+                   [[1., 2.],
+                    [3., 4.]]], dtype=float32)
+        """
         if self.size == 0:
             return
 
@@ -1854,15 +1951,18 @@ class array(Array):
         else:
             warp.context.runtime.core.array_fill_host(carr_ptr, ARRAY_TYPE_REGULAR, cvalue_ptr, cvalue_size)
 
-    # equivalent to wrapping src data in an array and copying to self
     def assign(self, src):
+        """Wraps ``src`` in an :class:`warp.array` if it is not already one and copies the contents to ``self``."""
         if is_array(src):
             warp.copy(self, src)
         else:
             warp.copy(self, array(data=src, dtype=self.dtype, copy=False, device="cpu"))
 
-    # convert array to ndarray (alias memory through array interface)
     def numpy(self):
+        """Converts the array to a :class:`numpy.ndarray` (aliasing memory through the array interface protocol)
+        If the array is on the GPU, a synchronous device-to-host copy (on the CUDA default stream) will be
+        automatically performed to ensure that any outstanding work is completed.
+        """
         if self.ptr:
             # use the CUDA default stream for synchronous behaviour with other streams
             with warp.ScopedStream(self.device.null_stream):
@@ -1883,12 +1983,16 @@ class array(Array):
             npshape = self.shape
         return np.empty(npshape, dtype=npdtype)
 
-    # return a ctypes cast of the array address
-    # note #1: only CPU arrays support this method
-    # note #2: the array must be contiguous
-    # note #3: accesses to this object are *not* bounds checked
-    # note #4: for float16 types, a pointer to the internal uint16 representation is returned
     def cptr(self):
+        """Return a ctypes cast of the array address.
+
+        Notes:
+
+        #. Only CPU arrays support this method.
+        #. The array must be contiguous.
+        #. Accesses to this object are **not** bounds checked.
+        #. For ``float16`` types, a pointer to the internal ``uint16`` representation is returned.
+        """
         if not self.ptr:
             return None
 
@@ -1907,8 +2011,8 @@ class array(Array):
 
         return p
 
-    # returns a flattened list of items in the array as a Python list
     def list(self):
+        """Returns a flattened list of items in the array as a Python list."""
         a = self.numpy()
 
         if isinstance(self.dtype, warp.codegen.Struct):
@@ -1927,8 +2031,8 @@ class array(Array):
             # scalar
             return list(a.flatten())
 
-    # convert data from one device to another, nop if already on device
     def to(self, device):
+        """Returns a Warp array with this array's data moved to the specified device, no-op if already on device."""
         device = warp.get_device(device)
         if self.device == device:
             return self
@@ -1936,6 +2040,7 @@ class array(Array):
             return warp.clone(self, device=device)
 
     def flatten(self):
+        """Returns a zero-copy view of the array collapsed to 1-D. Only supported for contiguous arrays."""
         if self.ndim == 1:
             return self
 
@@ -1958,6 +2063,11 @@ class array(Array):
         return a
 
     def reshape(self, shape):
+        """Returns a reshaped array. Only supported for contiguous arrays.
+
+        Args:
+            shape : An int or tuple of ints specifying the shape of the returned array.
+        """
         if not self.is_contiguous:
             raise RuntimeError("Reshaping non-contiguous arrays is unsupported.")
 
@@ -2015,6 +2125,9 @@ class array(Array):
         return a
 
     def view(self, dtype):
+        """Returns a zero-copy view of this array's memory with a different data type.
+        ``dtype`` must have the same byte size of the array's native ``dtype``.
+        """
         if type_size_in_bytes(dtype) != type_size_in_bytes(self.dtype):
             raise RuntimeError("Cannot cast dtypes of unequal byte size")
 
@@ -2035,6 +2148,7 @@ class array(Array):
         return a
 
     def contiguous(self):
+        """Returns a contiguous array with this array's data. No-op if array is already contiguous."""
         if self.is_contiguous:
             return self
 
@@ -2042,8 +2156,14 @@ class array(Array):
         warp.copy(a, self)
         return a
 
-    # note: transpose operation will return an array with a non-contiguous access pattern
     def transpose(self, axes=None):
+        """Returns an zero-copy view of the array with axes transposed.
+
+        Note: The transpose operation will return an array with a non-contiguous access pattern.
+
+        Args:
+            axes (optional): Specifies the how the axes are permuted. If not specified, the axes order will be reversed.
+        """
         # noop if 1d array
         if self.ndim == 1:
             return self
@@ -2076,6 +2196,8 @@ class array(Array):
             grad=None if self.grad is None else self.grad.transpose(axes=axes),
         )
 
+        a.is_transposed = not self.is_transposed
+
         a._ref = self
         return a
 
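Note: with `is_transposed` now tracked on the view returned by `transpose()`, a transposed matrix can feed `wp.matmul` without materializing a contiguous copy (see the matmul changes further down). A sketch with illustrative shapes:

    import warp as wp
    import numpy as np

    wp.init()

    a = wp.array(np.ones((4, 3), dtype=np.float32))   # 4x3
    at = a.transpose()                                # 3x4 zero-copy view, is_transposed == True
    b = wp.zeros((4, 2), dtype=wp.float32)
    c = wp.zeros((3, 2), dtype=wp.float32)
    d = wp.zeros((3, 2), dtype=wp.float32)
    wp.matmul(at, b, c, d)                            # D = A^T @ B without copying A
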
@@ -2516,16 +2638,14 @@ class Mesh:
 
 
 class Volume:
+    #: Enum value to specify nearest-neighbor interpolation during sampling
     CLOSEST = constant(0)
+    #: Enum value to specify trilinear interpolation during sampling
     LINEAR = constant(1)
 
     def __init__(self, data: array):
         """Class representing a sparse grid.
 
-        Attributes:
-            CLOSEST (int): Enum value to specify nearest-neighbor interpolation during sampling
-            LINEAR (int): Enum value to specify trilinear interpolation during sampling
-
         Args:
             data (:class:`warp.array`): Array of bytes representing the volume in NanoVDB format
         """
@@ -2570,7 +2690,8 @@ class Volume:
         except Exception:
             pass
 
-    def array(self):
+    def array(self) -> array:
+        """Returns the raw memory buffer of the Volume as an array"""
         buf = ctypes.c_void_p(0)
         size = ctypes.c_uint64(0)
         if self.device.is_cpu:
@@ -2579,7 +2700,7 @@ class Volume:
             self.context.core.volume_get_buffer_info_device(self.id, ctypes.byref(buf), ctypes.byref(size))
         return array(ptr=buf.value, dtype=uint8, shape=size.value, device=self.device, owner=False)
 
-    def get_tiles(self):
+    def get_tiles(self) -> array:
         if self.id == 0:
             raise RuntimeError("Invalid Volume")
 
@@ -2592,7 +2713,7 @@ class Volume:
         num_tiles = size.value // (3 * 4)
         return array(ptr=buf.value, dtype=int32, shape=(num_tiles, 3), device=self.device, owner=True)
 
-    def get_voxel_size(self):
+    def get_voxel_size(self) -> Tuple[float, float, float]:
         if self.id == 0:
             raise RuntimeError("Invalid Volume")
 
@@ -2601,7 +2722,7 @@ class Volume:
         return (dx.value, dy.value, dz.value)
 
     @classmethod
-    def load_from_nvdb(cls, file_or_buffer, device=None):
+    def load_from_nvdb(cls, file_or_buffer, device=None) -> Volume:
         """Creates a Volume object from a NanoVDB file or in-memory buffer.
 
         Returns:
@@ -2637,14 +2758,18 @@ class Volume:
         return cls(data_array)
 
     @classmethod
-    def load_from_numpy(cls, ndarray: np.array, min_world=(0.0, 0.0, 0.0), voxel_size=1.0, bg_value=0.0, device=None):
+    def load_from_numpy(
+        cls, ndarray: np.array, min_world=(0.0, 0.0, 0.0), voxel_size=1.0, bg_value=0.0, device=None
+    ) -> Volume:
         """Creates a Volume object from a dense 3D NumPy array.
 
+        This function is only supported for CUDA devices.
+
         Args:
-            min_world: The 3D coordinate of the lower corner of the volume
-            voxel_size: The size of each voxel in spatial coordinates
+            min_world: The 3D coordinate of the lower corner of the volume.
+            voxel_size: The size of each voxel in spatial coordinates.
             bg_value: Background value
-            device: The device to create the volume on, e.g.: "cpu", or "cuda:0"
+            device: The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".
 
         Returns:
 
@@ -2699,7 +2824,7 @@ class Volume:
                 inputs=[volume.id, warp.array(padded_array, dtype=warp.vec3, device=device)],
                 device=device,
             )
-        elif type(bg_value) == int:
+        elif isinstance(bg_value, int):
             warp.launch(
                 warp.utils.copy_dense_volume_to_nano_vdb_i,
                 dim=shape,
@@ -2726,9 +2851,11 @@ class Volume:
         translation=(0.0, 0.0, 0.0),
         points_in_world_space=False,
         device=None,
-    ):
+    ) -> Volume:
         """Allocate a new Volume based on the bounding box defined by min and max.
 
+        This function is only supported for CUDA devices.
+
         Allocate a volume that is large enough to contain voxels [min[0], min[1], min[2]] - [max[0], max[1], max[2]], inclusive.
         If points_in_world_space is true, then min and max are first converted to index space with the given voxel size and
         translation, and the volume is allocated with those.
@@ -2737,12 +2864,12 @@ class Volume:
         the resulting tiles will be available in the new volume.
 
         Args:
-            min (array-like): Lower 3D-coordinates of the bounding box in index space or world space, inclusive
-            max (array-like): Upper 3D-coordinates of the bounding box in index space or world space, inclusive
-            voxel_size (float): Voxel size of the new volume
+            min (array-like): Lower 3D coordinates of the bounding box in index space or world space, inclusive.
+            max (array-like): Upper 3D coordinates of the bounding box in index space or world space, inclusive.
+            voxel_size (float): Voxel size of the new volume.
             bg_value (float or array-like): Value of unallocated voxels of the volume, also defines the volume's type, a :class:`warp.vec3` volume is created if this is `array-like`, otherwise a float volume is created
-            translation (array-like): translation between the index and world spaces
-            device (Devicelike): Device the array lives on
+            translation (array-like): translation between the index and world spaces.
+            device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".
 
         """
         if points_in_world_space:
@@ -2767,9 +2894,11 @@ class Volume:
     @classmethod
     def allocate_by_tiles(
         cls, tile_points: array, voxel_size: float, bg_value=0.0, translation=(0.0, 0.0, 0.0), device=None
-    ):
+    ) -> Volume:
         """Allocate a new Volume with active tiles for each point tile_points.
 
+        This function is only supported for CUDA devices.
+
         The smallest unit of allocation is a dense tile of 8x8x8 voxels.
         This is the primary method for allocating sparse volumes. It uses an array of points indicating the tiles that must be allocated.
 
@@ -2779,13 +2908,13 @@ class Volume:
         Args:
             tile_points (:class:`warp.array`): Array of positions that define the tiles to be allocated.
-                The array can be a 2d, N-by-3 array of :class:`warp.int32` values, indicating index space positions,
+                The array can be a 2D, N-by-3 array of :class:`warp.int32` values, indicating index space positions,
                 or can be a 1D array of :class:`warp.vec3` values, indicating world space positions.
                 Repeated points per tile are allowed and will be efficiently deduplicated.
-            voxel_size (float): Voxel size of the new volume
+            voxel_size (float): Voxel size of the new volume.
             bg_value (float or array-like): Value of unallocated voxels of the volume, also defines the volume's type, a :class:`warp.vec3` volume is created if this is `array-like`, otherwise a float volume is created
-            translation (array-like): translation between the index and world spaces
-            device (Devicelike): Device the array lives on
+            translation (array-like): Translation between the index and world spaces.
+            device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".
 
         """
         from warp.context import runtime
@@ -2822,7 +2951,7 @@ class Volume:
             translation[2],
             in_world_space,
         )
-        elif type(bg_value) == int:
+        elif isinstance(bg_value, int):
             volume.id = volume.context.core.volume_i_from_tiles_device(
                 volume.device.context,
                 ctypes.c_void_p(tile_points.ptr),
@@ -2853,6 +2982,67 @@ class Volume:
         return volume
 
 
+# definition just for kernel type (cannot be a parameter), see mesh.h
+# NOTE: its layout must match the corresponding struct defined in C.
+# NOTE: it needs to be defined after `indexedarray` to workaround a circular import issue.
+class mesh_query_point_t:
+    """Output for the mesh query point functions.
+
+    Attributes:
+        result (bool): Whether a point is found within the given constraints.
+        sign (float32): A value < 0 if query point is inside the mesh, >=0 otherwise.
+                        Note that mesh must be watertight for this to be robust
+        face (int32): Index of the closest face.
+        u (float32): Barycentric u coordinate of the closest point.
+        v (float32): Barycentric v coordinate of the closest point.
+
+    See Also:
+        :func:`mesh_query_point`, :func:`mesh_query_point_no_sign`,
+        :func:`mesh_query_furthest_point_no_sign`,
+        :func:`mesh_query_point_sign_normal`,
+        and :func:`mesh_query_point_sign_winding_number`.
+    """
+    from warp.codegen import Var
+
+    vars = {
+        "result": Var("result", bool),
+        "sign": Var("sign", float32),
+        "face": Var("face", int32),
+        "u": Var("u", float32),
+        "v": Var("v", float32),
+    }
+
+
+# definition just for kernel type (cannot be a parameter), see mesh.h
+# NOTE: its layout must match the corresponding struct defined in C.
+class mesh_query_ray_t:
+    """Output for the mesh query ray functions.
+
+    Attributes:
+        result (bool): Whether a hit is found within the given constraints.
+        sign (float32): A value > 0 if the ray hit in front of the face, returns < 0 otherwise.
+        face (int32): Index of the closest face.
+        t (float32): Distance of the closest hit along the ray.
+        u (float32): Barycentric u coordinate of the closest hit.
+        v (float32): Barycentric v coordinate of the closest hit.
+        normal (vec3f): Face normal.
+
+    See Also:
+        :func:`mesh_query_ray`.
+    """
+    from warp.codegen import Var
+
+    vars = {
+        "result": Var("result", bool),
+        "sign": Var("sign", float32),
+        "face": Var("face", int32),
+        "t": Var("t", float32),
+        "u": Var("u", float32),
+        "v": Var("v", float32),
+        "normal": Var("normal", vec3),
+    }
+
+
 def matmul(
     a: array2d,
     b: array2d,
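Note: these two classes mirror, on the Python side, the C structs returned by the new mesh query builtins (registered as "mqp"/"mqr" in simple_type_codes at the bottom of this diff). A hedged kernel sketch, assuming the struct-returning `wp.mesh_query_point` overload shipped alongside them:

    import warp as wp

    @wp.kernel
    def project_onto_mesh(mesh: wp.uint64, points: wp.array(dtype=wp.vec3), out: wp.array(dtype=wp.vec3)):
        tid = wp.tid()
        q = wp.mesh_query_point(mesh, points[tid], 1.0e6)  # yields a mesh_query_point_t
        if q.result:
            out[tid] = wp.mesh_eval_position(mesh, q.face, q.u, q.v)
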
@@ -2889,6 +3079,11 @@ def matmul(
             "wp.matmul currently only supports operation between {A, B, C, D} matrices of the same type."
         )
 
+    if (not a.is_contiguous and not a.is_transposed) or (not b.is_contiguous and not b.is_transposed) or (not c.is_contiguous) or (not d.is_contiguous):
+        raise RuntimeError(
+            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B may be transposed."
+        )
+
     m = a.shape[0]
     n = b.shape[1]
     k = a.shape[1]
@@ -2923,13 +3118,13 @@ def matmul(
         ctypes.c_void_p(d.ptr),
         alpha,
         beta,
-        True,
-        True,
+        not a.is_transposed,
+        not b.is_transposed,
         allow_tf32x3_arith,
         1,
     )
     if not ret:
-        raise RuntimeError("Matmul failed.")
+        raise RuntimeError("matmul failed.")
 
 
 def adj_matmul(
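Note: the new guard surfaces unsupported layouts eagerly instead of passing bad strides to CUTLASS — transposed views of A/B pass (handled via the flipped row-major flags above), while any other non-contiguous input is rejected. A sketch of the rejected case:

    import warp as wp
    import numpy as np

    wp.init()

    a = wp.array(np.zeros((4, 6), dtype=np.float32))
    sub = a[:, 0:3]                  # non-contiguous column slice, not a transpose
    b = wp.zeros((3, 2), dtype=wp.float32)
    c = wp.zeros((4, 2), dtype=wp.float32)
    d = wp.zeros((4, 2), dtype=wp.float32)
    try:
        wp.matmul(sub, b, c, d)
    except RuntimeError as err:
        print(err)                   # contiguity error raised before any GEMM launch
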
@@ -2993,6 +3188,19 @@ def adj_matmul(
             "wp.adj_matmul currently only supports operation between {A, B, C, adj_D, adj_A, adj_B, adj_C} matrices of the same type."
         )
 
+    if (
+        (not a.is_contiguous and not a.is_transposed)
+        or (not b.is_contiguous and not b.is_transposed)
+        or (not c.is_contiguous)
+        or (not adj_a.is_contiguous and not adj_a.is_transposed)
+        or (not adj_b.is_contiguous and not adj_b.is_transposed)
+        or (not adj_c.is_contiguous)
+        or (not adj_d.is_contiguous)
+    ):
+        raise RuntimeError(
+            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B and their associated adjoints may be transposed."
+        )
+
     m = a.shape[0]
     n = b.shape[1]
     k = a.shape[1]
@@ -3013,75 +3221,105 @@ def adj_matmul(
 
     # cpu fallback if no cuda devices found
     if device == "cpu":
-        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose()))
-        adj_b.assign(alpha * (a.numpy().transpose() @ adj_d.numpy()))
-        adj_c.assign(beta * adj_d.numpy())
+        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose()) + adj_a.numpy())
+        adj_b.assign(alpha * (a.numpy().transpose() @ adj_d.numpy()) + adj_b.numpy())
+        adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
         return
 
     cc = device.arch
 
     # adj_a
-    ret = runtime.core.cutlass_gemm(
-        cc,
-        m,
-        k,
-        n,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(adj_d.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(adj_a.ptr),
-        alpha,
-        0.0,
-        True,
-        False,
-        allow_tf32x3_arith,
-        1,
-    )
-    if not ret:
-        raise RuntimeError("adj_matmul failed.")
+    if not a.is_transposed:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            m,
+            k,
+            n,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(adj_d.ptr),
+            ctypes.c_void_p(b.ptr),
+            ctypes.c_void_p(adj_a.ptr),
+            ctypes.c_void_p(adj_a.ptr),
+            alpha,
+            1.0,
+            True,
+            b.is_transposed,
+            allow_tf32x3_arith,
+            1,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
+    else:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            k,
+            m,
+            n,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(b.ptr),
+            ctypes.c_void_p(adj_d.ptr),
+            ctypes.c_void_p(adj_a.ptr),
+            ctypes.c_void_p(adj_a.ptr),
+            alpha,
+            1.0,
+            not b.is_transposed,
+            False,
+            allow_tf32x3_arith,
+            1,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
 
     # adj_b
-    ret = runtime.core.cutlass_gemm(
-        cc,
-        k,
-        n,
-        m,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(adj_d.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(adj_b.ptr),
-        alpha,
-        0.0,
-        False,
-        True,
-        allow_tf32x3_arith,
-        1,
-    )
-    if not ret:
-        raise RuntimeError("adj_matmul failed.")
+    if not b.is_transposed:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            k,
+            n,
+            m,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(a.ptr),
+            ctypes.c_void_p(adj_d.ptr),
+            ctypes.c_void_p(adj_b.ptr),
+            ctypes.c_void_p(adj_b.ptr),
+            alpha,
+            1.0,
+            a.is_transposed,
+            True,
+            allow_tf32x3_arith,
+            1,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
+    else:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            n,
+            k,
+            m,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(adj_d.ptr),
+            ctypes.c_void_p(a.ptr),
+            ctypes.c_void_p(adj_b.ptr),
+            ctypes.c_void_p(adj_b.ptr),
+            alpha,
+            1.0,
+            False,
+            not a.is_transposed,
+            allow_tf32x3_arith,
+            1,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
 
     # adj_c
-    ret = runtime.core.cutlass_gemm(
-        cc,
-        m,
-        n,
-        k,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(adj_d.ptr),
-        ctypes.c_void_p(adj_c.ptr),
-        0.0,
-        beta,
-        True,
-        True,
-        allow_tf32x3_arith,
-        1,
+    warp.launch(
+        kernel=warp.utils.add_kernel_2d,
+        dim=adj_c.shape,
+        inputs=[adj_c, adj_d, adj_d.dtype(beta)],
+        device=device,
+        record_tape=False
     )
-    if not ret:
-        raise RuntimeError("adj_matmul failed.")
 
 
 def batched_matmul(
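Note: two behavioral fixes are visible in the hunk above — the CPU fallback now accumulates into the existing adjoints instead of overwriting them (matching the `beta=1.0` passed to the CUTLASS calls, which also now write into `adj_a`/`adj_b` rather than into `a`/`b`), and `adj_c` is updated with a simple add kernel instead of a full GEMM. In NumPy terms, the backward pass for D = alpha*A@B + beta*C is:

    import numpy as np

    def adj_matmul_reference(a, b, adj_d, adj_a, adj_b, adj_c, alpha=1.0, beta=0.0):
        # accumulate-into semantics, as in the updated code paths above
        adj_a += alpha * (adj_d @ b.T)
        adj_b += alpha * (a.T @ adj_d)
        adj_c += beta * adj_d
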
@@ -3120,6 +3358,11 @@ def batched_matmul(
             "wp.batched_matmul currently only supports operation between {A, B, C, D} matrices of the same type."
         )
 
+    if (not a.is_contiguous and not a.is_transposed) or (not b.is_contiguous and not b.is_transposed) or (not c.is_contiguous) or (not d.is_contiguous):
+        raise RuntimeError(
+            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B may be transposed."
+        )
+
     m = a.shape[1]
     n = b.shape[2]
     k = a.shape[2]
@@ -3131,7 +3374,7 @@ def batched_matmul(
 
     if runtime.tape:
         runtime.tape.record_func(
-            backward=lambda: adj_matmul(
+            backward=lambda: adj_batched_matmul(
                 a, b, c, a.grad, b.grad, c.grad, d.grad, alpha, beta, allow_tf32x3_arith, device
             ),
             arrays=[a, b, c, d],
@@ -3142,26 +3385,55 @@ def batched_matmul(
         d.assign(alpha * np.matmul(a.numpy(), b.numpy()) + beta * c.numpy())
         return
 
+    # handle case in which batch_count exceeds max_batch_count, which is a CUDA array size maximum
+    max_batch_count = 65535
+    iters = int(batch_count / max_batch_count)
+    remainder = batch_count % max_batch_count
+
     cc = device.arch
+    for i in range(iters):
+        idx_start = i * max_batch_count
+        idx_end = (i + 1) * max_batch_count if i < iters - 1 else batch_count
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            m,
+            n,
+            k,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
+            ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
+            ctypes.c_void_p(c[idx_start:idx_end,:,:].ptr),
+            ctypes.c_void_p(d[idx_start:idx_end,:,:].ptr),
+            alpha,
+            beta,
+            not a.is_transposed,
+            not b.is_transposed,
+            allow_tf32x3_arith,
+            max_batch_count,
+        )
+        if not ret:
+            raise RuntimeError("Batched matmul failed.")
+
+    idx_start = iters * max_batch_count
     ret = runtime.core.cutlass_gemm(
         cc,
         m,
         n,
         k,
         type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(c.ptr),
-        ctypes.c_void_p(d.ptr),
+        ctypes.c_void_p(a[idx_start:,:,:].ptr),
+        ctypes.c_void_p(b[idx_start:,:,:].ptr),
+        ctypes.c_void_p(c[idx_start:,:,:].ptr),
+        ctypes.c_void_p(d[idx_start:,:,:].ptr),
         alpha,
         beta,
-        True,
-        True,
+        not a.is_transposed,
+        not b.is_transposed,
        allow_tf32x3_arith,
-        batch_count,
+        remainder,
     )
     if not ret:
-        raise RuntimeError("Batched matmul failed.")
+        raise RuntimeError("Batched matmul failed.")
 
 
 def adj_batched_matmul(
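Note: per the in-code comment, the CUTLASS batched GEMM entry point caps the batch dimension at 65535, so `batched_matmul` (and `adj_batched_matmul` below) now split oversized batches into full chunks plus a remainder launch. The index arithmetic, isolated as a sketch:

    batch_count = 200000
    max_batch_count = 65535

    iters = batch_count // max_batch_count     # 3 full chunks
    remainder = batch_count % max_batch_count  # 3395 trailing matrices

    for i in range(iters):
        idx_start = i * max_batch_count
        # GEMM over the slice [idx_start : idx_start + max_batch_count]
    # final GEMM over [iters * max_batch_count :] with batch size `remainder`
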
@@ -3241,78 +3513,215 @@ def adj_batched_matmul(
         )
     )
 
+    if (
+        (not a.is_contiguous and not a.is_transposed)
+        or (not b.is_contiguous and not b.is_transposed)
+        or (not c.is_contiguous)
+        or (not adj_a.is_contiguous and not adj_a.is_transposed)
+        or (not adj_b.is_contiguous and not adj_b.is_transposed)
+        or (not adj_c.is_contiguous)
+        or (not adj_d.is_contiguous)
+    ):
+        raise RuntimeError(
+            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B and their associated adjoints may be transposed."
+        )
+
     # cpu fallback if no cuda devices found
     if device == "cpu":
-        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose((0, 2, 1))))
-        adj_b.assign(alpha * np.matmul(a.numpy().transpose((0, 2, 1)), adj_d.numpy()))
-        adj_c.assign(beta * adj_d.numpy())
+        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose((0, 2, 1))) + adj_a.numpy())
+        adj_b.assign(alpha * np.matmul(a.numpy().transpose((0, 2, 1)), adj_d.numpy()) + adj_b.numpy())
+        adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
         return
 
+    # handle case in which batch_count exceeds max_batch_count, which is a CUDA array size maximum
+    max_batch_count = 65535
+    iters = int(batch_count / max_batch_count)
+    remainder = batch_count % max_batch_count
+
     cc = device.arch
 
+    for i in range(iters):
+        idx_start = i * max_batch_count
+        idx_end = (i + 1) * max_batch_count if i < iters - 1 else batch_count
+
+        # adj_a
+        if not a.is_transposed:
+            ret = runtime.core.cutlass_gemm(
+                cc,
+                m,
+                k,
+                n,
+                type_typestr(a.dtype).encode(),
+                ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
+                alpha,
+                1.0,
+                True,
+                b.is_transposed,
+                allow_tf32x3_arith,
+                max_batch_count,
+            )
+            if not ret:
+                raise RuntimeError("adj_matmul failed.")
+        else:
+            ret = runtime.core.cutlass_gemm(
+                cc,
+                k,
+                m,
+                n,
+                type_typestr(a.dtype).encode(),
+                ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
+                alpha,
+                1.0,
+                not b.is_transposed,
+                False,
+                allow_tf32x3_arith,
+                max_batch_count,
+            )
+            if not ret:
+                raise RuntimeError("adj_matmul failed.")
+
+        # adj_b
+        if not b.is_transposed:
+            ret = runtime.core.cutlass_gemm(
+                cc,
+                k,
+                n,
+                m,
+                type_typestr(a.dtype).encode(),
+                ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
+                alpha,
+                1.0,
+                a.is_transposed,
+                True,
+                allow_tf32x3_arith,
+                max_batch_count,
+            )
+            if not ret:
+                raise RuntimeError("adj_matmul failed.")
+        else:
+            ret = runtime.core.cutlass_gemm(
+                cc,
+                n,
+                k,
+                m,
+                type_typestr(a.dtype).encode(),
+                ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
+                alpha,
+                1.0,
+                False,
+                not a.is_transposed,
+                allow_tf32x3_arith,
+                max_batch_count,
+            )
+            if not ret:
+                raise RuntimeError("adj_matmul failed.")
+
+    idx_start = iters * max_batch_count
+
     # adj_a
-    ret = runtime.core.cutlass_gemm(
-        cc,
-        m,
-        k,
-        n,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(adj_d.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(adj_a.ptr),
-        alpha,
-        0.0,
-        True,
-        False,
-        allow_tf32x3_arith,
-        batch_count,
-    )
-    if not ret:
-        raise RuntimeError("adj_matmul failed.")
+    if not a.is_transposed:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            m,
+            k,
+            n,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
+            ctypes.c_void_p(b[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
+            alpha,
+            1.0,
+            True,
+            b.is_transposed,
+            allow_tf32x3_arith,
+            remainder,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
+    else:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            k,
+            m,
+            n,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(b[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
+            alpha,
+            1.0,
+            not b.is_transposed,
+            False,
+            allow_tf32x3_arith,
+            remainder,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
 
     # adj_b
-    ret = runtime.core.cutlass_gemm(
-        cc,
-        k,
-        n,
-        m,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(adj_d.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(adj_b.ptr),
-        alpha,
-        0.0,
-        False,
-        True,
-        allow_tf32x3_arith,
-        batch_count,
-    )
-    if not ret:
-        raise RuntimeError("adj_matmul failed.")
+    if not b.is_transposed:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            k,
+            n,
+            m,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(a[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
+            alpha,
+            1.0,
+            a.is_transposed,
+            True,
+            allow_tf32x3_arith,
+            remainder,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
+    else:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            n,
+            k,
+            m,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
+            ctypes.c_void_p(a[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
+            alpha,
+            1.0,
+            False,
+            not a.is_transposed,
+            allow_tf32x3_arith,
+            remainder,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
 
     # adj_c
-    ret = runtime.core.cutlass_gemm(
-        cc,
-        m,
-        n,
-        k,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(adj_d.ptr),
-        ctypes.c_void_p(adj_c.ptr),
-        0.0,
-        beta,
-        True,
-        True,
-        allow_tf32x3_arith,
-        batch_count,
+    warp.launch(
+        kernel=warp.utils.add_kernel_3d,
+        dim=adj_c.shape,
+        inputs=[adj_c, adj_d, adj_d.dtype(beta)],
+        device=device,
+        record_tape=False
     )
-    if not ret:
-        raise RuntimeError("adj_matmul failed.")
-
 
 
 class HashGrid:
     def __init__(self, dim_x, dim_y, dim_z, device=None):
@@ -3511,7 +3920,7 @@ def type_matches_template(arg_type, template_type):
         return True
     elif is_array(template_type):
         # ensure the argument type is a non-generic array with matching dtype and dimensionality
-        if type(arg_type) != type(template_type):
+        if type(arg_type) is not type(template_type):
             return False
         if not type_matches_template(arg_type.dtype, template_type.dtype):
             return False
@@ -3567,7 +3976,7 @@ def infer_argument_types(args, template_types, arg_names=None):
             arg_types.append(arg._cls)
         # elif arg_type in [warp.types.launch_bounds_t, warp.types.shape_t, warp.types.range_t]:
        #     arg_types.append(arg_type)
-        # elif arg_type in [warp.hash_grid_query_t, warp.mesh_query_aabb_t, warp.bvh_query_t]:
+        # elif arg_type in [warp.hash_grid_query_t, warp.mesh_query_aabb_t, warp.mesh_query_point_t, warp.mesh_query_ray_t, warp.bvh_query_t]:
         #     arg_types.append(arg_type)
         elif arg is None:
             # allow passing None for arrays
@@ -3605,6 +4014,8 @@ simple_type_codes = {
     launch_bounds_t: "lb",
     hash_grid_query_t: "hgq",
     mesh_query_aabb_t: "mqa",
+    mesh_query_point_t: "mqp",
+    mesh_query_ray_t: "mqr",
     bvh_query_t: "bvhq",
 }