warp-lang 0.10.1__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of warp-lang has been flagged as possibly problematic.

Files changed (300)
  1. warp/__init__.py +10 -4
  2. warp/__init__.pyi +1 -0
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +5 -3
  6. warp/build_dll.py +29 -9
  7. warp/builtins.py +868 -507
  8. warp/codegen.py +1074 -638
  9. warp/config.py +3 -3
  10. warp/constants.py +6 -0
  11. warp/context.py +715 -222
  12. warp/fabric.py +326 -0
  13. warp/fem/__init__.py +27 -0
  14. warp/fem/cache.py +389 -0
  15. warp/fem/dirichlet.py +181 -0
  16. warp/fem/domain.py +263 -0
  17. warp/fem/field/__init__.py +101 -0
  18. warp/fem/field/field.py +149 -0
  19. warp/fem/field/nodal_field.py +299 -0
  20. warp/fem/field/restriction.py +21 -0
  21. warp/fem/field/test.py +181 -0
  22. warp/fem/field/trial.py +183 -0
  23. warp/fem/geometry/__init__.py +19 -0
  24. warp/fem/geometry/closest_point.py +70 -0
  25. warp/fem/geometry/deformed_geometry.py +271 -0
  26. warp/fem/geometry/element.py +744 -0
  27. warp/fem/geometry/geometry.py +186 -0
  28. warp/fem/geometry/grid_2d.py +373 -0
  29. warp/fem/geometry/grid_3d.py +435 -0
  30. warp/fem/geometry/hexmesh.py +953 -0
  31. warp/fem/geometry/partition.py +376 -0
  32. warp/fem/geometry/quadmesh_2d.py +532 -0
  33. warp/fem/geometry/tetmesh.py +840 -0
  34. warp/fem/geometry/trimesh_2d.py +577 -0
  35. warp/fem/integrate.py +1616 -0
  36. warp/fem/operator.py +191 -0
  37. warp/fem/polynomial.py +213 -0
  38. warp/fem/quadrature/__init__.py +2 -0
  39. warp/fem/quadrature/pic_quadrature.py +245 -0
  40. warp/fem/quadrature/quadrature.py +294 -0
  41. warp/fem/space/__init__.py +292 -0
  42. warp/fem/space/basis_space.py +489 -0
  43. warp/fem/space/collocated_function_space.py +105 -0
  44. warp/fem/space/dof_mapper.py +236 -0
  45. warp/fem/space/function_space.py +145 -0
  46. warp/fem/space/grid_2d_function_space.py +267 -0
  47. warp/fem/space/grid_3d_function_space.py +306 -0
  48. warp/fem/space/hexmesh_function_space.py +352 -0
  49. warp/fem/space/partition.py +350 -0
  50. warp/fem/space/quadmesh_2d_function_space.py +369 -0
  51. warp/fem/space/restriction.py +160 -0
  52. warp/fem/space/shape/__init__.py +15 -0
  53. warp/fem/space/shape/cube_shape_function.py +738 -0
  54. warp/fem/space/shape/shape_function.py +103 -0
  55. warp/fem/space/shape/square_shape_function.py +611 -0
  56. warp/fem/space/shape/tet_shape_function.py +567 -0
  57. warp/fem/space/shape/triangle_shape_function.py +429 -0
  58. warp/fem/space/tetmesh_function_space.py +292 -0
  59. warp/fem/space/topology.py +295 -0
  60. warp/fem/space/trimesh_2d_function_space.py +221 -0
  61. warp/fem/types.py +77 -0
  62. warp/fem/utils.py +495 -0
  63. warp/native/array.h +147 -44
  64. warp/native/builtin.h +122 -149
  65. warp/native/bvh.cpp +73 -325
  66. warp/native/bvh.cu +406 -23
  67. warp/native/bvh.h +34 -43
  68. warp/native/clang/clang.cpp +13 -8
  69. warp/native/crt.h +2 -0
  70. warp/native/cuda_crt.h +5 -0
  71. warp/native/cuda_util.cpp +15 -3
  72. warp/native/cuda_util.h +3 -1
  73. warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
  74. warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
  75. warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
  76. warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
  77. warp/native/cutlass/tools/library/scripts/library.py +799 -0
  78. warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
  79. warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
  80. warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
  81. warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
  82. warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
  83. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
  84. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
  85. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
  86. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
  87. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
  88. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
  89. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
  90. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
  91. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
  92. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
  93. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
  94. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
  95. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
  96. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
  97. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
  98. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
  99. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
  100. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
  101. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
  102. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
  103. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
  104. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
  105. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
  106. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
  107. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
  108. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
  109. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
  110. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
  111. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
  112. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
  113. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
  114. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
  115. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  116. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  117. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
  118. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
  119. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
  120. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  121. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  122. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
  123. warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
  124. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
  125. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
  126. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
  127. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
  128. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
  129. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
  130. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
  131. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
  132. warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
  133. warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
  134. warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
  135. warp/native/cutlass/tools/library/scripts/rt.py +796 -0
  136. warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
  137. warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
  138. warp/native/cutlass_gemm.cu +5 -3
  139. warp/native/exports.h +1240 -952
  140. warp/native/fabric.h +228 -0
  141. warp/native/hashgrid.cpp +4 -4
  142. warp/native/hashgrid.h +22 -2
  143. warp/native/intersect.h +22 -7
  144. warp/native/intersect_adj.h +8 -8
  145. warp/native/intersect_tri.h +1 -1
  146. warp/native/marching.cu +157 -161
  147. warp/native/mat.h +80 -19
  148. warp/native/matnn.h +2 -2
  149. warp/native/mesh.cpp +33 -108
  150. warp/native/mesh.cu +114 -23
  151. warp/native/mesh.h +446 -46
  152. warp/native/noise.h +272 -329
  153. warp/native/quat.h +51 -8
  154. warp/native/rand.h +45 -35
  155. warp/native/range.h +6 -2
  156. warp/native/reduce.cpp +1 -1
  157. warp/native/reduce.cu +10 -12
  158. warp/native/runlength_encode.cu +6 -10
  159. warp/native/scan.cu +8 -11
  160. warp/native/sparse.cpp +4 -4
  161. warp/native/sparse.cu +164 -154
  162. warp/native/spatial.h +2 -2
  163. warp/native/temp_buffer.h +14 -30
  164. warp/native/vec.h +107 -23
  165. warp/native/volume.h +120 -0
  166. warp/native/warp.cpp +560 -30
  167. warp/native/warp.cu +431 -44
  168. warp/native/warp.h +13 -4
  169. warp/optim/__init__.py +1 -0
  170. warp/optim/linear.py +922 -0
  171. warp/optim/sgd.py +92 -0
  172. warp/render/render_opengl.py +335 -119
  173. warp/render/render_usd.py +11 -11
  174. warp/sim/__init__.py +2 -2
  175. warp/sim/articulation.py +385 -185
  176. warp/sim/collide.py +8 -0
  177. warp/sim/import_mjcf.py +297 -106
  178. warp/sim/import_urdf.py +389 -210
  179. warp/sim/import_usd.py +198 -97
  180. warp/sim/inertia.py +17 -18
  181. warp/sim/integrator_euler.py +14 -8
  182. warp/sim/integrator_xpbd.py +158 -16
  183. warp/sim/model.py +795 -291
  184. warp/sim/render.py +3 -3
  185. warp/sim/utils.py +3 -0
  186. warp/sparse.py +640 -150
  187. warp/stubs.py +606 -267
  188. warp/tape.py +61 -10
  189. warp/tests/__main__.py +3 -6
  190. warp/tests/assets/curlnoise_golden.npy +0 -0
  191. warp/tests/assets/pnoise_golden.npy +0 -0
  192. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  193. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  194. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  195. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  196. warp/tests/aux_test_unresolved_func.py +14 -0
  197. warp/tests/aux_test_unresolved_symbol.py +14 -0
  198. warp/tests/disabled_kinematics.py +239 -0
  199. warp/tests/run_coverage_serial.py +31 -0
  200. warp/tests/test_adam.py +103 -106
  201. warp/tests/test_arithmetic.py +128 -74
  202. warp/tests/test_array.py +212 -97
  203. warp/tests/test_array_reduce.py +57 -23
  204. warp/tests/test_atomic.py +64 -28
  205. warp/tests/test_bool.py +99 -0
  206. warp/tests/test_builtins_resolution.py +1292 -0
  207. warp/tests/test_bvh.py +42 -18
  208. warp/tests/test_closest_point_edge_edge.py +54 -57
  209. warp/tests/test_codegen.py +208 -130
  210. warp/tests/test_compile_consts.py +28 -20
  211. warp/tests/test_conditional.py +108 -24
  212. warp/tests/test_copy.py +10 -12
  213. warp/tests/test_ctypes.py +112 -88
  214. warp/tests/test_dense.py +21 -14
  215. warp/tests/test_devices.py +98 -0
  216. warp/tests/test_dlpack.py +75 -75
  217. warp/tests/test_examples.py +277 -0
  218. warp/tests/test_fabricarray.py +955 -0
  219. warp/tests/test_fast_math.py +15 -11
  220. warp/tests/test_fem.py +1271 -0
  221. warp/tests/test_fp16.py +53 -19
  222. warp/tests/test_func.py +187 -86
  223. warp/tests/test_generics.py +194 -49
  224. warp/tests/test_grad.py +178 -109
  225. warp/tests/test_grad_customs.py +176 -0
  226. warp/tests/test_hash_grid.py +52 -37
  227. warp/tests/test_import.py +10 -23
  228. warp/tests/test_indexedarray.py +32 -31
  229. warp/tests/test_intersect.py +18 -9
  230. warp/tests/test_large.py +141 -0
  231. warp/tests/test_launch.py +14 -41
  232. warp/tests/test_lerp.py +64 -65
  233. warp/tests/test_linear_solvers.py +154 -0
  234. warp/tests/test_lvalue.py +493 -0
  235. warp/tests/test_marching_cubes.py +12 -13
  236. warp/tests/test_mat.py +517 -2898
  237. warp/tests/test_mat_lite.py +115 -0
  238. warp/tests/test_mat_scalar_ops.py +2889 -0
  239. warp/tests/test_math.py +103 -9
  240. warp/tests/test_matmul.py +305 -69
  241. warp/tests/test_matmul_lite.py +410 -0
  242. warp/tests/test_mesh.py +71 -14
  243. warp/tests/test_mesh_query_aabb.py +41 -25
  244. warp/tests/test_mesh_query_point.py +140 -22
  245. warp/tests/test_mesh_query_ray.py +39 -22
  246. warp/tests/test_mlp.py +30 -22
  247. warp/tests/test_model.py +92 -89
  248. warp/tests/test_modules_lite.py +39 -0
  249. warp/tests/test_multigpu.py +88 -114
  250. warp/tests/test_noise.py +12 -11
  251. warp/tests/test_operators.py +16 -20
  252. warp/tests/test_options.py +11 -11
  253. warp/tests/test_pinned.py +17 -18
  254. warp/tests/test_print.py +32 -11
  255. warp/tests/test_quat.py +275 -129
  256. warp/tests/test_rand.py +18 -16
  257. warp/tests/test_reload.py +38 -34
  258. warp/tests/test_rounding.py +50 -43
  259. warp/tests/test_runlength_encode.py +168 -20
  260. warp/tests/test_smoothstep.py +9 -11
  261. warp/tests/test_snippet.py +143 -0
  262. warp/tests/test_sparse.py +261 -63
  263. warp/tests/test_spatial.py +276 -243
  264. warp/tests/test_streams.py +110 -85
  265. warp/tests/test_struct.py +268 -63
  266. warp/tests/test_tape.py +39 -21
  267. warp/tests/test_torch.py +118 -89
  268. warp/tests/test_transient_module.py +12 -13
  269. warp/tests/test_types.py +614 -0
  270. warp/tests/test_utils.py +494 -0
  271. warp/tests/test_vec.py +354 -2050
  272. warp/tests/test_vec_lite.py +73 -0
  273. warp/tests/test_vec_scalar_ops.py +2099 -0
  274. warp/tests/test_volume.py +457 -293
  275. warp/tests/test_volume_write.py +124 -134
  276. warp/tests/unittest_serial.py +35 -0
  277. warp/tests/unittest_suites.py +341 -0
  278. warp/tests/unittest_utils.py +568 -0
  279. warp/tests/unused_test_misc.py +71 -0
  280. warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
  281. warp/thirdparty/appdirs.py +36 -45
  282. warp/thirdparty/unittest_parallel.py +549 -0
  283. warp/torch.py +9 -6
  284. warp/types.py +1089 -366
  285. warp/utils.py +93 -387
  286. warp_lang-0.11.0.dist-info/METADATA +238 -0
  287. warp_lang-0.11.0.dist-info/RECORD +332 -0
  288. {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
  289. warp/tests/test_all.py +0 -219
  290. warp/tests/test_array_scan.py +0 -60
  291. warp/tests/test_base.py +0 -208
  292. warp/tests/test_unresolved_func.py +0 -7
  293. warp/tests/test_unresolved_symbol.py +0 -7
  294. warp_lang-0.10.1.dist-info/METADATA +0 -21
  295. warp_lang-0.10.1.dist-info/RECORD +0 -188
  296. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  297. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  298. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  299. {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/LICENSE.md +0 -0
  300. {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0
warp/types.py CHANGED
@@ -5,19 +5,17 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
+from __future__ import annotations
+
+import builtins
 import ctypes
 import hashlib
+import inspect
 import struct
 import zlib
-import numpy as np
+from typing import Any, Callable, Generic, List, Tuple, TypeVar, Union
 
-from typing import Any
-from typing import Tuple
-from typing import TypeVar
-from typing import Generic
-from typing import List
-from typing import Callable
-from typing import Union
+import numpy as np
 
 import warp
 
@@ -54,12 +52,14 @@ def constant(x):
     global _constant_hash
 
     # hash the constant value
-    if isinstance(x, int):
+    if isinstance(x, builtins.bool):
+        # This needs to come before the check for `int` since all boolean
+        # values are also instances of `int`.
+        _constant_hash.update(struct.pack("?", x))
+    elif isinstance(x, int):
         _constant_hash.update(struct.pack("<q", x))
     elif isinstance(x, float):
         _constant_hash.update(struct.pack("<d", x))
-    elif isinstance(x, bool):
-        _constant_hash.update(struct.pack("?", x))
     elif isinstance(x, float16):
         # float16 is a special case
         p = ctypes.pointer(ctypes.c_float(x.value))
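The boolean branch now runs before the `int` branch because `True` and `False` are themselves `int` instances in Python, so a boolean constant hashes distinctly from an integer one. A minimal usage sketch (not part of the upstream diff; assumes Warp 0.11.0 installed as `wp`):

    import warp as wp

    # A bool constant now hashes via struct.pack("?", ...) rather than as a 64-bit int.
    ENABLE_CLAMP = wp.constant(True)

    @wp.kernel
    def clamp_kernel(values: wp.array(dtype=float)):
        tid = wp.tid()
        if ENABLE_CLAMP:
            values[tid] = wp.min(values[tid], 1.0)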
@@ -149,28 +149,74 @@ def vector(length, dtype):
 
         def __setitem__(self, key, value):
             if isinstance(key, int):
-                super().__setitem__(key, vec_t.scalar_import(value))
-                return value
+                try:
+                    return super().__setitem__(key, vec_t.scalar_import(value))
+                except (TypeError, ctypes.ArgumentError):
+                    raise TypeError(
+                        f"Expected to assign a `{self._wp_scalar_type_.__name__}` value "
+                        f"but got `{type(value).__name__}` instead"
+                    ) from None
             elif isinstance(key, slice):
+                try:
+                    iter(value)
+                except TypeError:
+                    raise TypeError(
+                        f"Expected to assign a slice from a sequence of values "
+                        f"but got `{type(value).__name__}` instead"
+                    ) from None
+
                 if self._wp_scalar_type_ == float16:
-                    super().__setitem__(key, [vec_t.scalar_import(x) for x in value])
-                    return value
-                else:
+                    converted = []
+                    try:
+                        for x in value:
+                            converted.append(vec_t.scalar_import(x))
+                    except ctypes.ArgumentError:
+                        raise TypeError(
+                            f"Expected to assign a slice from a sequence of `float16` values "
+                            f"but got `{type(x).__name__}` instead"
+                        ) from None
+
+                    value = converted
+
+                try:
                     return super().__setitem__(key, value)
+                except TypeError:
+                    for x in value:
+                        try:
+                            self._type_(x)
+                        except TypeError:
+                            raise TypeError(
+                                f"Expected to assign a slice from a sequence of `{self._wp_scalar_type_.__name__}` values "
+                                f"but got `{type(x).__name__}` instead"
+                            ) from None
             else:
                 raise KeyError(f"Invalid key {key}, expected int or slice")
 
+        def __getattr__(self, name):
+            idx = "xyzw".find(name)
+            if idx != -1:
+                return self.__getitem__(idx)
+
+            return self.__getattribute__(name)
+
+        def __setattr__(self, name, value):
+            idx = "xyzw".find(name)
+            if idx != -1:
+                return self.__setitem__(idx, value)
+
+            return super().__setattr__(name, value)
+
         def __add__(self, y):
             return warp.add(self, y)
 
         def __radd__(self, y):
-            return warp.add(self, y)
+            return warp.add(y, self)
 
         def __sub__(self, y):
             return warp.sub(self, y)
 
-        def __rsub__(self, x):
-            return warp.sub(x, self)
+        def __rsub__(self, y):
+            return warp.sub(y, self)
 
         def __mul__(self, y):
             return warp.mul(self, y)
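The new `__getattr__`/`__setattr__` hooks expose `x`/`y`/`z`/`w` component access on vector instances from Python scope, and the reflected operators now pass operands in the correct order. A short usage sketch (not part of the diff; assumes Warp 0.11.0 installed as `wp`):

    import warp as wp

    wp.init()

    v = wp.vec3(1.0, 2.0, 3.0)
    v.y = 5.0               # routed through __setattr__ -> __setitem__
    print(v.x, v.y, v.z)    # 1.0 5.0 3.0

    w = 2.0 * v             # reflected multiply, evaluated as mul(2.0, v)
    print(w)                # [2.0, 10.0, 6.0]

    try:
        v[0] = "oops"       # non-scalar assignment now raises a descriptive TypeError
    except TypeError as exc:
        print(exc)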
@@ -178,17 +224,17 @@ def vector(length, dtype):
         def __rmul__(self, x):
             return warp.mul(x, self)
 
-        def __div__(self, y):
+        def __truediv__(self, y):
             return warp.div(self, y)
 
-        def __rdiv__(self, x):
+        def __rtruediv__(self, x):
             return warp.div(x, self)
 
-        def __pos__(self, y):
-            return warp.pos(self, y)
+        def __pos__(self):
+            return warp.pos(self)
 
-        def __neg__(self, y):
-            return warp.neg(self, y)
+        def __neg__(self):
+            return warp.neg(self)
 
         def __str__(self):
             return f"[{', '.join(map(str, self))}]"
@@ -280,13 +326,13 @@ def matrix(shape, dtype):
             return warp.add(self, y)
 
         def __radd__(self, y):
-            return warp.add(self, y)
+            return warp.add(y, self)
 
         def __sub__(self, y):
             return warp.sub(self, y)
 
-        def __rsub__(self, x):
-            return warp.sub(x, self)
+        def __rsub__(self, y):
+            return warp.sub(y, self)
 
         def __mul__(self, y):
             return warp.mul(self, y)
@@ -300,17 +346,17 @@ def matrix(shape, dtype):
         def __rmatmul__(self, x):
             return warp.mul(x, self)
 
-        def __div__(self, y):
+        def __truediv__(self, y):
             return warp.div(self, y)
 
-        def __rdiv__(self, x):
+        def __rtruediv__(self, x):
             return warp.div(x, self)
 
-        def __pos__(self, y):
-            return warp.pos(self, y)
+        def __pos__(self):
+            return warp.pos(self)
 
-        def __neg__(self, y):
-            return warp.neg(self, y)
+        def __neg__(self):
+            return warp.neg(self)
 
         def __str__(self):
             row_str = []
@@ -341,10 +387,28 @@ def matrix(shape, dtype):
         def set_row(self, r, v):
             if r < 0 or r >= self._shape_[0]:
                 raise IndexError("Invalid row index")
+            try:
+                iter(v)
+            except TypeError:
+                raise TypeError(
+                    f"Expected to assign a slice from a sequence of values "
+                    f"but got `{type(v).__name__}` instead"
+                ) from None
+
             row_start = r * self._shape_[1]
             row_end = row_start + self._shape_[1]
             if self._wp_scalar_type_ == float16:
-                v = [mat_t.scalar_import(x) for x in v]
+                converted = []
+                try:
+                    for x in v:
+                        converted.append(mat_t.scalar_import(x))
+                except ctypes.ArgumentError:
+                    raise TypeError(
+                        f"Expected to assign a slice from a sequence of `float16` values "
+                        f"but got `{type(x).__name__}` instead"
+                    ) from None
+
+                v = converted
             super().__setitem__(slice(row_start, row_end), v)
 
         def __getitem__(self, key):
@@ -352,6 +416,8 @@ def matrix(shape, dtype):
                 # element indexing m[i,j]
                 if len(key) != 2:
                     raise KeyError(f"Invalid key, expected one or two indices, got {len(key)}")
+                if any(isinstance(x, slice) for x in key):
+                    raise KeyError(f"Slices are not supported when indexing matrices using the `m[i, j]` notation")
                 return mat_t.scalar_export(super().__getitem__(key[0] * self._shape_[1] + key[1]))
             elif isinstance(key, int):
                 # row vector indexing m[r]
@@ -364,12 +430,20 @@ def matrix(shape, dtype):
                 # element indexing m[i,j] = x
                 if len(key) != 2:
                     raise KeyError(f"Invalid key, expected one or two indices, got {len(key)}")
-                super().__setitem__(key[0] * self._shape_[1] + key[1], mat_t.scalar_import(value))
-                return value
+                if any(isinstance(x, slice) for x in key):
+                    raise KeyError(f"Slices are not supported when indexing matrices using the `m[i, j]` notation")
+                try:
+                    return super().__setitem__(key[0] * self._shape_[1] + key[1], mat_t.scalar_import(value))
+                except (TypeError, ctypes.ArgumentError):
+                    raise TypeError(
+                        f"Expected to assign a `{self._wp_scalar_type_.__name__}` value "
+                        f"but got `{type(value).__name__}` instead"
+                    ) from None
             elif isinstance(key, int):
                 # row vector indexing m[r] = v
-                self.set_row(key, value)
-                return value
+                return self.set_row(key, value)
+            elif isinstance(key, slice):
+                raise KeyError(f"Slices are not supported when indexing matrices using the `m[start:end]` notation")
             else:
                 raise KeyError(f"Invalid key {key}, expected int or pair of ints")
 
@@ -392,6 +466,23 @@ class void:
     pass
 
 
+class bool:
+    _length_ = 1
+    _type_ = ctypes.c_bool
+
+    def __init__(self, x=False):
+        self.value = x
+
+    def __bool__(self) -> bool:
+        return self.value != 0
+
+    def __float__(self) -> float:
+        return float(self.value != 0)
+
+    def __int__(self) -> int:
+        return int(self.value != 0)
+
+
 class float16:
     _length_ = 1
     _type_ = ctypes.c_uint16
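The new `bool` scalar type wraps `ctypes.c_bool` and, further down in this diff, is mapped to `np.bool_`, so boolean data can round-trip between Warp arrays and NumPy. A small sketch (not part of the diff; assumes the new type is re-exported as `wp.bool` in Warp 0.11.0):

    import warp as wp

    wp.init()

    flags = wp.array([True, False, True], dtype=wp.bool, device="cpu")
    print(flags.numpy())  # array([ True, False,  True])

    # wp.bool values coerce cleanly to Python types
    print(bool(wp.bool(True)), int(wp.bool(True)), float(wp.bool(False)))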
@@ -399,6 +490,15 @@ class float16:
     def __init__(self, x=0.0):
         self.value = x
 
+    def __bool__(self) -> bool:
+        return self.value != 0.0
+
+    def __float__(self) -> float:
+        return float(self.value)
+
+    def __int__(self) -> int:
+        return int(self.value)
+
 
 class float32:
     _length_ = 1
@@ -407,6 +507,15 @@ class float32:
     def __init__(self, x=0.0):
         self.value = x
 
+    def __bool__(self) -> bool:
+        return self.value != 0.0
+
+    def __float__(self) -> float:
+        return float(self.value)
+
+    def __int__(self) -> int:
+        return int(self.value)
+
 
 class float64:
     _length_ = 1
@@ -415,6 +524,15 @@ class float64:
     def __init__(self, x=0.0):
         self.value = x
 
+    def __bool__(self) -> bool:
+        return self.value != 0.0
+
+    def __float__(self) -> float:
+        return float(self.value)
+
+    def __int__(self) -> int:
+        return int(self.value)
+
 
 class int8:
     _length_ = 1
@@ -423,6 +541,18 @@ class int8:
     def __init__(self, x=0):
         self.value = x
 
+    def __bool__(self) -> bool:
+        return self.value != 0
+
+    def __float__(self) -> float:
+        return float(self.value)
+
+    def __int__(self) -> int:
+        return int(self.value)
+
+    def __index__(self) -> int:
+        return int(self.value)
+
 
 class uint8:
     _length_ = 1
@@ -431,6 +561,18 @@ class uint8:
     def __init__(self, x=0):
         self.value = x
 
+    def __bool__(self) -> bool:
+        return self.value != 0
+
+    def __float__(self) -> float:
+        return float(self.value)
+
+    def __int__(self) -> int:
+        return int(self.value)
+
+    def __index__(self) -> int:
+        return int(self.value)
+
 
 class int16:
     _length_ = 1
@@ -439,6 +581,18 @@ class int16:
     def __init__(self, x=0):
         self.value = x
 
+    def __bool__(self) -> bool:
+        return self.value != 0
+
+    def __float__(self) -> float:
+        return float(self.value)
+
+    def __int__(self) -> int:
+        return int(self.value)
+
+    def __index__(self) -> int:
+        return int(self.value)
+
 
 class uint16:
     _length_ = 1
@@ -447,6 +601,18 @@ class uint16:
     def __init__(self, x=0):
         self.value = x
 
+    def __bool__(self) -> bool:
+        return self.value != 0
+
+    def __float__(self) -> float:
+        return float(self.value)
+
+    def __int__(self) -> int:
+        return int(self.value)
+
+    def __index__(self) -> int:
+        return int(self.value)
+
 
 class int32:
     _length_ = 1
@@ -455,6 +621,18 @@ class int32:
     def __init__(self, x=0):
         self.value = x
 
+    def __bool__(self) -> bool:
+        return self.value != 0
+
+    def __float__(self) -> float:
+        return float(self.value)
+
+    def __int__(self) -> int:
+        return int(self.value)
+
+    def __index__(self) -> int:
+        return int(self.value)
+
 
 class uint32:
     _length_ = 1
@@ -463,6 +641,18 @@ class uint32:
     def __init__(self, x=0):
         self.value = x
 
+    def __bool__(self) -> bool:
+        return self.value != 0
+
+    def __float__(self) -> float:
+        return float(self.value)
+
+    def __int__(self) -> int:
+        return int(self.value)
+
+    def __index__(self) -> int:
+        return int(self.value)
+
 
 class int64:
     _length_ = 1
@@ -471,6 +661,18 @@ class int64:
     def __init__(self, x=0):
         self.value = x
 
+    def __bool__(self) -> bool:
+        return self.value != 0
+
+    def __float__(self) -> float:
+        return float(self.value)
+
+    def __int__(self) -> int:
+        return int(self.value)
+
+    def __index__(self) -> int:
+        return int(self.value)
+
 
 class uint64:
     _length_ = 1
@@ -479,6 +681,18 @@ class uint64:
     def __init__(self, x=0):
         self.value = x
 
+    def __bool__(self) -> bool:
+        return self.value != 0
+
+    def __float__(self) -> float:
+        return float(self.value)
+
+    def __int__(self) -> int:
+        return int(self.value)
+
+    def __index__(self) -> int:
+        return int(self.value)
+
 
 def quaternion(dtype=Any):
     class quat_t(vector(length=4, dtype=dtype)):
@@ -508,23 +722,63 @@ class quatd(quaternion(dtype=float64)):
 
 def transformation(dtype=Any):
     class transform_t(vector(length=7, dtype=dtype)):
+        _wp_init_from_components_sig_ = inspect.Signature(
+            (
+                inspect.Parameter(
+                    "p",
+                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                    default=(0.0, 0.0, 0.0),
+                ),
+                inspect.Parameter(
+                    "q",
+                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                    default=(0.0, 0.0, 0.0, 1.0),
+                ),
+            ),
+        )
         _wp_type_params_ = [dtype]
         _wp_generic_type_str_ = "transform_t"
         _wp_constructor_ = "transformation"
 
-        def __init__(self, p=(0.0, 0.0, 0.0), q=(0.0, 0.0, 0.0, 1.0)):
-            super().__init__()
+        def __init__(self, *args, **kwargs):
+            if len(args) == 1 and len(kwargs) == 0:
+                if getattr(args[0], "_wp_generic_type_str_") == self._wp_generic_type_str_:
+                    # Copy constructor.
+                    super().__init__(*args[0])
+                    return
+
+            try:
+                # For backward compatibility, try to check if the arguments
+                # match the original signature that'd allow initializing
+                # the `p` and `q` components separately.
+                bound_args = self._wp_init_from_components_sig_.bind(*args, **kwargs)
+                bound_args.apply_defaults()
+                p, q = bound_args.args
+            except (TypeError, ValueError):
+                # Fallback to the vector's constructor.
+                super().__init__(*args)
+                return
+
+            # Even if the arguments match the original "from components"
+            # signature, we still need to make sure that they represent
+            # sequences that can be unpacked.
+            if hasattr(p, "__len__") and hasattr(q, "__len__"):
+                # Initialize from the `p` and `q` components.
+                super().__init__()
+                self[0:3] = vector(length=3, dtype=dtype)(*p)
+                self[3:7] = quaternion(dtype=dtype)(*q)
+                return
 
-            self[0:3] = vector(length=3, dtype=dtype)(*p)
-            self[3:7] = quaternion(dtype=dtype)(*q)
+            # Fallback to the vector's constructor.
+            super().__init__(*args)
 
         @property
         def p(self):
-            return self[0:3]
+            return vec3(self[0:3])
 
         @property
         def q(self):
-            return self[3:7]
+            return quat(self[3:7])
 
     return transform_t
 
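The reworked `transform_t.__init__` keeps the original `(p, q)` signature (including keyword form), adds a copy constructor, and otherwise falls back to the 7-component vector constructor, while `p`/`q` now return typed values. A usage sketch (not part of the diff; assumes Warp 0.11.0 with the usual `wp.transform` alias):

    import warp as wp

    t1 = wp.transform(p=(1.0, 2.0, 3.0), q=(0.0, 0.0, 0.0, 1.0))  # original component form
    t2 = wp.transform(t1)                                          # copy constructor
    t3 = wp.transform(1.0, 2.0, 3.0, 0.0, 0.0, 0.0, 1.0)           # flat 7-value form

    print(t1.p)  # returned as wp.vec3 instead of a raw slice
    print(t1.q)  # returned as wp.quat instead of a raw slice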
@@ -808,6 +1062,7 @@ vector_types = [
 ]
 
 np_dtype_to_warp_type = {
+    np.dtype(np.bool_): bool,
     np.dtype(np.int8): int8,
     np.dtype(np.uint8): uint8,
     np.dtype(np.int16): int16,
@@ -824,6 +1079,7 @@ np_dtype_to_warp_type = {
 }
 
 warp_type_to_np_dtype = {
+    bool: np.bool_,
     int8: np.int8,
     int16: np.int16,
     int32: np.int32,
@@ -846,18 +1102,21 @@ class range_t:
 
 # definition just for kernel type (cannot be a parameter), see bvh.h
 class bvh_query_t:
+    """Object used to track state during BVH traversal."""
     def __init__(self):
         pass
 
 
 # definition just for kernel type (cannot be a parameter), see mesh.h
 class mesh_query_aabb_t:
+    """Object used to track state during mesh traversal."""
    def __init__(self):
        pass
 
 
 # definition just for kernel type (cannot be a parameter), see hash_grid.h
 class hash_grid_query_t:
+    """Object used to track state during neighbor traversal."""
    def __init__(self):
        pass
 
@@ -869,6 +1128,8 @@ LAUNCH_MAX_DIMS = 4
 # must match array.h
 ARRAY_TYPE_REGULAR = 0
 ARRAY_TYPE_INDEXED = 1
+ARRAY_TYPE_FABRIC = 2
+ARRAY_TYPE_FABRIC_INDEXED = 3
 
 
 # represents bounds for kernel launch (number of threads across multiple dimensions)
@@ -992,7 +1253,7 @@ def type_scalar_type(dtype):
 def type_size_in_bytes(dtype):
     if dtype.__module__ == "ctypes":
         return ctypes.sizeof(dtype)
-    elif type_is_struct(dtype):
+    elif isinstance(dtype, warp.codegen.Struct):
         return ctypes.sizeof(dtype.ctype)
     elif dtype == float or dtype == int:
         return 4
@@ -1013,9 +1274,9 @@ def type_to_warp(dtype):
 
 
 def type_typestr(dtype):
-    from warp.codegen import Struct
-
-    if dtype == float16:
+    if dtype == bool:
+        return "?"
+    elif dtype == float16:
         return "<f2"
     elif dtype == float32:
         return "<f4"
@@ -1037,7 +1298,7 @@ def type_typestr(dtype):
         return "<i8"
     elif dtype == uint64:
         return "<u8"
-    elif isinstance(dtype, Struct):
+    elif isinstance(dtype, warp.codegen.Struct):
         return f"|V{ctypes.sizeof(dtype.ctype)}"
     elif issubclass(dtype, ctypes.Array):
         return type_typestr(dtype._wp_scalar_type_)
@@ -1051,9 +1312,16 @@ def type_repr(t):
         return str(f"array(ndim={t.ndim}, dtype={t.dtype})")
     if type_is_vector(t):
         return str(f"vector(length={t._shape_[0]}, dtype={t._wp_scalar_type_})")
-    elif type_is_matrix(t):
+    if type_is_matrix(t):
         return str(f"matrix(shape=({t._shape_[0]}, {t._shape_[1]}), dtype={t._wp_scalar_type_})")
-    else:
+    if isinstance(t, warp.codegen.Struct):
+        return type_repr(t.cls)
+    if t in scalar_types:
+        return t.__name__
+
+    try:
+        return t.__module__ + "." + t.__qualname__
+    except AttributeError:
         return str(t)
 
 
@@ -1071,15 +1339,6 @@ def type_is_float(t):
     return t in float_types
 
 
-def type_is_struct(dtype):
-    from warp.codegen import Struct
-
-    if isinstance(dtype, Struct):
-        return True
-    else:
-        return False
-
-
 # returns True if the passed *type* is a vector
 def type_is_vector(t):
     if hasattr(t, "_wp_generic_type_str_") and t._wp_generic_type_str_ == "vec_t":
@@ -1098,7 +1357,7 @@ def type_is_matrix(t):
 
 # returns true for all value types (int, float, bool, scalars, vectors, matrices)
 def type_is_value(x):
-    if (x == int) or (x == float) or (x == bool) or (x in scalar_types) or issubclass(x, ctypes.Array):
+    if (x == int) or (x == float) or (x == builtins.bool) or (x in scalar_types) or issubclass(x, ctypes.Array):
         return True
     else:
         return False
@@ -1126,14 +1385,16 @@ def types_equal(a, b, match_generic=False):
     # convert to canonical types
     if a == float:
         a = float32
-    if a == int:
+    elif a == int:
         a = int32
 
     if b == float:
         b = float32
-    if b == int:
+    elif b == int:
         b = int32
 
+    compatible_bool_types = [builtins.bool, bool]
+
     def are_equal(p1, p2):
         if match_generic:
             if p1 == Any or p2 == Any:
@@ -1150,7 +1411,22 @@ def types_equal(a, b, match_generic=False):
                 return True
             if p1 == Float and p2 == Float:
                 return True
-        return p1 == p2
+
+        # convert to canonical types
+        if p1 == float:
+            p1 = float32
+        elif p1 == int:
+            p1 = int32
+
+        if p2 == float:
+            p2 = float32
+        elif b == int:
+            p2 = int32
+
+        if p1 in compatible_bool_types and p2 in compatible_bool_types:
+            return True
+        else:
+            return p1 == p2
 
     if (
         hasattr(a, "_wp_generic_type_str_")
@@ -1158,9 +1434,7 @@ def types_equal(a, b, match_generic=False):
         and a._wp_generic_type_str_ == b._wp_generic_type_str_
     ):
         return all([are_equal(p1, p2) for p1, p2 in zip(a._wp_type_params_, b._wp_type_params_)])
-    if isinstance(a, array) and isinstance(b, array):
-        return True
-    if isinstance(a, indexedarray) and isinstance(b, indexedarray):
+    if is_array(a) and type(a) is type(b):
         return True
     else:
         return are_equal(a, b)
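With `compatible_bool_types`, Python's built-in `bool` and the new `warp.bool` are treated as interchangeable during type matching, while array types now only match like-for-like. A small illustration (not part of the diff; assumes these helpers remain importable from `warp.types`):

    import warp as wp
    from warp.types import types_equal

    print(types_equal(wp.bool, bool))         # True: builtin bool matches wp.bool
    print(types_equal(wp.float32, float))     # True: float canonicalizes to float32
    print(types_equal(wp.int32, wp.float32))  # False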
@@ -1244,6 +1518,7 @@ class array(Array):
         self._grad = None
         # __array_interface__ or __cuda_array_interface__, evaluated lazily and cached
         self._array_interface = None
+        self.is_transposed = False
 
         # canonicalize dtype
         if dtype == int:
@@ -1317,7 +1592,9 @@ class array(Array):
             if isinstance(data, np.ndarray):
                 # construct from numpy structured array
                 if data.dtype != dtype.numpy_dtype():
-                    raise RuntimeError(f"Invalid source data type for array of structs, expected {dtype.numpy_dtype()}, got {data.dtype}")
+                    raise RuntimeError(
+                        f"Invalid source data type for array of structs, expected {dtype.numpy_dtype()}, got {data.dtype}"
+                    )
                 arr = data
             elif isinstance(data, (list, tuple)):
                 # construct from a sequence of structs
@@ -1329,9 +1606,13 @@ class array(Array):
                     # convert to numpy
                     arr = np.frombuffer(ctype_arr, dtype=dtype.ctype)
                 except Exception as e:
-                    raise RuntimeError(f"Error while trying to construct Warp array from a sequence of Warp structs: {e}")
+                    raise RuntimeError(
+                        f"Error while trying to construct Warp array from a sequence of Warp structs: {e}"
+                    )
             else:
-                raise RuntimeError(f"Invalid data argument for array of structs, expected a sequence of structs or a NumPy structured array")
+                raise RuntimeError(
+                    "Invalid data argument for array of structs, expected a sequence of structs or a NumPy structured array"
+                )
         else:
             # convert input data to the given dtype
             npdtype = warp_type_to_np_dtype.get(scalar_dtype)
@@ -1416,7 +1697,7 @@ class array(Array):
 
     def _init_from_ptr(self, ptr, dtype, shape, strides, capacity, device, owner, pinned):
         if dtype == Any:
-            raise RuntimeError(f"A concrete data type is required to create the array")
+            raise RuntimeError("A concrete data type is required to create the array")
 
         device = warp.get_device(device)
 
@@ -1450,7 +1731,7 @@ class array(Array):
 
     def _init_new(self, dtype, shape, strides, device, pinned):
         if dtype == Any:
-            raise RuntimeError(f"A concrete data type is required to create the array")
+            raise RuntimeError("A concrete data type is required to create the array")
 
         device = warp.get_device(device)
 
@@ -1753,7 +2034,7 @@ class array(Array):
         return self._requires_grad
 
     @requires_grad.setter
-    def requires_grad(self, value: bool):
+    def requires_grad(self, value: builtins.bool):
         if value and self._grad is None:
             self._alloc_grad()
         elif not value:
@@ -1778,12 +2059,11 @@ class array(Array):
         # member attributes available during code-gen (e.g.: d = array.shape[0])
         # Note: we use a shared dict for all array instances
        if array._vars is None:
-            from warp.codegen import Var
-
-            array._vars = {"shape": Var("shape", shape_t)}
+            array._vars = {"shape": warp.codegen.Var("shape", shape_t)}
        return array._vars
 
    def zero_(self):
+        """Zeroes-out the array entries."""
        if self.is_contiguous:
            # simple memset is usually faster than generic fill
            self.device.memset(self.ptr, 0, self.size * type_size_in_bytes(self.dtype))
@@ -1791,6 +2071,32 @@ class array(Array):
            self.fill_(0)
 
    def fill_(self, value):
+        """Set all array entries to `value`
+
+        args:
+            value: The value to set every array entry to. Must be convertible to the array's ``dtype``.
+
+        Raises:
+            ValueError: If `value` cannot be converted to the array's ``dtype``.
+
+        Examples:
+            ``fill_()`` can take lists or other sequences when filling arrays of vectors or matrices.
+
+            >>> arr = wp.zeros(2, dtype=wp.mat22)
+            >>> arr.numpy()
+            array([[[0., 0.],
+                    [0., 0.]],
+            <BLANKLINE>
+                   [[0., 0.],
+                    [0., 0.]]], dtype=float32)
+            >>> arr.fill_([[1, 2], [3, 4]])
+            >>> arr.numpy()
+            array([[[1., 2.],
+                    [3., 4.]],
+            <BLANKLINE>
+                   [[1., 2.],
+                    [3., 4.]]], dtype=float32)
+        """
        if self.size == 0:
            return
 
@@ -1837,19 +2143,22 @@ class array(Array):
        else:
            warp.context.runtime.core.array_fill_host(carr_ptr, ARRAY_TYPE_REGULAR, cvalue_ptr, cvalue_size)
 
-    # equivalent to wrapping src data in an array and copying to self
    def assign(self, src):
+        """Wraps ``src`` in an :class:`warp.array` if it is not already one and copies the contents to ``self``."""
        if is_array(src):
            warp.copy(self, src)
        else:
            warp.copy(self, array(data=src, dtype=self.dtype, copy=False, device="cpu"))
 
-    # convert array to ndarray (alias memory through array interface)
    def numpy(self):
+        """Converts the array to a :class:`numpy.ndarray` (aliasing memory through the array interface protocol)
+        If the array is on the GPU, a synchronous device-to-host copy (on the CUDA default stream) will be
+        automatically performed to ensure that any outstanding work is completed.
+        """
        if self.ptr:
            # use the CUDA default stream for synchronous behaviour with other streams
            with warp.ScopedStream(self.device.null_stream):
-                a = self.to("cpu")
+                a = self.to("cpu", requires_grad=False)
                # convert through __array_interface__
                # Note: this handles arrays of structs using `descr`, so the result will be a structured NumPy array
                return np.array(a, copy=False)
@@ -1866,12 +2175,16 @@ class array(Array):
            npshape = self.shape
            return np.empty(npshape, dtype=npdtype)
 
-    # return a ctypes cast of the array address
-    # note #1: only CPU arrays support this method
-    # note #2: the array must be contiguous
-    # note #3: accesses to this object are *not* bounds checked
-    # note #4: for float16 types, a pointer to the internal uint16 representation is returned
    def cptr(self):
+        """Return a ctypes cast of the array address.
+
+        Notes:
+
+        #. Only CPU arrays support this method.
+        #. The array must be contiguous.
+        #. Accesses to this object are **not** bounds checked.
+        #. For ``float16`` types, a pointer to the internal ``uint16`` representation is returned.
+        """
        if not self.ptr:
            return None
 
@@ -1890,8 +2203,8 @@ class array(Array):
 
        return p
 
-    # returns a flattened list of items in the array as a Python list
    def list(self):
+        """Returns a flattened list of items in the array as a Python list."""
        a = self.numpy()
 
        if isinstance(self.dtype, warp.codegen.Struct):
@@ -1910,15 +2223,16 @@ class array(Array):
                # scalar
                return list(a.flatten())
 
-    # convert data from one device to another, nop if already on device
-    def to(self, device):
+    def to(self, device, requires_grad=None):
+        """Returns a Warp array with this array's data moved to the specified device, no-op if already on device."""
        device = warp.get_device(device)
        if self.device == device:
            return self
        else:
-            return warp.clone(self, device=device)
+            return warp.clone(self, device=device, requires_grad=requires_grad)
 
    def flatten(self):
+        """Returns a zero-copy view of the array collapsed to 1-D. Only supported for contiguous arrays."""
        if self.ndim == 1:
            return self
 
@@ -1941,6 +2255,11 @@ class array(Array):
         return a
 
     def reshape(self, shape):
+        """Returns a reshaped array. Only supported for contiguous arrays.
+
+        Args:
+            shape : An int or tuple of ints specifying the shape of the returned array.
+        """
         if not self.is_contiguous:
             raise RuntimeError("Reshaping non-contiguous arrays is unsupported.")
 
@@ -1998,6 +2317,9 @@ class array(Array):
         return a
 
     def view(self, dtype):
+        """Returns a zero-copy view of this array's memory with a different data type.
+        ``dtype`` must have the same byte size of the array's native ``dtype``.
+        """
         if type_size_in_bytes(dtype) != type_size_in_bytes(self.dtype):
             raise RuntimeError("Cannot cast dtypes of unequal byte size")
 
@@ -2018,6 +2340,7 @@ class array(Array):
         return a
 
     def contiguous(self):
+        """Returns a contiguous array with this array's data. No-op if array is already contiguous."""
         if self.is_contiguous:
             return self
 
@@ -2025,8 +2348,14 @@ class array(Array):
         warp.copy(a, self)
         return a
 
-    # note: transpose operation will return an array with a non-contiguous access pattern
     def transpose(self, axes=None):
+        """Returns an zero-copy view of the array with axes transposed.
+
+        Note: The transpose operation will return an array with a non-contiguous access pattern.
+
+        Args:
+            axes (optional): Specifies the how the axes are permuted. If not specified, the axes order will be reversed.
+        """
         # noop if 1d array
         if self.ndim == 1:
             return self
@@ -2059,6 +2388,8 @@ class array(Array):
             grad=None if self.grad is None else self.grad.transpose(axes=axes),
         )
 
+        a.is_transposed = not self.is_transposed
+
         a._ref = self
         return a
 
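Because `transpose()` returns a strided (non-contiguous) view, the new `is_transposed` flag travels with the view and a dense copy still goes through `contiguous()`. A short sketch (not part of the diff; assumes a CPU device):

    import warp as wp

    wp.init()

    a = wp.zeros((2, 3), dtype=float, device="cpu")
    at = a.transpose()

    print(at.shape)             # (3, 2)
    print(at.is_contiguous)     # False: strided view over the same memory
    print(at.is_transposed)     # True, new in 0.11.0

    dense = at.contiguous()     # materialize a contiguous copy
    print(dense.is_contiguous)  # True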
@@ -2093,7 +2424,7 @@ def from_ptr(ptr, length, dtype=None, shape=None, device=None):
         dtype=dtype,
         length=length,
         capacity=length * type_size_in_bytes(dtype),
-        ptr=ctypes.cast(ptr, ctypes.POINTER(ctypes.c_size_t)).contents.value,
+        ptr=0 if ptr == 0 else ctypes.cast(ptr, ctypes.POINTER(ctypes.c_size_t)).contents.value,
         shape=shape,
         device=device,
         owner=False,
@@ -2101,12 +2432,113 @@ def from_ptr(ptr, length, dtype=None, shape=None, device=None):
     )
 
 
-class indexedarray(Generic[T]):
+# A base class for non-contiguous arrays, providing the implementation of common methods like
+# contiguous(), to(), numpy(), list(), assign(), zero_(), and fill_().
+class noncontiguous_array_base(Generic[T]):
+    def __init__(self, array_type_id):
+        self.type_id = array_type_id
+        self.is_contiguous = False
+
+    # return a contiguous copy
+    def contiguous(self):
+        a = warp.empty_like(self)
+        warp.copy(a, self)
+        return a
+
+    # copy data from one device to another, nop if already on device
+    def to(self, device):
+        device = warp.get_device(device)
+        if self.device == device:
+            return self
+        else:
+            return warp.clone(self, device=device)
+
+    # return a contiguous numpy copy
+    def numpy(self):
+        # use the CUDA default stream for synchronous behaviour with other streams
+        with warp.ScopedStream(self.device.null_stream):
+            return self.contiguous().numpy()
+
+    # returns a flattened list of items in the array as a Python list
+    def list(self):
+        # use the CUDA default stream for synchronous behaviour with other streams
+        with warp.ScopedStream(self.device.null_stream):
+            return self.contiguous().list()
+
+    # equivalent to wrapping src data in an array and copying to self
+    def assign(self, src):
+        if is_array(src):
+            warp.copy(self, src)
+        else:
+            warp.copy(self, array(data=src, dtype=self.dtype, copy=False, device="cpu"))
+
+    def zero_(self):
+        self.fill_(0)
+
+    def fill_(self, value):
+        if self.size == 0:
+            return
+
+        # try to convert the given value to the array dtype
+        try:
+            if isinstance(self.dtype, warp.codegen.Struct):
+                if isinstance(value, self.dtype.cls):
+                    cvalue = value.__ctype__()
+                elif value == 0:
+                    # allow zero-initializing structs using default constructor
+                    cvalue = self.dtype().__ctype__()
+                else:
+                    raise ValueError(
+                        f"Invalid initializer value for struct {self.dtype.cls.__name__}, expected struct instance or 0"
+                    )
+            elif issubclass(self.dtype, ctypes.Array):
+                # vector/matrix
+                cvalue = self.dtype(value)
+            else:
+                # scalar
+                if type(value) in warp.types.scalar_types:
+                    value = value.value
+                if self.dtype == float16:
+                    cvalue = self.dtype._type_(float_to_half_bits(value))
+                else:
+                    cvalue = self.dtype._type_(value)
+        except Exception as e:
+            raise ValueError(f"Failed to convert the value to the array data type: {e}")
+
+        cvalue_ptr = ctypes.pointer(cvalue)
+        cvalue_size = ctypes.sizeof(cvalue)
+
+        ctype = self.__ctype__()
+        ctype_ptr = ctypes.pointer(ctype)
+
+        if self.device.is_cuda:
+            warp.context.runtime.core.array_fill_device(
+                self.device.context, ctype_ptr, self.type_id, cvalue_ptr, cvalue_size
+            )
+        else:
+            warp.context.runtime.core.array_fill_host(ctype_ptr, self.type_id, cvalue_ptr, cvalue_size)
+
+
+# helper to check index array properties
+def check_index_array(indices, expected_device):
+    if not isinstance(indices, array):
+        raise ValueError(f"Indices must be a Warp array, got {type(indices)}")
+    if indices.ndim != 1:
+        raise ValueError(f"Index array must be one-dimensional, got {indices.ndim}")
+    if indices.dtype != int32:
+        raise ValueError(f"Index array must use int32, got dtype {indices.dtype}")
+    if indices.device != expected_device:
+        raise ValueError(f"Index array device ({indices.device} does not match data array device ({expected_device}))")
+
+
+class indexedarray(noncontiguous_array_base[T]):
     # member attributes available during code-gen (e.g.: d = arr.shape[0])
     # (initialized when needed)
     _vars = None
 
     def __init__(self, data: array = None, indices: Union[array, List[array]] = None, dtype=None, ndim=None):
+        super().__init__(ARRAY_TYPE_INDEXED)
+
         # canonicalize types
         if dtype is not None:
             if dtype == int:
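The shared `noncontiguous_array_base` gives `indexedarray` (and the new fabric array types) one implementation of `contiguous()`, `numpy()`, `assign()`, `zero_()`, and `fill_()`. A minimal `indexedarray` sketch (not part of the diff; assumes a CPU device):

    import warp as wp

    wp.init()

    data = wp.array([10.0, 20.0, 30.0, 40.0], dtype=float, device="cpu")
    idx = wp.array([0, 2], dtype=wp.int32, device="cpu")

    view = wp.indexedarray(data, [idx])  # gathers elements 0 and 2
    print(view.numpy())                  # [10. 30.]

    view.fill_(-1.0)                     # writes through to the underlying storage
    print(data.numpy())                  # [-1. 20. -1. 40.]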
@@ -2136,17 +2568,6 @@ class indexedarray(Generic[T]):
             shape = list(data.shape)
 
         if indices is not None:
-            # helper to check index array properties
-            def check_index_array(inds, data):
-                if inds.ndim != 1:
-                    raise ValueError(f"Index array must be one-dimensional, got {inds.ndim}")
-                if inds.dtype != int32:
-                    raise ValueError(f"Index array must use int32, got dtype {inds.dtype}")
-                if inds.device != data.device:
-                    raise ValueError(
-                        f"Index array device ({inds.device} does not match data array device ({data.device}))"
-                    )
-
             if isinstance(indices, (list, tuple)):
                 if len(indices) > self.ndim:
                     raise ValueError(
@@ -2154,16 +2575,14 @@ class indexedarray(Generic[T]):
                     )
 
                 for i in range(len(indices)):
-                    if isinstance(indices[i], array):
-                        check_index_array(indices[i], data)
+                    if indices[i] is not None:
+                        check_index_array(indices[i], data.device)
                         self.indices[i] = indices[i]
                         shape[i] = len(indices[i])
-                    elif indices[i] is not None:
-                        raise TypeError(f"Invalid index array type: {type(indices[i])}")
 
             elif isinstance(indices, array):
                 # only a single index array was provided
-                check_index_array(indices, data)
+                check_index_array(indices, data.device)
                 self.indices[0] = indices
                 shape[0] = len(indices)
 
@@ -2185,8 +2604,6 @@ class indexedarray(Generic[T]):
         for d in self.shape:
             self.size *= d
 
-        self.is_contiguous = False
-
     def __len__(self):
         return self.shape[0]
 
@@ -2206,89 +2623,9 @@ class indexedarray(Generic[T]):
         # member attributes available during code-gen (e.g.: d = arr.shape[0])
         # Note: we use a shared dict for all indexedarray instances
         if indexedarray._vars is None:
-            from warp.codegen import Var
-
-            indexedarray._vars = {"shape": Var("shape", shape_t)}
+            indexedarray._vars = {"shape": warp.codegen.Var("shape", shape_t)}
         return indexedarray._vars
 
-    def contiguous(self):
-        a = warp.empty_like(self)
-        warp.copy(a, self)
-        return a
-
-    # convert data from one device to another, nop if already on device
-    def to(self, device):
-        device = warp.get_device(device)
-        if self.device == device:
-            return self
-        else:
-            return warp.clone(self, device=device)
-
-    # return a contiguous numpy copy
-    def numpy(self):
-        # use the CUDA default stream for synchronous behaviour with other streams
-        with warp.ScopedStream(self.device.null_stream):
-            return self.contiguous().numpy()
-
-    # returns a flattened list of items in the array as a Python list
-    def list(self):
-        # use the CUDA default stream for synchronous behaviour with other streams
-        with warp.ScopedStream(self.device.null_stream):
-            return self.contiguous().list()
-
-    def zero_(self):
-        self.fill_(0)
-
-    def fill_(self, value):
-        if self.size == 0:
-            return
-
-        # try to convert the given value to the array dtype
-        try:
-            if isinstance(self.dtype, warp.codegen.Struct):
-                if isinstance(value, self.dtype.cls):
-                    cvalue = value.__ctype__()
-                elif value == 0:
-                    # allow zero-initializing structs using default constructor
-                    cvalue = self.dtype().__ctype__()
-                else:
-                    raise ValueError(
-                        f"Invalid initializer value for struct {self.dtype.cls.__name__}, expected struct instance or 0"
-                    )
-            elif issubclass(self.dtype, ctypes.Array):
-                # vector/matrix
-                cvalue = self.dtype(value)
-            else:
-                # scalar
-                if type(value) in warp.types.scalar_types:
-                    value = value.value
-                if self.dtype == float16:
-                    cvalue = self.dtype._type_(float_to_half_bits(value))
-                else:
-                    cvalue = self.dtype._type_(value)
-        except Exception as e:
-            raise ValueError(f"Failed to convert the value to the array data type: {e}")
-
-        cvalue_ptr = ctypes.pointer(cvalue)
-        cvalue_size = ctypes.sizeof(cvalue)
-
-        ctype = self.__ctype__()
-        ctype_ptr = ctypes.pointer(ctype)
-
-        if self.device.is_cuda:
-            warp.context.runtime.core.array_fill_device(
-                self.device.context, ctype_ptr, ARRAY_TYPE_INDEXED, cvalue_ptr, cvalue_size
-            )
-        else:
-            warp.context.runtime.core.array_fill_host(ctype_ptr, ARRAY_TYPE_INDEXED, cvalue_ptr, cvalue_size)
-
-    # equivalent to wrapping src data in an array and copying to self
-    def assign(self, src):
-        if is_array(src):
-            warp.copy(self, src)
-        else:
-            warp.copy(self, array(data=src, dtype=self.dtype, copy=False, device="cpu"))
-
 
 # aliases for indexedarrays with small dimensions
 def indexedarray1d(*args, **kwargs):
@@ -2314,16 +2651,22 @@ def indexedarray4d(*args, **kwargs):
2314
2651
  return indexedarray(*args, **kwargs)
2315
2652
 
2316
2653
 
2317
- array_types = (array, indexedarray)
2654
+ from warp.fabric import fabricarray, indexedfabricarray # noqa: E402
2655
+
2656
+ array_types = (array, indexedarray, fabricarray, indexedfabricarray)
2318
2657
 
2319
2658
 
2320
2659
  def array_type_id(a):
2321
- if isinstance(a, warp.array):
2322
- return warp.types.ARRAY_TYPE_REGULAR
2323
- elif isinstance(a, warp.indexedarray):
2324
- return warp.types.ARRAY_TYPE_INDEXED
2660
+ if isinstance(a, array):
2661
+ return ARRAY_TYPE_REGULAR
2662
+ elif isinstance(a, indexedarray):
2663
+ return ARRAY_TYPE_INDEXED
2664
+ elif isinstance(a, fabricarray):
2665
+ return ARRAY_TYPE_FABRIC
2666
+ elif isinstance(a, indexedfabricarray):
2667
+ return ARRAY_TYPE_FABRIC_INDEXED
2325
2668
  else:
2326
- raise ValueError(f"Invalid array")
2669
+ raise ValueError("Invalid array type")
2327
2670
 
2328
2671
 
2329
2672
  class Bvh:
@@ -2381,11 +2724,11 @@ class Bvh:
2381
2724
  with self.device.context_guard:
2382
2725
  runtime.core.bvh_destroy_device(self.id)
2383
2726
 
2384
- except:
2727
+ except Exception:
2385
2728
  pass
2386
2729
 
2387
2730
  def refit(self):
2388
- """Refit the Bvh. This should be called after users modify the `lowers` and `uppers` arrays."""
2731
+ """Refit the BVH. This should be called after users modify the `lowers` and `uppers` arrays."""
2389
2732
 
2390
2733
  from warp.context import runtime
2391
2734
 
@@ -2471,7 +2814,7 @@ class Mesh:
2471
2814
  # use CUDA context guard to avoid side effects during garbage collection
2472
2815
  with self.device.context_guard:
2473
2816
  runtime.core.mesh_destroy_device(self.id)
2474
- except:
2817
+ except Exception:
2475
2818
  pass
2476
2819
 
2477
2820
  def refit(self):
@@ -2487,16 +2830,14 @@ class Mesh:
2487
2830
 
2488
2831
 
2489
2832
  class Volume:
2833
+ #: Enum value to specify nearest-neighbor interpolation during sampling
2490
2834
  CLOSEST = constant(0)
2835
+ #: Enum value to specify trilinear interpolation during sampling
2491
2836
  LINEAR = constant(1)
2492
2837
 
2493
2838
  def __init__(self, data: array):
2494
2839
  """Class representing a sparse grid.
2495
2840
 
2496
- Attributes:
2497
- CLOSEST (int): Enum value to specify nearest-neighbor interpolation during sampling
2498
- LINEAR (int): Enum value to specify trilinear interpolation during sampling
2499
-
2500
2841
  Args:
2501
2842
  data (:class:`warp.array`): Array of bytes representing the volume in NanoVDB format
2502
2843
  """
@@ -2538,10 +2879,11 @@ class Volume:
2538
2879
  with self.device.context_guard:
2539
2880
  runtime.core.volume_destroy_device(self.id)
2540
2881
 
2541
- except:
2882
+ except Exception:
2542
2883
  pass
2543
2884
 
2544
- def array(self):
2885
+ def array(self) -> array:
2886
+ """Returns the raw memory buffer of the Volume as an array"""
2545
2887
  buf = ctypes.c_void_p(0)
2546
2888
  size = ctypes.c_uint64(0)
2547
2889
  if self.device.is_cpu:
@@ -2550,7 +2892,7 @@ class Volume:
2550
2892
  self.context.core.volume_get_buffer_info_device(self.id, ctypes.byref(buf), ctypes.byref(size))
2551
2893
  return array(ptr=buf.value, dtype=uint8, shape=size.value, device=self.device, owner=False)
2552
2894
 
2553
- def get_tiles(self):
2895
+ def get_tiles(self) -> array:
2554
2896
  if self.id == 0:
2555
2897
  raise RuntimeError("Invalid Volume")
2556
2898
 
@@ -2563,7 +2905,7 @@ class Volume:
2563
2905
  num_tiles = size.value // (3 * 4)
2564
2906
  return array(ptr=buf.value, dtype=int32, shape=(num_tiles, 3), device=self.device, owner=True)
2565
2907
 
2566
- def get_voxel_size(self):
2908
+ def get_voxel_size(self) -> Tuple[float, float, float]:
2567
2909
  if self.id == 0:
2568
2910
  raise RuntimeError("Invalid Volume")
2569
2911
 
@@ -2572,7 +2914,13 @@ class Volume:
2572
2914
  return (dx.value, dy.value, dz.value)
2573
2915
 
2574
2916
  @classmethod
2575
- def load_from_nvdb(cls, file_or_buffer, device=None):
2917
+ def load_from_nvdb(cls, file_or_buffer, device=None) -> Volume:
2918
+ """Creates a Volume object from a NanoVDB file or in-memory buffer.
2919
+
2920
+ Returns:
2921
+
2922
+ A ``warp.Volume`` object.
2923
+ """
2576
2924
  try:
2577
2925
  data = file_or_buffer.read()
2578
2926
  except AttributeError:
@@ -2601,6 +2949,90 @@ class Volume:
2601
2949
  data_array = array(np.frombuffer(grid_data, dtype=np.byte), device=device)
2602
2950
  return cls(data_array)
2603
2951
 
2952
+ @classmethod
2953
+ def load_from_numpy(
2954
+ cls, ndarray: np.array, min_world=(0.0, 0.0, 0.0), voxel_size=1.0, bg_value=0.0, device=None
2955
+ ) -> Volume:
2956
+ """Creates a Volume object from a dense 3D NumPy array.
2957
+
2958
+ This function is only supported for CUDA devices.
2959
+
2960
+ Args:
2961
+ min_world: The 3D coordinate of the lower corner of the volume.
2962
+ voxel_size: The size of each voxel in spatial coordinates.
2963
+ bg_value: Background value
2964
+ device: The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".
2965
+
2966
+ Returns:
2967
+
2968
+ A ``warp.Volume`` object.
2969
+ """
2970
+
2971
+ import math
2972
+
2973
+ target_shape = (
2974
+ math.ceil(ndarray.shape[0] / 8) * 8,
2975
+ math.ceil(ndarray.shape[1] / 8) * 8,
2976
+ math.ceil(ndarray.shape[2] / 8) * 8,
2977
+ )
2978
+ if hasattr(bg_value, "__len__"):
2979
+ # vec3, assuming the numpy array is 4D
2980
+ padded_array = np.empty((target_shape[0], target_shape[1], target_shape[2], 3), dtype=np.single)
2981
+ padded_array[:, :, :, :] = np.array(bg_value)
2982
+ padded_array[0 : ndarray.shape[0], 0 : ndarray.shape[1], 0 : ndarray.shape[2], :] = ndarray
2983
+ else:
2984
+ padded_amount = (
2985
+ math.ceil(ndarray.shape[0] / 8) * 8 - ndarray.shape[0],
2986
+ math.ceil(ndarray.shape[1] / 8) * 8 - ndarray.shape[1],
2987
+ math.ceil(ndarray.shape[2] / 8) * 8 - ndarray.shape[2],
2988
+ )
2989
+ padded_array = np.pad(
2990
+ ndarray,
2991
+ ((0, padded_amount[0]), (0, padded_amount[1]), (0, padded_amount[2])),
2992
+ mode="constant",
2993
+ constant_values=bg_value,
2994
+ )
2995
+
2996
+ shape = padded_array.shape
2997
+ volume = warp.Volume.allocate(
2998
+ min_world,
2999
+ [
3000
+ min_world[0] + (shape[0] - 1) * voxel_size,
3001
+ min_world[1] + (shape[1] - 1) * voxel_size,
3002
+ min_world[2] + (shape[2] - 1) * voxel_size,
3003
+ ],
3004
+ voxel_size,
3005
+ bg_value=bg_value,
3006
+ points_in_world_space=True,
3007
+ translation=min_world,
3008
+ device=device,
3009
+ )
3010
+
3011
+ # Populate volume
3012
+ if hasattr(bg_value, "__len__"):
3013
+ warp.launch(
3014
+ warp.utils.copy_dense_volume_to_nano_vdb_v,
3015
+ dim=(shape[0], shape[1], shape[2]),
3016
+ inputs=[volume.id, warp.array(padded_array, dtype=warp.vec3, device=device)],
3017
+ device=device,
3018
+ )
3019
+ elif isinstance(bg_value, int):
3020
+ warp.launch(
3021
+ warp.utils.copy_dense_volume_to_nano_vdb_i,
3022
+ dim=shape,
3023
+ inputs=[volume.id, warp.array(padded_array, dtype=warp.int32, device=device)],
3024
+ device=device,
3025
+ )
3026
+ else:
3027
+ warp.launch(
3028
+ warp.utils.copy_dense_volume_to_nano_vdb_f,
3029
+ dim=shape,
3030
+ inputs=[volume.id, warp.array(padded_array, dtype=warp.float32, device=device)],
3031
+ device=device,
3032
+ )
3033
+
3034
+ return volume
3035
+
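A minimal sketch of the new load_from_numpy entry point, which pads the dense grid up to multiples of the 8x8x8 tile size before copying it into the sparse volume (grid size, voxel size, and values are illustrative; a CUDA device is required):

    import numpy as np
    import warp as wp

    wp.init()

    # dense 32^3 float field covering the volume's bounding box
    dense = np.linspace(0.0, 1.0, 32 ** 3, dtype=np.float32).reshape(32, 32, 32)

    volume = wp.Volume.load_from_numpy(
        dense, min_world=(0.0, 0.0, 0.0), voxel_size=0.1, bg_value=0.0, device="cuda"
    )
    print(volume.get_voxel_size())  # (0.1, 0.1, 0.1)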
2604
3036
  @classmethod
2605
3037
  def allocate(
2606
3038
  cls,
@@ -2611,9 +3043,11 @@ class Volume:
2611
3043
  translation=(0.0, 0.0, 0.0),
2612
3044
  points_in_world_space=False,
2613
3045
  device=None,
2614
- ):
3046
+ ) -> Volume:
2615
3047
  """Allocate a new Volume based on the bounding box defined by min and max.
2616
3048
 
3049
+ This function is only supported for CUDA devices.
3050
+
2617
3051
  Allocate a volume that is large enough to contain voxels [min[0], min[1], min[2]] - [max[0], max[1], max[2]], inclusive.
2618
3052
  If points_in_world_space is true, then min and max are first converted to index space with the given voxel size and
2619
3053
  translation, and the volume is allocated with those.
@@ -2622,12 +3056,12 @@ class Volume:
2622
3056
  the resulting tiles will be available in the new volume.
2623
3057
 
2624
3058
  Args:
2625
- min (array-like): Lower 3D-coordinates of the bounding box in index space or world space, inclusive
2626
- max (array-like): Upper 3D-coordinates of the bounding box in index space or world space, inclusive
2627
- voxel_size (float): Voxel size of the new volume
3059
+ min (array-like): Lower 3D coordinates of the bounding box in index space or world space, inclusive.
3060
+ max (array-like): Upper 3D coordinates of the bounding box in index space or world space, inclusive.
3061
+ voxel_size (float): Voxel size of the new volume.
2628
3062
  bg_value (float or array-like): Value of unallocated voxels of the volume, also defines the volume's type, a :class:`warp.vec3` volume is created if this is `array-like`, otherwise a float volume is created
2629
- translation (array-like): translation between the index and world spaces
2630
- device (Devicelike): Device the array lives on
3063
+ translation (array-like): translation between the index and world spaces.
3064
+ device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".
2631
3065
 
2632
3066
  """
2633
3067
  if points_in_world_space:
@@ -2652,9 +3086,11 @@ class Volume:
2652
3086
  @classmethod
2653
3087
  def allocate_by_tiles(
2654
3088
  cls, tile_points: array, voxel_size: float, bg_value=0.0, translation=(0.0, 0.0, 0.0), device=None
2655
- ):
3089
+ ) -> Volume:
2656
3090
  """Allocate a new Volume with active tiles for each point tile_points.
2657
3091
 
3092
+ This function is only supported for CUDA devices.
3093
+
2658
3094
  The smallest unit of allocation is a dense tile of 8x8x8 voxels.
2659
3095
  This is the primary method for allocating sparse volumes. It uses an array of points indicating the tiles that must be allocated.
2660
3096
 
@@ -2664,13 +3100,13 @@ class Volume:
2664
3100
 
2665
3101
  Args:
2666
3102
  tile_points (:class:`warp.array`): Array of positions that define the tiles to be allocated.
2667
- The array can be a 2d, N-by-3 array of :class:`warp.int32` values, indicating index space positions,
3103
+ The array can be a 2D, N-by-3 array of :class:`warp.int32` values, indicating index space positions,
2668
3104
  or can be a 1D array of :class:`warp.vec3` values, indicating world space positions.
2669
3105
  Repeated points per tile are allowed and will be efficiently deduplicated.
2670
- voxel_size (float): Voxel size of the new volume
3106
+ voxel_size (float): Voxel size of the new volume.
2671
3107
  bg_value (float or array-like): Value of unallocated voxels of the volume, also defines the volume's type, a :class:`warp.vec3` volume is created if this is `array-like`, otherwise a float volume is created
2672
- translation (array-like): translation between the index and world spaces
2673
- device (Devicelike): Device the array lives on
3108
+ translation (array-like): Translation between the index and world spaces.
3109
+ device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".
2674
3110
 
2675
3111
  """
2676
3112
  from warp.context import runtime
@@ -2707,7 +3143,7 @@ class Volume:
2707
3143
  translation[2],
2708
3144
  in_world_space,
2709
3145
  )
2710
- elif type(bg_value) == int:
3146
+ elif isinstance(bg_value, int):
2711
3147
  volume.id = volume.context.core.volume_i_from_tiles_device(
2712
3148
  volume.device.context,
2713
3149
  ctypes.c_void_p(tile_points.ptr),
@@ -2738,6 +3174,67 @@ class Volume:
2738
3174
  return volume
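A minimal sketch of wp.Volume.allocate_by_tiles with world-space tile points (coordinates and sizes are illustrative; a CUDA device is required):

    import warp as wp

    wp.init()

    # one 8x8x8 tile is allocated per point; duplicate points are deduplicated
    tile_points = wp.array(
        [wp.vec3(0.0, 0.0, 0.0), wp.vec3(0.8, 0.0, 0.0)], dtype=wp.vec3, device="cuda"
    )
    volume = wp.Volume.allocate_by_tiles(
        tile_points, voxel_size=0.1, bg_value=0.0, translation=(0.0, 0.0, 0.0), device="cuda"
    )
    print(len(volume.get_tiles()))  # number of allocated tiles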
2739
3175
 
2740
3176
 
3177
+ # definition just for kernel type (cannot be a parameter), see mesh.h
3178
+ # NOTE: its layout must match the corresponding struct defined in C.
3179
+ # NOTE: it needs to be defined after `indexedarray` to work around a circular import issue.
3180
+ class mesh_query_point_t:
3181
+ """Output for the mesh query point functions.
3182
+
3183
+ Attributes:
3184
+ result (bool): Whether a point is found within the given constraints.
3185
+ sign (float32): A value < 0 if query point is inside the mesh, >=0 otherwise.
3186
+ Note that mesh must be watertight for this to be robust
3187
+ face (int32): Index of the closest face.
3188
+ u (float32): Barycentric u coordinate of the closest point.
3189
+ v (float32): Barycentric v coordinate of the closest point.
3190
+
3191
+ See Also:
3192
+ :func:`mesh_query_point`, :func:`mesh_query_point_no_sign`,
3193
+ :func:`mesh_query_furthest_point_no_sign`,
3194
+ :func:`mesh_query_point_sign_normal`,
3195
+ and :func:`mesh_query_point_sign_winding_number`.
3196
+ """
3197
+ from warp.codegen import Var
3198
+
3199
+ vars = {
3200
+ "result": Var("result", bool),
3201
+ "sign": Var("sign", float32),
3202
+ "face": Var("face", int32),
3203
+ "u": Var("u", float32),
3204
+ "v": Var("v", float32),
3205
+ }
3206
+
3207
+
3208
+ # definition just for kernel type (cannot be a parameter), see mesh.h
3209
+ # NOTE: its layout must match the corresponding struct defined in C.
3210
+ class mesh_query_ray_t:
3211
+ """Output for the mesh query ray functions.
3212
+
3213
+ Attributes:
3214
+ result (bool): Whether a hit is found within the given constraints.
3215
+ sign (float32): A value > 0 if the ray hit in front of the face, returns < 0 otherwise.
3216
+ face (int32): Index of the closest face.
3217
+ t (float32): Distance of the closest hit along the ray.
3218
+ u (float32): Barycentric u coordinate of the closest hit.
3219
+ v (float32): Barycentric v coordinate of the closest hit.
3220
+ normal (vec3f): Face normal.
3221
+
3222
+ See Also:
3223
+ :func:`mesh_query_ray`.
3224
+ """
3225
+ from warp.codegen import Var
3226
+
3227
+ vars = {
3228
+ "result": Var("result", bool),
3229
+ "sign": Var("sign", float32),
3230
+ "face": Var("face", int32),
3231
+ "t": Var("t", float32),
3232
+ "u": Var("u", float32),
3233
+ "v": Var("v", float32),
3234
+ "normal": Var("normal", vec3),
3235
+ }
3236
+
3237
+
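These structs are what the mesh query built-ins hand back inside kernels. A minimal sketch, assuming an existing wp.Mesh and the struct-returning form of wp.mesh_query_point referenced by these docstrings (the max_dist value is illustrative):

    import warp as wp

    @wp.kernel
    def closest_points(mesh_id: wp.uint64,
                       queries: wp.array(dtype=wp.vec3),
                       closest: wp.array(dtype=wp.vec3)):
        tid = wp.tid()
        q = wp.mesh_query_point(mesh_id, queries[tid], 1.0e6)  # returns a mesh_query_point_t
        if q.result:
            # q.face, q.u, q.v identify the closest point on the mesh surface
            closest[tid] = wp.mesh_eval_position(mesh_id, q.face, q.u, q.v)

    # wp.launch(closest_points, dim=len(queries), inputs=[mesh.id, queries, closest], device="cuda")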
2741
3238
  def matmul(
2742
3239
  a: array2d,
2743
3240
  b: array2d,
@@ -2745,7 +3242,7 @@ def matmul(
2745
3242
  d: array2d,
2746
3243
  alpha: float = 1.0,
2747
3244
  beta: float = 0.0,
2748
- allow_tf32x3_arith: bool = False,
3245
+ allow_tf32x3_arith: builtins.bool = False,
2749
3246
  device=None,
2750
3247
  ):
2751
3248
  """Computes a generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
@@ -2774,6 +3271,11 @@ def matmul(
2774
3271
  "wp.matmul currently only supports operation between {A, B, C, D} matrices of the same type."
2775
3272
  )
2776
3273
 
3274
+ if (not a.is_contiguous and not a.is_transposed) or (not b.is_contiguous and not b.is_transposed) or (not c.is_contiguous) or (not d.is_contiguous):
3275
+ raise RuntimeError(
3276
+ "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B may be transposed."
3277
+ )
3278
+
2777
3279
  m = a.shape[0]
2778
3280
  n = b.shape[1]
2779
3281
  k = a.shape[1]
@@ -2808,13 +3310,13 @@ def matmul(
2808
3310
  ctypes.c_void_p(d.ptr),
2809
3311
  alpha,
2810
3312
  beta,
2811
- True,
2812
- True,
3313
+ not a.is_transposed,
3314
+ not b.is_transposed,
2813
3315
  allow_tf32x3_arith,
2814
3316
  1,
2815
3317
  )
2816
3318
  if not ret:
2817
- raise RuntimeError("Matmul failed.")
3319
+ raise RuntimeError("matmul failed.")
2818
3320
 
2819
3321
 
2820
3322
  def adj_matmul(
@@ -2827,7 +3329,7 @@ def adj_matmul(
2827
3329
  adj_d: array2d,
2828
3330
  alpha: float = 1.0,
2829
3331
  beta: float = 0.0,
2830
- allow_tf32x3_arith: bool = False,
3332
+ allow_tf32x3_arith: builtins.bool = False,
2831
3333
  device=None,
2832
3334
  ):
2833
3335
  """Computes the adjoint of a generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
@@ -2878,6 +3380,19 @@ def adj_matmul(
2878
3380
  "wp.adj_matmul currently only supports operation between {A, B, C, adj_D, adj_A, adj_B, adj_C} matrices of the same type."
2879
3381
  )
2880
3382
 
3383
+ if (
3384
+ (not a.is_contiguous and not a.is_transposed)
3385
+ or (not b.is_contiguous and not b.is_transposed)
3386
+ or (not c.is_contiguous)
3387
+ or (not adj_a.is_contiguous and not adj_a.is_transposed)
3388
+ or (not adj_b.is_contiguous and not adj_b.is_transposed)
3389
+ or (not adj_c.is_contiguous)
3390
+ or (not adj_d.is_contiguous)
3391
+ ):
3392
+ raise RuntimeError(
3393
+ "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B and their associated adjoints may be transposed."
3394
+ )
3395
+
2881
3396
  m = a.shape[0]
2882
3397
  n = b.shape[1]
2883
3398
  k = a.shape[1]
@@ -2898,75 +3413,105 @@ def adj_matmul(
2898
3413
 
2899
3414
  # cpu fallback if no cuda devices found
2900
3415
  if device == "cpu":
2901
- adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose()))
2902
- adj_b.assign(alpha * (a.numpy().transpose() @ adj_d.numpy()))
2903
- adj_c.assign(beta * adj_d.numpy())
3416
+ adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose()) + adj_a.numpy())
3417
+ adj_b.assign(alpha * (a.numpy().transpose() @ adj_d.numpy()) + adj_b.numpy())
3418
+ adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
2904
3419
  return
2905
3420
 
2906
3421
  cc = device.arch
2907
3422
 
2908
3423
  # adj_a
2909
- ret = runtime.core.cutlass_gemm(
2910
- cc,
2911
- m,
2912
- k,
2913
- n,
2914
- type_typestr(a.dtype).encode(),
2915
- ctypes.c_void_p(adj_d.ptr),
2916
- ctypes.c_void_p(b.ptr),
2917
- ctypes.c_void_p(a.ptr),
2918
- ctypes.c_void_p(adj_a.ptr),
2919
- alpha,
2920
- 0.0,
2921
- True,
2922
- False,
2923
- allow_tf32x3_arith,
2924
- 1,
2925
- )
2926
- if not ret:
2927
- raise RuntimeError("adj_matmul failed.")
3424
+ if not a.is_transposed:
3425
+ ret = runtime.core.cutlass_gemm(
3426
+ cc,
3427
+ m,
3428
+ k,
3429
+ n,
3430
+ type_typestr(a.dtype).encode(),
3431
+ ctypes.c_void_p(adj_d.ptr),
3432
+ ctypes.c_void_p(b.ptr),
3433
+ ctypes.c_void_p(adj_a.ptr),
3434
+ ctypes.c_void_p(adj_a.ptr),
3435
+ alpha,
3436
+ 1.0,
3437
+ True,
3438
+ b.is_transposed,
3439
+ allow_tf32x3_arith,
3440
+ 1,
3441
+ )
3442
+ if not ret:
3443
+ raise RuntimeError("adj_matmul failed.")
3444
+ else:
3445
+ ret = runtime.core.cutlass_gemm(
3446
+ cc,
3447
+ k,
3448
+ m,
3449
+ n,
3450
+ type_typestr(a.dtype).encode(),
3451
+ ctypes.c_void_p(b.ptr),
3452
+ ctypes.c_void_p(adj_d.ptr),
3453
+ ctypes.c_void_p(adj_a.ptr),
3454
+ ctypes.c_void_p(adj_a.ptr),
3455
+ alpha,
3456
+ 1.0,
3457
+ not b.is_transposed,
3458
+ False,
3459
+ allow_tf32x3_arith,
3460
+ 1,
3461
+ )
3462
+ if not ret:
3463
+ raise RuntimeError("adj_matmul failed.")
2928
3464
 
2929
3465
  # adj_b
2930
- ret = runtime.core.cutlass_gemm(
2931
- cc,
2932
- k,
2933
- n,
2934
- m,
2935
- type_typestr(a.dtype).encode(),
2936
- ctypes.c_void_p(a.ptr),
2937
- ctypes.c_void_p(adj_d.ptr),
2938
- ctypes.c_void_p(b.ptr),
2939
- ctypes.c_void_p(adj_b.ptr),
2940
- alpha,
2941
- 0.0,
2942
- False,
2943
- True,
2944
- allow_tf32x3_arith,
2945
- 1,
2946
- )
2947
- if not ret:
2948
- raise RuntimeError("adj_matmul failed.")
3466
+ if not b.is_transposed:
3467
+ ret = runtime.core.cutlass_gemm(
3468
+ cc,
3469
+ k,
3470
+ n,
3471
+ m,
3472
+ type_typestr(a.dtype).encode(),
3473
+ ctypes.c_void_p(a.ptr),
3474
+ ctypes.c_void_p(adj_d.ptr),
3475
+ ctypes.c_void_p(adj_b.ptr),
3476
+ ctypes.c_void_p(adj_b.ptr),
3477
+ alpha,
3478
+ 1.0,
3479
+ a.is_transposed,
3480
+ True,
3481
+ allow_tf32x3_arith,
3482
+ 1,
3483
+ )
3484
+ if not ret:
3485
+ raise RuntimeError("adj_matmul failed.")
3486
+ else:
3487
+ ret = runtime.core.cutlass_gemm(
3488
+ cc,
3489
+ n,
3490
+ k,
3491
+ m,
3492
+ type_typestr(a.dtype).encode(),
3493
+ ctypes.c_void_p(adj_d.ptr),
3494
+ ctypes.c_void_p(a.ptr),
3495
+ ctypes.c_void_p(adj_b.ptr),
3496
+ ctypes.c_void_p(adj_b.ptr),
3497
+ alpha,
3498
+ 1.0,
3499
+ False,
3500
+ not a.is_transposed,
3501
+ allow_tf32x3_arith,
3502
+ 1,
3503
+ )
3504
+ if not ret:
3505
+ raise RuntimeError("adj_matmul failed.")
2949
3506
 
2950
3507
  # adj_c
2951
- ret = runtime.core.cutlass_gemm(
2952
- cc,
2953
- m,
2954
- n,
2955
- k,
2956
- type_typestr(a.dtype).encode(),
2957
- ctypes.c_void_p(a.ptr),
2958
- ctypes.c_void_p(b.ptr),
2959
- ctypes.c_void_p(adj_d.ptr),
2960
- ctypes.c_void_p(adj_c.ptr),
2961
- 0.0,
2962
- beta,
2963
- True,
2964
- True,
2965
- allow_tf32x3_arith,
2966
- 1,
3508
+ warp.launch(
3509
+ kernel=warp.utils.add_kernel_2d,
3510
+ dim=adj_c.shape,
3511
+ inputs=[adj_c, adj_d, adj_d.dtype(beta)],
3512
+ device=device,
3513
+ record_tape=False
2967
3514
  )
2968
- if not ret:
2969
- raise RuntimeError("adj_matmul failed.")
2970
3515
 
2971
3516
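The CPU fallback now accumulates into the existing adjoints instead of overwriting them, matching the CUTLASS path, which runs with beta = 1.0 on the adjoint outputs. A minimal sketch of how these adjoints are typically produced through the tape (shapes and the seed gradient are illustrative):

    import numpy as np
    import warp as wp

    wp.init()

    m, k, n = 8, 4, 2
    a = wp.array(np.random.rand(m, k), dtype=wp.float32, requires_grad=True)
    b = wp.array(np.random.rand(k, n), dtype=wp.float32, requires_grad=True)
    c = wp.zeros((m, n), dtype=wp.float32, requires_grad=True)
    d = wp.zeros((m, n), dtype=wp.float32, requires_grad=True)

    tape = wp.Tape()
    with tape:
        wp.matmul(a, b, c, d, alpha=1.0, beta=1.0)  # recorded via runtime.tape.record_func

    # seed the output adjoint; adj_matmul accumulates into a.grad, b.grad, c.grad
    tape.backward(grads={d: wp.array(np.ones((m, n)), dtype=wp.float32)})
    print(a.grad.numpy().shape)  # (8, 4)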
 
2972
3517
  def batched_matmul(
@@ -2976,7 +3521,7 @@ def batched_matmul(
2976
3521
  d: array3d,
2977
3522
  alpha: float = 1.0,
2978
3523
  beta: float = 0.0,
2979
- allow_tf32x3_arith: bool = False,
3524
+ allow_tf32x3_arith: builtins.bool = False,
2980
3525
  device=None,
2981
3526
  ):
2982
3527
  """Computes a batched generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
@@ -3005,6 +3550,11 @@ def batched_matmul(
3005
3550
  "wp.batched_matmul currently only supports operation between {A, B, C, D} matrices of the same type."
3006
3551
  )
3007
3552
 
3553
+ if (not a.is_contiguous and not a.is_transposed) or (not b.is_contiguous and not b.is_transposed) or (not c.is_contiguous) or (not d.is_contiguous):
3554
+ raise RuntimeError(
3555
+ "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B may be transposed."
3556
+ )
3557
+
3008
3558
  m = a.shape[1]
3009
3559
  n = b.shape[2]
3010
3560
  k = a.shape[2]
@@ -3016,7 +3566,7 @@ def batched_matmul(
3016
3566
 
3017
3567
  if runtime.tape:
3018
3568
  runtime.tape.record_func(
3019
- backward=lambda: adj_matmul(
3569
+ backward=lambda: adj_batched_matmul(
3020
3570
  a, b, c, a.grad, b.grad, c.grad, d.grad, alpha, beta, allow_tf32x3_arith, device
3021
3571
  ),
3022
3572
  arrays=[a, b, c, d],
@@ -3027,26 +3577,55 @@ def batched_matmul(
3027
3577
  d.assign(alpha * np.matmul(a.numpy(), b.numpy()) + beta * c.numpy())
3028
3578
  return
3029
3579
 
3580
+ # handle case in which batch_count exceeds max_batch_count, which is a CUDA array size maximum
3581
+ max_batch_count = 65535
3582
+ iters = int(batch_count / max_batch_count)
3583
+ remainder = batch_count % max_batch_count
3584
+
3030
3585
  cc = device.arch
3586
+ for i in range(iters):
3587
+ idx_start = i * max_batch_count
3588
+ idx_end = (i + 1) * max_batch_count if i < iters - 1 else batch_count
3589
+ ret = runtime.core.cutlass_gemm(
3590
+ cc,
3591
+ m,
3592
+ n,
3593
+ k,
3594
+ type_typestr(a.dtype).encode(),
3595
+ ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
3596
+ ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
3597
+ ctypes.c_void_p(c[idx_start:idx_end,:,:].ptr),
3598
+ ctypes.c_void_p(d[idx_start:idx_end,:,:].ptr),
3599
+ alpha,
3600
+ beta,
3601
+ not a.is_transposed,
3602
+ not b.is_transposed,
3603
+ allow_tf32x3_arith,
3604
+ max_batch_count,
3605
+ )
3606
+ if not ret:
3607
+ raise RuntimeError("Batched matmul failed.")
3608
+
3609
+ idx_start = iters * max_batch_count
3031
3610
  ret = runtime.core.cutlass_gemm(
3032
3611
  cc,
3033
3612
  m,
3034
3613
  n,
3035
3614
  k,
3036
3615
  type_typestr(a.dtype).encode(),
3037
- ctypes.c_void_p(a.ptr),
3038
- ctypes.c_void_p(b.ptr),
3039
- ctypes.c_void_p(c.ptr),
3040
- ctypes.c_void_p(d.ptr),
3616
+ ctypes.c_void_p(a[idx_start:,:,:].ptr),
3617
+ ctypes.c_void_p(b[idx_start:,:,:].ptr),
3618
+ ctypes.c_void_p(c[idx_start:,:,:].ptr),
3619
+ ctypes.c_void_p(d[idx_start:,:,:].ptr),
3041
3620
  alpha,
3042
3621
  beta,
3043
- True,
3044
- True,
3622
+ not a.is_transposed,
3623
+ not b.is_transposed,
3045
3624
  allow_tf32x3_arith,
3046
- batch_count,
3625
+ remainder,
3047
3626
  )
3048
3627
  if not ret:
3049
- raise RuntimeError("Batched matmul failed.")
3628
+ raise RuntimeError("Batched matmul failed.")
3050
3629
 
3051
3630
 
3052
3631
  def adj_batched_matmul(
@@ -3059,7 +3638,7 @@ def adj_batched_matmul(
3059
3638
  adj_d: array3d,
3060
3639
  alpha: float = 1.0,
3061
3640
  beta: float = 0.0,
3062
- allow_tf32x3_arith: bool = False,
3641
+ allow_tf32x3_arith: builtins.bool = False,
3063
3642
  device=None,
3064
3643
  ):
3065
3644
  """Computes a batched generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
@@ -3126,78 +3705,215 @@ def adj_batched_matmul(
3126
3705
  )
3127
3706
  )
3128
3707
 
3708
+ if (
3709
+ (not a.is_contiguous and not a.is_transposed)
3710
+ or (not b.is_contiguous and not b.is_transposed)
3711
+ or (not c.is_contiguous)
3712
+ or (not adj_a.is_contiguous and not adj_a.is_transposed)
3713
+ or (not adj_b.is_contiguous and not adj_b.is_transposed)
3714
+ or (not adj_c.is_contiguous)
3715
+ or (not adj_d.is_contiguous)
3716
+ ):
3717
+ raise RuntimeError(
3718
+ "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B and their associated adjoints may be transposed."
3719
+ )
3720
+
3129
3721
  # cpu fallback if no cuda devices found
3130
3722
  if device == "cpu":
3131
- adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose((0, 2, 1))))
3132
- adj_b.assign(alpha * np.matmul(a.numpy().transpose((0, 2, 1)), adj_d.numpy()))
3133
- adj_c.assign(beta * adj_d.numpy())
3723
+ adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose((0, 2, 1))) + adj_a.numpy())
3724
+ adj_b.assign(alpha * np.matmul(a.numpy().transpose((0, 2, 1)), adj_d.numpy()) + adj_b.numpy())
3725
+ adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
3134
3726
  return
3135
3727
 
3728
+ # handle case in which batch_count exceeds max_batch_count, which is a CUDA array size maximum
3729
+ max_batch_count = 65535
3730
+ iters = int(batch_count / max_batch_count)
3731
+ remainder = batch_count % max_batch_count
3732
+
3136
3733
  cc = device.arch
3137
3734
 
3735
+ for i in range(iters):
3736
+ idx_start = i * max_batch_count
3737
+ idx_end = (i + 1) * max_batch_count if i < iters - 1 else batch_count
3738
+
3739
+ # adj_a
3740
+ if not a.is_transposed:
3741
+ ret = runtime.core.cutlass_gemm(
3742
+ cc,
3743
+ m,
3744
+ k,
3745
+ n,
3746
+ type_typestr(a.dtype).encode(),
3747
+ ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
3748
+ ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
3749
+ ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
3750
+ ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
3751
+ alpha,
3752
+ 1.0,
3753
+ True,
3754
+ b.is_transposed,
3755
+ allow_tf32x3_arith,
3756
+ max_batch_count,
3757
+ )
3758
+ if not ret:
3759
+ raise RuntimeError("adj_matmul failed.")
3760
+ else:
3761
+ ret = runtime.core.cutlass_gemm(
3762
+ cc,
3763
+ k,
3764
+ m,
3765
+ n,
3766
+ type_typestr(a.dtype).encode(),
3767
+ ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
3768
+ ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
3769
+ ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
3770
+ ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
3771
+ alpha,
3772
+ 1.0,
3773
+ not b.is_transposed,
3774
+ False,
3775
+ allow_tf32x3_arith,
3776
+ max_batch_count,
3777
+ )
3778
+ if not ret:
3779
+ raise RuntimeError("adj_matmul failed.")
3780
+
3781
+ # adj_b
3782
+ if not b.is_transposed:
3783
+ ret = runtime.core.cutlass_gemm(
3784
+ cc,
3785
+ k,
3786
+ n,
3787
+ m,
3788
+ type_typestr(a.dtype).encode(),
3789
+ ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
3790
+ ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
3791
+ ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
3792
+ ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
3793
+ alpha,
3794
+ 1.0,
3795
+ a.is_transposed,
3796
+ True,
3797
+ allow_tf32x3_arith,
3798
+ max_batch_count,
3799
+ )
3800
+ if not ret:
3801
+ raise RuntimeError("adj_matmul failed.")
3802
+ else:
3803
+ ret = runtime.core.cutlass_gemm(
3804
+ cc,
3805
+ n,
3806
+ k,
3807
+ m,
3808
+ type_typestr(a.dtype).encode(),
3809
+ ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
3810
+ ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
3811
+ ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
3812
+ ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
3813
+ alpha,
3814
+ 1.0,
3815
+ False,
3816
+ not a.is_transposed,
3817
+ allow_tf32x3_arith,
3818
+ max_batch_count,
3819
+ )
3820
+ if not ret:
3821
+ raise RuntimeError("adj_matmul failed.")
3822
+
3823
+ idx_start = iters * max_batch_count
3824
+
3138
3825
  # adj_a
3139
- ret = runtime.core.cutlass_gemm(
3140
- cc,
3141
- m,
3142
- k,
3143
- n,
3144
- type_typestr(a.dtype).encode(),
3145
- ctypes.c_void_p(adj_d.ptr),
3146
- ctypes.c_void_p(b.ptr),
3147
- ctypes.c_void_p(a.ptr),
3148
- ctypes.c_void_p(adj_a.ptr),
3149
- alpha,
3150
- 0.0,
3151
- True,
3152
- False,
3153
- allow_tf32x3_arith,
3154
- batch_count,
3155
- )
3156
- if not ret:
3157
- raise RuntimeError("adj_matmul failed.")
3826
+ if not a.is_transposed:
3827
+ ret = runtime.core.cutlass_gemm(
3828
+ cc,
3829
+ m,
3830
+ k,
3831
+ n,
3832
+ type_typestr(a.dtype).encode(),
3833
+ ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
3834
+ ctypes.c_void_p(b[idx_start:,:,:].ptr),
3835
+ ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
3836
+ ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
3837
+ alpha,
3838
+ 1.0,
3839
+ True,
3840
+ b.is_transposed,
3841
+ allow_tf32x3_arith,
3842
+ remainder,
3843
+ )
3844
+ if not ret:
3845
+ raise RuntimeError("adj_matmul failed.")
3846
+ else:
3847
+ ret = runtime.core.cutlass_gemm(
3848
+ cc,
3849
+ k,
3850
+ m,
3851
+ n,
3852
+ type_typestr(a.dtype).encode(),
3853
+ ctypes.c_void_p(b[idx_start:,:,:].ptr),
3854
+ ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
3855
+ ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
3856
+ ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
3857
+ alpha,
3858
+ 1.0,
3859
+ not b.is_transposed,
3860
+ False,
3861
+ allow_tf32x3_arith,
3862
+ remainder,
3863
+ )
3864
+ if not ret:
3865
+ raise RuntimeError("adj_matmul failed.")
3158
3866
 
3159
3867
  # adj_b
3160
- ret = runtime.core.cutlass_gemm(
3161
- cc,
3162
- k,
3163
- n,
3164
- m,
3165
- type_typestr(a.dtype).encode(),
3166
- ctypes.c_void_p(a.ptr),
3167
- ctypes.c_void_p(adj_d.ptr),
3168
- ctypes.c_void_p(b.ptr),
3169
- ctypes.c_void_p(adj_b.ptr),
3170
- alpha,
3171
- 0.0,
3172
- False,
3173
- True,
3174
- allow_tf32x3_arith,
3175
- batch_count,
3176
- )
3177
- if not ret:
3178
- raise RuntimeError("adj_matmul failed.")
3868
+ if not b.is_transposed:
3869
+ ret = runtime.core.cutlass_gemm(
3870
+ cc,
3871
+ k,
3872
+ n,
3873
+ m,
3874
+ type_typestr(a.dtype).encode(),
3875
+ ctypes.c_void_p(a[idx_start:,:,:].ptr),
3876
+ ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
3877
+ ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
3878
+ ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
3879
+ alpha,
3880
+ 1.0,
3881
+ a.is_transposed,
3882
+ True,
3883
+ allow_tf32x3_arith,
3884
+ remainder,
3885
+ )
3886
+ if not ret:
3887
+ raise RuntimeError("adj_matmul failed.")
3888
+ else:
3889
+ ret = runtime.core.cutlass_gemm(
3890
+ cc,
3891
+ n,
3892
+ k,
3893
+ m,
3894
+ type_typestr(a.dtype).encode(),
3895
+ ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
3896
+ ctypes.c_void_p(a[idx_start:,:,:].ptr),
3897
+ ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
3898
+ ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
3899
+ alpha,
3900
+ 1.0,
3901
+ False,
3902
+ not a.is_transposed,
3903
+ allow_tf32x3_arith,
3904
+ remainder,
3905
+ )
3906
+ if not ret:
3907
+ raise RuntimeError("adj_matmul failed.")
3179
3908
 
3180
3909
  # adj_c
3181
- ret = runtime.core.cutlass_gemm(
3182
- cc,
3183
- m,
3184
- n,
3185
- k,
3186
- type_typestr(a.dtype).encode(),
3187
- ctypes.c_void_p(a.ptr),
3188
- ctypes.c_void_p(b.ptr),
3189
- ctypes.c_void_p(adj_d.ptr),
3190
- ctypes.c_void_p(adj_c.ptr),
3191
- 0.0,
3192
- beta,
3193
- True,
3194
- True,
3195
- allow_tf32x3_arith,
3196
- batch_count,
3910
+ warp.launch(
3911
+ kernel=warp.utils.add_kernel_3d,
3912
+ dim=adj_c.shape,
3913
+ inputs=[adj_c, adj_d, adj_d.dtype(beta)],
3914
+ device=device,
3915
+ record_tape=False
3197
3916
  )
3198
- if not ret:
3199
- raise RuntimeError("adj_matmul failed.")
3200
-
3201
3917
 
3202
3918
  class HashGrid:
3203
3919
  def __init__(self, dim_x, dim_y, dim_z, device=None):
@@ -3266,7 +3982,7 @@ class HashGrid:
3266
3982
  with self.device.context_guard:
3267
3983
  runtime.core.hash_grid_destroy_device(self.id)
3268
3984
 
3269
- except:
3985
+ except Exception:
3270
3986
  pass
3271
3987
 
3272
3988
 
@@ -3340,7 +4056,7 @@ class MarchingCubes:
3340
4056
 
3341
4057
  if error:
3342
4058
  raise RuntimeError(
3343
- "Error occured buffers may not be large enough, marching cubes required at least {num_verts} vertices, and {num_tris} triangles."
4059
+ "Buffers may not be large enough, marching cubes required at least {num_verts} vertices, and {num_tris} triangles."
3344
4060
  )
3345
4061
 
3346
4062
  # resize the geometry arrays
@@ -3396,7 +4112,7 @@ def type_matches_template(arg_type, template_type):
3396
4112
  return True
3397
4113
  elif is_array(template_type):
3398
4114
  # ensure the argument type is a non-generic array with matching dtype and dimensionality
3399
- if type(arg_type) != type(template_type):
4115
+ if type(arg_type) is not type(template_type):
3400
4116
  return False
3401
4117
  if not type_matches_template(arg_type.dtype, template_type.dtype):
3402
4118
  return False
@@ -3429,7 +4145,7 @@ def infer_argument_types(args, template_types, arg_names=None):
3429
4145
  """Resolve argument types with the given list of template types."""
3430
4146
 
3431
4147
  if len(args) != len(template_types):
3432
- raise RuntimeError(f"Number of arguments must match number of template types.")
4148
+ raise RuntimeError("Number of arguments must match number of template types.")
3433
4149
 
3434
4150
  arg_types = []
3435
4151
 
@@ -3452,7 +4168,7 @@ def infer_argument_types(args, template_types, arg_names=None):
3452
4168
  arg_types.append(arg._cls)
3453
4169
  # elif arg_type in [warp.types.launch_bounds_t, warp.types.shape_t, warp.types.range_t]:
3454
4170
  # arg_types.append(arg_type)
3455
- # elif arg_type in [warp.hash_grid_query_t, warp.mesh_query_aabb_t, warp.bvh_query_t]:
4171
+ # elif arg_type in [warp.hash_grid_query_t, warp.mesh_query_aabb_t, warp.mesh_query_point_t, warp.mesh_query_ray_t, warp.bvh_query_t]:
3456
4172
  # arg_types.append(arg_type)
3457
4173
  elif arg is None:
3458
4174
  # allow passing None for arrays
@@ -3471,6 +4187,7 @@ def infer_argument_types(args, template_types, arg_names=None):
3471
4187
  simple_type_codes = {
3472
4188
  int: "i4",
3473
4189
  float: "f4",
4190
+ builtins.bool: "b",
3474
4191
  bool: "b",
3475
4192
  str: "str", # accepted by print()
3476
4193
  int8: "i1",
@@ -3489,6 +4206,8 @@ simple_type_codes = {
3489
4206
  launch_bounds_t: "lb",
3490
4207
  hash_grid_query_t: "hgq",
3491
4208
  mesh_query_aabb_t: "mqa",
4209
+ mesh_query_point_t: "mqp",
4210
+ mesh_query_ray_t: "mqr",
3492
4211
  bvh_query_t: "bvhq",
3493
4212
  }
3494
4213
 
@@ -3505,14 +4224,14 @@ def get_type_code(arg_type):
3505
4224
  # check for "special" vector/matrix subtypes
3506
4225
  if hasattr(arg_type, "_wp_generic_type_str_"):
3507
4226
  type_str = arg_type._wp_generic_type_str_
3508
- if type_str == "quaternion":
4227
+ if type_str == "quat_t":
3509
4228
  return f"q{dtype_code}"
3510
4229
  elif type_str == "transform_t":
3511
4230
  return f"t{dtype_code}"
3512
- elif type_str == "spatial_vector_t":
3513
- return f"sv{dtype_code}"
3514
- elif type_str == "spatial_matrix_t":
3515
- return f"sm{dtype_code}"
4231
+ # elif type_str == "spatial_vector_t":
4232
+ # return f"sv{dtype_code}"
4233
+ # elif type_str == "spatial_matrix_t":
4234
+ # return f"sm{dtype_code}"
3516
4235
  # generic vector/matrix
3517
4236
  ndim = len(arg_type._shape_)
3518
4237
  if ndim == 1:
@@ -3535,6 +4254,10 @@ def get_type_code(arg_type):
3535
4254
  return f"a{arg_type.ndim}{get_type_code(arg_type.dtype)}"
3536
4255
  elif isinstance(arg_type, indexedarray):
3537
4256
  return f"ia{arg_type.ndim}{get_type_code(arg_type.dtype)}"
4257
+ elif isinstance(arg_type, fabricarray):
4258
+ return f"fa{arg_type.ndim}{get_type_code(arg_type.dtype)}"
4259
+ elif isinstance(arg_type, indexedfabricarray):
4260
+ return f"ifa{arg_type.ndim}{get_type_code(arg_type.dtype)}"
3538
4261
  elif isinstance(arg_type, warp.codegen.Struct):
3539
4262
  return warp.codegen.make_full_qualified_name(arg_type.cls)
3540
4263
  elif arg_type == Scalar: