PyPI - warp-lang - Versions diffs - 0.9.0__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl - Mend

warp-lang 0.9.0__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (315) hide show

warp/__init__.py +15 -7
warp/__init__.pyi +1 -0
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +22 -443
warp/build_dll.py +384 -0
warp/builtins.py +998 -488
warp/codegen.py +1307 -739
warp/config.py +5 -3
warp/constants.py +6 -0
warp/context.py +1291 -548
warp/dlpack.py +31 -31
warp/fabric.py +326 -0
warp/fem/__init__.py +27 -0
warp/fem/cache.py +389 -0
warp/fem/dirichlet.py +181 -0
warp/fem/domain.py +263 -0
warp/fem/field/__init__.py +101 -0
warp/fem/field/field.py +149 -0
warp/fem/field/nodal_field.py +299 -0
warp/fem/field/restriction.py +21 -0
warp/fem/field/test.py +181 -0
warp/fem/field/trial.py +183 -0
warp/fem/geometry/__init__.py +19 -0
warp/fem/geometry/closest_point.py +70 -0
warp/fem/geometry/deformed_geometry.py +271 -0
warp/fem/geometry/element.py +744 -0
warp/fem/geometry/geometry.py +186 -0
warp/fem/geometry/grid_2d.py +373 -0
warp/fem/geometry/grid_3d.py +435 -0
warp/fem/geometry/hexmesh.py +953 -0
warp/fem/geometry/partition.py +376 -0
warp/fem/geometry/quadmesh_2d.py +532 -0
warp/fem/geometry/tetmesh.py +840 -0
warp/fem/geometry/trimesh_2d.py +577 -0
warp/fem/integrate.py +1616 -0
warp/fem/operator.py +191 -0
warp/fem/polynomial.py +213 -0
warp/fem/quadrature/__init__.py +2 -0
warp/fem/quadrature/pic_quadrature.py +245 -0
warp/fem/quadrature/quadrature.py +294 -0
warp/fem/space/__init__.py +292 -0
warp/fem/space/basis_space.py +489 -0
warp/fem/space/collocated_function_space.py +105 -0
warp/fem/space/dof_mapper.py +236 -0
warp/fem/space/function_space.py +145 -0
warp/fem/space/grid_2d_function_space.py +267 -0
warp/fem/space/grid_3d_function_space.py +306 -0
warp/fem/space/hexmesh_function_space.py +352 -0
warp/fem/space/partition.py +350 -0
warp/fem/space/quadmesh_2d_function_space.py +369 -0
warp/fem/space/restriction.py +160 -0
warp/fem/space/shape/__init__.py +15 -0
warp/fem/space/shape/cube_shape_function.py +738 -0
warp/fem/space/shape/shape_function.py +103 -0
warp/fem/space/shape/square_shape_function.py +611 -0
warp/fem/space/shape/tet_shape_function.py +567 -0
warp/fem/space/shape/triangle_shape_function.py +429 -0
warp/fem/space/tetmesh_function_space.py +292 -0
warp/fem/space/topology.py +295 -0
warp/fem/space/trimesh_2d_function_space.py +221 -0
warp/fem/types.py +77 -0
warp/fem/utils.py +495 -0
warp/native/array.h +164 -55
warp/native/builtin.h +150 -174
warp/native/bvh.cpp +75 -328
warp/native/bvh.cu +406 -23
warp/native/bvh.h +37 -45
warp/native/clang/clang.cpp +136 -24
warp/native/crt.cpp +1 -76
warp/native/crt.h +111 -104
warp/native/cuda_crt.h +1049 -0
warp/native/cuda_util.cpp +15 -3
warp/native/cuda_util.h +3 -1
warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
warp/native/cutlass/tools/library/scripts/library.py +799 -0
warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
warp/native/cutlass/tools/library/scripts/rt.py +796 -0
warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
warp/native/cutlass_gemm.cu +5 -3
warp/native/exports.h +1240 -949
warp/native/fabric.h +228 -0
warp/native/hashgrid.cpp +4 -4
warp/native/hashgrid.h +22 -2
warp/native/initializer_array.h +2 -2
warp/native/intersect.h +22 -7
warp/native/intersect_adj.h +8 -8
warp/native/intersect_tri.h +13 -16
warp/native/marching.cu +157 -161
warp/native/mat.h +119 -19
warp/native/matnn.h +2 -2
warp/native/mesh.cpp +108 -83
warp/native/mesh.cu +243 -6
warp/native/mesh.h +1547 -458
warp/native/nanovdb/NanoVDB.h +1 -1
warp/native/noise.h +272 -329
warp/native/quat.h +51 -8
warp/native/rand.h +45 -35
warp/native/range.h +6 -2
warp/native/reduce.cpp +157 -0
warp/native/reduce.cu +348 -0
warp/native/runlength_encode.cpp +62 -0
warp/native/runlength_encode.cu +46 -0
warp/native/scan.cu +11 -13
warp/native/scan.h +1 -0
warp/native/solid_angle.h +442 -0
warp/native/sort.cpp +13 -0
warp/native/sort.cu +9 -1
warp/native/sparse.cpp +338 -0
warp/native/sparse.cu +545 -0
warp/native/spatial.h +2 -2
warp/native/temp_buffer.h +30 -0
warp/native/vec.h +126 -24
warp/native/volume.h +120 -0
warp/native/warp.cpp +658 -53
warp/native/warp.cu +660 -68
warp/native/warp.h +112 -12
warp/optim/__init__.py +1 -0
warp/optim/linear.py +922 -0
warp/optim/sgd.py +92 -0
warp/render/render_opengl.py +392 -152
warp/render/render_usd.py +11 -11
warp/sim/__init__.py +2 -2
warp/sim/articulation.py +385 -185
warp/sim/collide.py +21 -8
warp/sim/import_mjcf.py +297 -106
warp/sim/import_urdf.py +389 -210
warp/sim/import_usd.py +198 -97
warp/sim/inertia.py +17 -18
warp/sim/integrator_euler.py +14 -8
warp/sim/integrator_xpbd.py +161 -19
warp/sim/model.py +795 -291
warp/sim/optimizer.py +2 -6
warp/sim/render.py +65 -3
warp/sim/utils.py +3 -0
warp/sparse.py +1227 -0
warp/stubs.py +665 -223
warp/tape.py +66 -15
warp/tests/__main__.py +3 -6
warp/tests/assets/curlnoise_golden.npy +0 -0
warp/tests/assets/pnoise_golden.npy +0 -0
warp/tests/assets/torus.usda +105 -105
warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
warp/tests/aux_test_unresolved_func.py +14 -0
warp/tests/aux_test_unresolved_symbol.py +14 -0
warp/tests/disabled_kinematics.py +239 -0
warp/tests/run_coverage_serial.py +31 -0
warp/tests/test_adam.py +103 -106
warp/tests/test_arithmetic.py +128 -74
warp/tests/test_array.py +1497 -211
warp/tests/test_array_reduce.py +150 -0
warp/tests/test_atomic.py +64 -28
warp/tests/test_bool.py +99 -0
warp/tests/test_builtins_resolution.py +1292 -0
warp/tests/test_bvh.py +75 -43
warp/tests/test_closest_point_edge_edge.py +54 -57
warp/tests/test_codegen.py +233 -128
warp/tests/test_compile_consts.py +28 -20
warp/tests/test_conditional.py +108 -24
warp/tests/test_copy.py +10 -12
warp/tests/test_ctypes.py +112 -88
warp/tests/test_dense.py +21 -14
warp/tests/test_devices.py +98 -0
warp/tests/test_dlpack.py +136 -108
warp/tests/test_examples.py +277 -0
warp/tests/test_fabricarray.py +955 -0
warp/tests/test_fast_math.py +15 -11
warp/tests/test_fem.py +1271 -0
warp/tests/test_fp16.py +53 -19
warp/tests/test_func.py +187 -74
warp/tests/test_generics.py +194 -49
warp/tests/test_grad.py +180 -116
warp/tests/test_grad_customs.py +176 -0
warp/tests/test_hash_grid.py +52 -37
warp/tests/test_import.py +10 -23
warp/tests/test_indexedarray.py +577 -24
warp/tests/test_intersect.py +18 -9
warp/tests/test_large.py +141 -0
warp/tests/test_launch.py +251 -15
warp/tests/test_lerp.py +64 -65
warp/tests/test_linear_solvers.py +154 -0
warp/tests/test_lvalue.py +493 -0
warp/tests/test_marching_cubes.py +12 -13
warp/tests/test_mat.py +508 -2778
warp/tests/test_mat_lite.py +115 -0
warp/tests/test_mat_scalar_ops.py +2889 -0
warp/tests/test_math.py +103 -9
warp/tests/test_matmul.py +305 -69
warp/tests/test_matmul_lite.py +410 -0
warp/tests/test_mesh.py +71 -14
warp/tests/test_mesh_query_aabb.py +41 -25
warp/tests/test_mesh_query_point.py +325 -34
warp/tests/test_mesh_query_ray.py +39 -22
warp/tests/test_mlp.py +30 -22
warp/tests/test_model.py +92 -89
warp/tests/test_modules_lite.py +39 -0
warp/tests/test_multigpu.py +88 -114
warp/tests/test_noise.py +12 -11
warp/tests/test_operators.py +16 -20
warp/tests/test_options.py +11 -11
warp/tests/test_pinned.py +17 -18
warp/tests/test_print.py +32 -11
warp/tests/test_quat.py +275 -129
warp/tests/test_rand.py +18 -16
warp/tests/test_reload.py +38 -34
warp/tests/test_rounding.py +50 -43
warp/tests/test_runlength_encode.py +190 -0
warp/tests/test_smoothstep.py +9 -11
warp/tests/test_snippet.py +143 -0
warp/tests/test_sparse.py +460 -0
warp/tests/test_spatial.py +276 -243
warp/tests/test_streams.py +110 -85
warp/tests/test_struct.py +331 -85
warp/tests/test_tape.py +39 -21
warp/tests/test_torch.py +118 -89
warp/tests/test_transient_module.py +12 -13
warp/tests/test_types.py +614 -0
warp/tests/test_utils.py +494 -0
warp/tests/test_vec.py +354 -1987
warp/tests/test_vec_lite.py +73 -0
warp/tests/test_vec_scalar_ops.py +2099 -0
warp/tests/test_volume.py +457 -293
warp/tests/test_volume_write.py +124 -134
warp/tests/unittest_serial.py +35 -0
warp/tests/unittest_suites.py +341 -0
warp/tests/unittest_utils.py +568 -0
warp/tests/unused_test_misc.py +71 -0
warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
warp/thirdparty/appdirs.py +36 -45
warp/thirdparty/unittest_parallel.py +549 -0
warp/torch.py +72 -30
warp/types.py +1744 -713
warp/utils.py +360 -350
warp_lang-0.11.0.dist-info/LICENSE.md +36 -0
warp_lang-0.11.0.dist-info/METADATA +238 -0
warp_lang-0.11.0.dist-info/RECORD +332 -0
{warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
warp/bin/warp-clang.exp +0 -0
warp/bin/warp-clang.lib +0 -0
warp/bin/warp.exp +0 -0
warp/bin/warp.lib +0 -0
warp/tests/test_all.py +0 -215
warp/tests/test_array_scan.py +0 -60
warp/tests/test_base.py +0 -208
warp/tests/test_unresolved_func.py +0 -7
warp/tests/test_unresolved_symbol.py +0 -7
warp_lang-0.9.0.dist-info/METADATA +0 -20
warp_lang-0.9.0.dist-info/RECORD +0 -177
/warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
/warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
/warp/tests/{test_square.py → aux_test_square.py} +0 -0
{warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0

warp/types.py CHANGED Viewed

@@ -5,19 +5,17 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
+from __future__ import annotations
+import builtins
 import ctypes
 import hashlib
+import inspect
 import struct
 import zlib
-import numpy as np
+from typing import Any, Callable, Generic, List, Tuple, TypeVar, Union
-from typing import Any
-from typing import Tuple
-from typing import TypeVar
-from typing import Generic
-from typing import List
-from typing import Callable
-from typing import Union
+import numpy as np
 import warp
@@ -54,12 +52,14 @@ def constant(x):
     global _constant_hash
     # hash the constant value
-    if isinstance(x, int):
+    if isinstance(x, builtins.bool):
+        # This needs to come before the check for `int` since all boolean
+        # values are also instances of `int`.
+        _constant_hash.update(struct.pack("?", x))
+    elif isinstance(x, int):
         _constant_hash.update(struct.pack("<q", x))
     elif isinstance(x, float):
         _constant_hash.update(struct.pack("<d", x))
-    elif isinstance(x, bool):
-        _constant_hash.update(struct.pack("?", x))
     elif isinstance(x, float16):
         # float16 is a special case
         p = ctypes.pointer(ctypes.c_float(x.value))
@@ -75,6 +75,14 @@ def constant(x):
     return x
+def float_to_half_bits(value):
+    return warp.context.runtime.core.float_to_half_bits(value)
+def half_bits_to_float(value):
+    return warp.context.runtime.core.half_bits_to_float(value)
 # ----------------------
 # built-in types
@@ -98,19 +106,15 @@ def vector(length, dtype):
         _wp_generic_type_str_ = "vec_t"
         _wp_constructor_ = "vector"
-        def __init__(self, *args):
-            if self._wp_scalar_type_ == float16:
-                # special case for float16 type: in this case, data is stored
-                # as uint16 but it's actually half precision floating point
-                # data. This means we need to convert each of the arguments
-                # to uint16s containing half float bits before storing them in
-                # the array:
-                from warp.context import runtime
-                scalar_value = runtime.core.float_to_half_bits
-            else:
-                scalar_value = lambda x: x
+        # special handling for float16 type: in this case, data is stored
+        # as uint16 but it's actually half precision floating point
+        # data. This means we need to convert each of the arguments
+        # to uint16s containing half float bits before storing them in
+        # the array:
+        scalar_import = float_to_half_bits if _wp_scalar_type_ == float16 else lambda x: x
+        scalar_export = half_bits_to_float if _wp_scalar_type_ == float16 else lambda x: x
+        def __init__(self, *args):
             num_args = len(args)
             if num_args == 0:
                 super().__init__()
@@ -120,29 +124,99 @@ def vector(length, dtype):
                     self.__init__(*args[0])
                 else:
                     # set all elements to the same value
-                    value = scalar_value(args[0])
+                    value = vec_t.scalar_import(args[0])
                     for i in range(self._length_):
                         super().__setitem__(i, value)
             elif num_args == self._length_:
                 # set all scalar elements
                 for i in range(self._length_):
-                    super().__setitem__(i, scalar_value(args[i]))
+                    super().__setitem__(i, vec_t.scalar_import(args[i]))
             else:
                 raise ValueError(
                     f"Invalid number of arguments in vector constructor, expected {self._length_} elements, got {num_args}"
                 )
+        def __getitem__(self, key):
+            if isinstance(key, int):
+                return vec_t.scalar_export(super().__getitem__(key))
+            elif isinstance(key, slice):
+                if self._wp_scalar_type_ == float16:
+                    return [vec_t.scalar_export(x) for x in super().__getitem__(key)]
+                else:
+                    return super().__getitem__(key)
+            else:
+                raise KeyError(f"Invalid key {key}, expected int or slice")
+        def __setitem__(self, key, value):
+            if isinstance(key, int):
+                try:
+                    return super().__setitem__(key, vec_t.scalar_import(value))
+                except (TypeError, ctypes.ArgumentError):
+                    raise TypeError(
+                        f"Expected to assign a `{self._wp_scalar_type_.__name__}` value "
+                        f"but got `{type(value).__name__}` instead"
+                    ) from None
+            elif isinstance(key, slice):
+                try:
+                    iter(value)
+                except TypeError:
+                    raise TypeError(
+                        f"Expected to assign a slice from a sequence of values "
+                        f"but got `{type(value).__name__}` instead"
+                    ) from None
+                if self._wp_scalar_type_ == float16:
+                    converted = []
+                    try:
+                        for x in value:
+                            converted.append(vec_t.scalar_import(x))
+                    except ctypes.ArgumentError:
+                        raise TypeError(
+                            f"Expected to assign a slice from a sequence of `float16` values "
+                            f"but got `{type(x).__name__}` instead"
+                        ) from None
+                    value = converted
+                try:
+                    return super().__setitem__(key, value)
+                except TypeError:
+                    for x in value:
+                        try:
+                            self._type_(x)
+                        except TypeError:
+                            raise TypeError(
+                                f"Expected to assign a slice from a sequence of `{self._wp_scalar_type_.__name__}` values "
+                                f"but got `{type(x).__name__}` instead"
+                            ) from None
+            else:
+                raise KeyError(f"Invalid key {key}, expected int or slice")
+        def __getattr__(self, name):
+            idx = "xyzw".find(name)
+            if idx != -1:
+                return self.__getitem__(idx)
+            return self.__getattribute__(name)
+        def __setattr__(self, name, value):
+            idx = "xyzw".find(name)
+            if idx != -1:
+                return self.__setitem__(idx, value)
+            return super().__setattr__(name, value)
         def __add__(self, y):
             return warp.add(self, y)
         def __radd__(self, y):
-            return warp.add(self, y)
+            return warp.add(y, self)
         def __sub__(self, y):
             return warp.sub(self, y)
-        def __rsub__(self, x):
-            return warp.sub(x, self)
+        def __rsub__(self, y):
+            return warp.sub(y, self)
         def __mul__(self, y):
             return warp.mul(self, y)
@@ -150,17 +224,17 @@ def vector(length, dtype):
         def __rmul__(self, x):
             return warp.mul(x, self)
-        def __div__(self, y):
+        def __truediv__(self, y):
             return warp.div(self, y)
-        def __rdiv__(self, x):
+        def __rtruediv__(self, x):
             return warp.div(x, self)
-        def __pos__(self, y):
-            return warp.pos(self, y)
+        def __pos__(self):
+            return warp.pos(self)
-        def __neg__(self, y):
-            return warp.neg(self, y)
+        def __neg__(self):
+            return warp.neg(self)
         def __str__(self):
             return f"[{', '.join(map(str, self))}]"
@@ -171,6 +245,17 @@ def vector(length, dtype):
                     return False
             return True
+        @classmethod
+        def from_ptr(cls, ptr):
+            if ptr:
+                # create a new vector instance and initialize the contents from the binary data
+                # this skips float16 conversions, assuming that float16 data is already encoded as uint16
+                value = cls()
+                ctypes.memmove(ctypes.byref(value), ptr, ctypes.sizeof(cls._type_) * cls._length_)
+                return value
+            else:
+                raise RuntimeError("NULL pointer exception")
     return vec_t
@@ -197,19 +282,15 @@ def matrix(shape, dtype):
         _wp_row_type_ = vector(0 if shape[1] == Any else shape[1], dtype)
-        def __init__(self, *args):
-            if self._wp_scalar_type_ == float16:
-                # special case for float16 type: in this case, data is stored
-                # as uint16 but it's actually half precision floating point
-                # data. This means we need to convert each of the arguments
-                # to uint16s containing half float bits before storing them in
-                # the array:
-                from warp.context import runtime
-                scalar_value = runtime.core.float_to_half_bits
-            else:
-                scalar_value = lambda x: x
+        # special handling for float16 type: in this case, data is stored
+        # as uint16 but it's actually half precision floating point
+        # data. This means we need to convert each of the arguments
+        # to uint16s containing half float bits before storing them in
+        # the array:
+        scalar_import = float_to_half_bits if _wp_scalar_type_ == float16 else lambda x: x
+        scalar_export = half_bits_to_float if _wp_scalar_type_ == float16 else lambda x: x
+        def __init__(self, *args):
             num_args = len(args)
             if num_args == 0:
                 super().__init__()
@@ -219,13 +300,13 @@ def matrix(shape, dtype):
                     self.__init__(*args[0])
                 else:
                     # set all elements to the same value
-                    value = scalar_value(args[0])
+                    value = mat_t.scalar_import(args[0])
                     for i in range(self._length_):
                         super().__setitem__(i, value)
             elif num_args == self._length_:
                 # set all scalar elements
                 for i in range(self._length_):
-                    super().__setitem__(i, scalar_value(args[i]))
+                    super().__setitem__(i, mat_t.scalar_import(args[i]))
             elif num_args == self._shape_[0]:
                 # row vectors
                 for i, row in enumerate(args):
@@ -235,7 +316,7 @@ def matrix(shape, dtype):
                         )
                     offset = i * self._shape_[1]
                     for i in range(self._shape_[1]):
-                        super().__setitem__(offset + i, scalar_value(row[i]))
+                        super().__setitem__(offset + i, mat_t.scalar_import(row[i]))
             else:
                 raise ValueError(
                     f"Invalid number of arguments in matrix constructor, expected {self._length_} elements, got {num_args}"
@@ -245,13 +326,13 @@ def matrix(shape, dtype):
             return warp.add(self, y)
         def __radd__(self, y):
-            return warp.add(self, y)
+            return warp.add(y, self)
         def __sub__(self, y):
             return warp.sub(self, y)
-        def __rsub__(self, x):
-            return warp.sub(x, self)
+        def __rsub__(self, y):
+            return warp.sub(y, self)
         def __mul__(self, y):
             return warp.mul(self, y)
@@ -265,17 +346,17 @@ def matrix(shape, dtype):
         def __rmatmul__(self, x):
             return warp.mul(x, self)
-        def __div__(self, y):
+        def __truediv__(self, y):
             return warp.div(self, y)
-        def __rdiv__(self, x):
+        def __rtruediv__(self, x):
             return warp.div(x, self)
-        def __pos__(self, y):
-            return warp.pos(self, y)
+        def __pos__(self):
+            return warp.pos(self)
-        def __neg__(self, y):
-            return warp.neg(self, y)
+        def __neg__(self):
+            return warp.neg(self)
         def __str__(self):
             row_str = []
@@ -286,48 +367,96 @@ def matrix(shape, dtype):
             return "[" + ",\n ".join(row_str) + "]"
         def __eq__(self, other):
-            for i in range(self._length_):
-                if self[i] != other[i]:
-                    return False
+            for i in range(self._shape_[0]):
+                for j in range(self._shape_[1]):
+                    if self[i][j] != other[i][j]:
+                        return False
             return True
         def get_row(self, r):
             if r < 0 or r >= self._shape_[0]:
                 raise IndexError("Invalid row index")
             row_start = r * self._shape_[1]
             row_end = row_start + self._shape_[1]
-            return self._wp_row_type_(*super().__getitem__(slice(row_start, row_end)))
+            row_data = super().__getitem__(slice(row_start, row_end))
+            if self._wp_scalar_type_ == float16:
+                return self._wp_row_type_(*[mat_t.scalar_export(x) for x in row_data])
+            else:
+                return self._wp_row_type_(row_data)
         def set_row(self, r, v):
             if r < 0 or r >= self._shape_[0]:
                 raise IndexError("Invalid row index")
+            try:
+                iter(v)
+            except TypeError:
+                raise TypeError(
+                    f"Expected to assign a slice from a sequence of values "
+                    f"but got `{type(v).__name__}` instead"
+                ) from None
             row_start = r * self._shape_[1]
             row_end = row_start + self._shape_[1]
+            if self._wp_scalar_type_ == float16:
+                converted = []
+                try:
+                    for x in v:
+                        converted.append(mat_t.scalar_import(x))
+                except ctypes.ArgumentError:
+                    raise TypeError(
+                        f"Expected to assign a slice from a sequence of `float16` values "
+                        f"but got `{type(x).__name__}` instead"
+                    ) from None
+                v = converted
             super().__setitem__(slice(row_start, row_end), v)
         def __getitem__(self, key):
             if isinstance(key, Tuple):
                 # element indexing m[i,j]
-                return super().__getitem__(key[1] * self._shape_[0] + key[1])
+                if len(key) != 2:
+                    raise KeyError(f"Invalid key, expected one or two indices, got {len(key)}")
+                if any(isinstance(x, slice) for x in key):
+                    raise KeyError(f"Slices are not supported when indexing matrices using the `m[i, j]` notation")
+                return mat_t.scalar_export(super().__getitem__(key[0] * self._shape_[1] + key[1]))
             elif isinstance(key, int):
                 # row vector indexing m[r]
                 return self.get_row(key)
             else:
-                # slice etc.
-                return super().__getitem__(key)
+                raise KeyError(f"Invalid key {key}, expected int or pair of ints")
         def __setitem__(self, key, value):
             if isinstance(key, Tuple):
                 # element indexing m[i,j] = x
-                return super().__setitem__(key[1] * self._shape_[0] + key[1], value)
+                if len(key) != 2:
+                    raise KeyError(f"Invalid key, expected one or two indices, got {len(key)}")
+                if any(isinstance(x, slice) for x in key):
+                    raise KeyError(f"Slices are not supported when indexing matrices using the `m[i, j]` notation")
+                try:
+                    return super().__setitem__(key[0] * self._shape_[1] + key[1], mat_t.scalar_import(value))
+                except (TypeError, ctypes.ArgumentError):
+                    raise TypeError(
+                        f"Expected to assign a `{self._wp_scalar_type_.__name__}` value "
+                        f"but got `{type(value).__name__}` instead"
+                    ) from None
             elif isinstance(key, int):
                 # row vector indexing m[r] = v
-                self.set_row(key, value)
+                return self.set_row(key, value)
+            elif isinstance(key, slice):
+                raise KeyError(f"Slices are not supported when indexing matrices using the `m[start:end]` notation")
+            else:
+                raise KeyError(f"Invalid key {key}, expected int or pair of ints")
+        @classmethod
+        def from_ptr(cls, ptr):
+            if ptr:
+                # create a new matrix instance and initialize the contents from the binary data
+                # this skips float16 conversions, assuming that float16 data is already encoded as uint16
+                value = cls()
+                ctypes.memmove(ctypes.byref(value), ptr, ctypes.sizeof(cls._type_) * cls._length_)
                 return value
             else:
-                # slice etc.
-                return super().__setitem__(key, value)
+                raise RuntimeError("NULL pointer exception")
     return mat_t
@@ -337,6 +466,23 @@ class void:
         pass
+class bool:
+    _length_ = 1
+    _type_ = ctypes.c_bool
+    def __init__(self, x=False):
+        self.value = x
+    def __bool__(self) -> bool:
+        return self.value != 0
+    def __float__(self) -> float:
+        return float(self.value != 0)
+    def __int__(self) -> int:
+        return int(self.value != 0)
 class float16:
     _length_ = 1
     _type_ = ctypes.c_uint16
@@ -344,6 +490,15 @@ class float16:
     def __init__(self, x=0.0):
         self.value = x
+    def __bool__(self) -> bool:
+        return self.value != 0.0
+    def __float__(self) -> float:
+        return float(self.value)
+    def __int__(self) -> int:
+        return int(self.value)
 class float32:
     _length_ = 1
@@ -352,6 +507,15 @@ class float32:
     def __init__(self, x=0.0):
         self.value = x
+    def __bool__(self) -> bool:
+        return self.value != 0.0
+    def __float__(self) -> float:
+        return float(self.value)
+    def __int__(self) -> int:
+        return int(self.value)
 class float64:
     _length_ = 1
@@ -360,6 +524,15 @@ class float64:
     def __init__(self, x=0.0):
         self.value = x
+    def __bool__(self) -> bool:
+        return self.value != 0.0
+    def __float__(self) -> float:
+        return float(self.value)
+    def __int__(self) -> int:
+        return int(self.value)
 class int8:
     _length_ = 1
@@ -368,6 +541,18 @@ class int8:
     def __init__(self, x=0):
         self.value = x
+    def __bool__(self) -> bool:
+        return self.value != 0
+    def __float__(self) -> float:
+        return float(self.value)
+    def __int__(self) -> int:
+        return int(self.value)
+    def __index__(self) -> int:
+        return int(self.value)
 class uint8:
     _length_ = 1
@@ -376,6 +561,18 @@ class uint8:
     def __init__(self, x=0):
         self.value = x
+    def __bool__(self) -> bool:
+        return self.value != 0
+    def __float__(self) -> float:
+        return float(self.value)
+    def __int__(self) -> int:
+        return int(self.value)
+    def __index__(self) -> int:
+        return int(self.value)
 class int16:
     _length_ = 1
@@ -384,6 +581,18 @@ class int16:
     def __init__(self, x=0):
         self.value = x
+    def __bool__(self) -> bool:
+        return self.value != 0
+    def __float__(self) -> float:
+        return float(self.value)
+    def __int__(self) -> int:
+        return int(self.value)
+    def __index__(self) -> int:
+        return int(self.value)
 class uint16:
     _length_ = 1
@@ -392,6 +601,18 @@ class uint16:
     def __init__(self, x=0):
         self.value = x
+    def __bool__(self) -> bool:
+        return self.value != 0
+    def __float__(self) -> float:
+        return float(self.value)
+    def __int__(self) -> int:
+        return int(self.value)
+    def __index__(self) -> int:
+        return int(self.value)
 class int32:
     _length_ = 1
@@ -400,6 +621,18 @@ class int32:
     def __init__(self, x=0):
         self.value = x
+    def __bool__(self) -> bool:
+        return self.value != 0
+    def __float__(self) -> float:
+        return float(self.value)
+    def __int__(self) -> int:
+        return int(self.value)
+    def __index__(self) -> int:
+        return int(self.value)
 class uint32:
     _length_ = 1
@@ -408,6 +641,18 @@ class uint32:
     def __init__(self, x=0):
         self.value = x
+    def __bool__(self) -> bool:
+        return self.value != 0
+    def __float__(self) -> float:
+        return float(self.value)
+    def __int__(self) -> int:
+        return int(self.value)
+    def __index__(self) -> int:
+        return int(self.value)
 class int64:
     _length_ = 1
@@ -416,6 +661,18 @@ class int64:
     def __init__(self, x=0):
         self.value = x
+    def __bool__(self) -> bool:
+        return self.value != 0
+    def __float__(self) -> float:
+        return float(self.value)
+    def __int__(self) -> int:
+        return int(self.value)
+    def __index__(self) -> int:
+        return int(self.value)
 class uint64:
     _length_ = 1
@@ -424,6 +681,18 @@ class uint64:
     def __init__(self, x=0):
         self.value = x
+    def __bool__(self) -> bool:
+        return self.value != 0
+    def __float__(self) -> float:
+        return float(self.value)
+    def __int__(self) -> int:
+        return int(self.value)
+    def __index__(self) -> int:
+        return int(self.value)
 def quaternion(dtype=Any):
     class quat_t(vector(length=4, dtype=dtype)):
@@ -453,23 +722,63 @@ class quatd(quaternion(dtype=float64)):
 def transformation(dtype=Any):
+    # Builds a 7-component vector type over `dtype`: components 0..2 are written
+    # from `p` and components 3..6 from `q` when initialized from components.
     class transform_t(vector(length=7, dtype=dtype)):
+        # Signature of the original `__init__(p, q)`; used to detect calls that
+        # should still be treated as "initialize from components".
+        _wp_init_from_components_sig_ = inspect.Signature(
+            (
+                inspect.Parameter(
+                    "p",
+                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                    default=(0.0, 0.0, 0.0),
+                ),
+                inspect.Parameter(
+                    "q",
+                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                    default=(0.0, 0.0, 0.0, 1.0),
+                ),
+            ),
+        )
         _wp_type_params_ = [dtype]
         _wp_generic_type_str_ = "transform_t"
         _wp_constructor_ = "transformation"
-        def __init__(self, p=(0.0, 0.0, 0.0), q=(0.0, 0.0, 0.0, 1.0)):
-            super().__init__()
+        def __init__(self, *args, **kwargs):
+            """Initialize from another transform, from `p`/`q` components, or from vector arguments.
+
+            The construction paths are tried in this order:
+
+            1. A single positional argument that is itself a `transform_t`: its
+               components are unpacked into the vector constructor (copy).
+            2. Arguments that bind to the original `(p, q)` signature, where both
+               `p` and `q` have a length: `p` fills `self[0:3]`, `q` fills `self[3:7]`.
+            3. Anything else: the positional arguments are forwarded to the
+               vector constructor.
+            """
+            if len(args) == 1 and len(kwargs) == 0:
+                # Use a default of `None` so that arguments without the attribute
+                # (tuples, lists, plain scalars) fall through to the paths below
+                # instead of raising `AttributeError`.
+                if getattr(args[0], "_wp_generic_type_str_", None) == self._wp_generic_type_str_:
+                    # Copy constructor.
+                    super().__init__(*args[0])
+                    return
+            try:
+                # For backward compatibility, try to check if the arguments
+                # match the original signature that'd allow initializing
+                # the `p` and `q` components separately.
+                bound_args = self._wp_init_from_components_sig_.bind(*args, **kwargs)
+                bound_args.apply_defaults()
+                p, q = bound_args.args
+            except (TypeError, ValueError):
+                # Fallback to the vector's constructor.
+                super().__init__(*args)
+                return
+            # Even if the arguments match the original “from components”
+            # signature, we still need to make sure that they represent
+            # sequences that can be unpacked.
+            if hasattr(p, "__len__") and hasattr(q, "__len__"):
+                # Initialize from the `p` and `q` components.
+                super().__init__()
+                self[0:3] = vector(length=3, dtype=dtype)(*p)
+                self[3:7] = quaternion(dtype=dtype)(*q)
+                return
-            self[0:3] = vector(length=3, dtype=dtype)(*p)
-            self[3:7] = quaternion(dtype=dtype)(*q)
+            # Fallback to the vector's constructor.
+            super().__init__(*args)
         @property
         def p(self):
-            return self[0:3]
+            return vec3(self[0:3])
         @property
         def q(self):
-            return self[3:7]
+            return quat(self[3:7])
     return transform_t
@@ -753,6 +1062,7 @@ vector_types = [
 ]
 np_dtype_to_warp_type = {
+    np.dtype(np.bool_): bool,
     np.dtype(np.int8): int8,
     np.dtype(np.uint8): uint8,
     np.dtype(np.int16): int16,
@@ -768,6 +1078,21 @@ np_dtype_to_warp_type = {
     np.dtype(np.float64): float64,
 }
+warp_type_to_np_dtype = {
+    bool: np.bool_,
+    int8: np.int8,
+    int16: np.int16,
+    int32: np.int32,
+    int64: np.int64,
+    uint8: np.uint8,
+    uint16: np.uint16,
+    uint32: np.uint32,
+    uint64: np.uint64,
+    float16: np.float16,
+    float32: np.float32,
+    float64: np.float64,
+}
 # represent a Python range iterator
 class range_t:
@@ -777,18 +1102,21 @@ class range_t:
 # definition just for kernel type (cannot be a parameter), see bvh.h
 class bvh_query_t:
+    """Object used to track state during BVH traversal."""
     def __init__(self):
         pass
 # definition just for kernel type (cannot be a parameter), see mesh.h
 class mesh_query_aabb_t:
+    """Object used to track state during mesh traversal."""
     def __init__(self):
         pass
 # definition just for kernel type (cannot be a parameter), see hash_grid.h
 class hash_grid_query_t:
+    """Object used to track state during neighbor traversal."""
     def __init__(self):
         pass
@@ -800,6 +1128,8 @@ LAUNCH_MAX_DIMS = 4
 # must match array.h
 ARRAY_TYPE_REGULAR = 0
 ARRAY_TYPE_INDEXED = 1
+ARRAY_TYPE_FABRIC = 2
+ARRAY_TYPE_FABRIC_INDEXED = 3
 # represents bounds for kernel launch (number of threads across multiple dimensions)
@@ -851,6 +1181,30 @@ class array_t(ctypes.Structure):
             self.shape[i] = shape[i]
             self.strides[i] = strides[i]
+    # structured type description used when array_t is packed in a struct and shared via numpy structured array.
+    @classmethod
+    def numpy_dtype(cls):
+        return cls._numpy_dtype_
+    # structured value used when array_t is packed in a struct and shared via a numpy structured array
+    def numpy_value(self):
+        return (self.data, self.grad, list(self.shape), list(self.strides), self.ndim)
+# NOTE: must match array_t._fields_
+array_t._numpy_dtype_ = {
+    "names": ["data", "grad", "shape", "strides", "ndim"],
+    "formats": ["u8", "u8", f"{ARRAY_MAX_DIMS}i4", f"{ARRAY_MAX_DIMS}i4", "i4"],
+    "offsets": [
+        array_t.data.offset,
+        array_t.grad.offset,
+        array_t.shape.offset,
+        array_t.strides.offset,
+        array_t.ndim.offset,
+    ],
+    "itemsize": ctypes.sizeof(array_t),
+}
 class indexedarray_t(ctypes.Structure):
     _fields_ = [
@@ -892,16 +1246,20 @@ def type_length(dtype):
         return dtype._length_
+def type_scalar_type(dtype):
+    return getattr(dtype, "_wp_scalar_type_", dtype)
 def type_size_in_bytes(dtype):
     if dtype.__module__ == "ctypes":
         return ctypes.sizeof(dtype)
-    elif type_is_struct(dtype):
+    elif isinstance(dtype, warp.codegen.Struct):
         return ctypes.sizeof(dtype.ctype)
     elif dtype == float or dtype == int:
         return 4
     elif hasattr(dtype, "_type_"):
         return getattr(dtype, "_length_", 1) * ctypes.sizeof(dtype._type_)
     else:
         return 0
@@ -916,9 +1274,9 @@ def type_to_warp(dtype):
 def type_typestr(dtype):
-    from warp.codegen import Struct
-    if dtype == float16:
+    if dtype == bool:
+        return "?"
+    elif dtype == float16:
         return "<f2"
     elif dtype == float32:
         return "<f4"
@@ -940,8 +1298,8 @@ def type_typestr(dtype):
         return "<i8"
     elif dtype == uint64:
         return "<u8"
-    elif isinstance(dtype, Struct):
-        return f"|V{ctypes.sizeof(dtype.ctype)}"
+    elif isinstance(dtype, warp.codegen.Struct):
+        return f"|V{ctypes.sizeof(dtype.ctype)}"
     elif issubclass(dtype, ctypes.Array):
         return type_typestr(dtype._wp_scalar_type_)
     else:
@@ -954,9 +1312,16 @@ def type_repr(t):
         return str(f"array(ndim={t.ndim}, dtype={t.dtype})")
     if type_is_vector(t):
         return str(f"vector(length={t._shape_[0]}, dtype={t._wp_scalar_type_})")
-    elif type_is_matrix(t):
+    if type_is_matrix(t):
         return str(f"matrix(shape=({t._shape_[0]}, {t._shape_[1]}), dtype={t._wp_scalar_type_})")
-    else:
+    if isinstance(t, warp.codegen.Struct):
+        return type_repr(t.cls)
+    if t in scalar_types:
+        return t.__name__
+    try:
+        return t.__module__ + "." + t.__qualname__
+    except AttributeError:
         return str(t)
@@ -974,14 +1339,6 @@ def type_is_float(t):
     return t in float_types
-def type_is_struct(dtype):
-    from warp.codegen import Struct
-    if isinstance(dtype, Struct):
-        return True
-    else:
-        return False
 # returns True if the passed *type* is a vector
 def type_is_vector(t):
     if hasattr(t, "_wp_generic_type_str_") and t._wp_generic_type_str_ == "vec_t":
@@ -1000,7 +1357,7 @@ def type_is_matrix(t):
 # returns true for all value types (int, float, bool, scalars, vectors, matrices)
 def type_is_value(x):
-    if (x == int) or (x == float) or (x == bool) or (x in scalar_types) or issubclass(x, ctypes.Array):
+    if (x == int) or (x == float) or (x == builtins.bool) or (x in scalar_types) or issubclass(x, ctypes.Array):
         return True
     else:
         return False
@@ -1028,14 +1385,16 @@ def types_equal(a, b, match_generic=False):
     # convert to canonical types
     if a == float:
         a = float32
-    if a == int:
+    elif a == int:
         a = int32
     if b == float:
         b = float32
-    if b == int:
+    elif b == int:
         b = int32
+    compatible_bool_types = [builtins.bool, bool]
     def are_equal(p1, p2):
         if match_generic:
             if p1 == Any or p2 == Any:
@@ -1052,7 +1411,22 @@ def types_equal(a, b, match_generic=False):
                 return True
             if p1 == Float and p2 == Float:
                 return True
-        return p1 == p2
+        # convert to canonical types
+        # (Python `float`/`int` are compared as `float32`/`int32`)
+        if p1 == float:
+            p1 = float32
+        elif p1 == int:
+            p1 = int32
+        if p2 == float:
+            p2 = float32
+        # Test the local `p2`, not the outer `b`: `b` has already been
+        # canonicalized above, so `b == int` can never be true here and
+        # `p2` would never be mapped from `int` to `int32`.
+        elif p2 == int:
+            p2 = int32
+        # the Python builtin bool and the Warp bool are treated as the same type
+        if p1 in compatible_bool_types and p2 in compatible_bool_types:
+            return True
+        else:
+            return p1 == p2
     if (
         hasattr(a, "_wp_generic_type_str_")
@@ -1060,9 +1434,7 @@ def types_equal(a, b, match_generic=False):
         and a._wp_generic_type_str_ == b._wp_generic_type_str_
     ):
         return all([are_equal(p1, p2) for p1, p2 in zip(a._wp_type_params_, b._wp_type_params_)])
-    if isinstance(a, array) and isinstance(b, array):
-        return True
-    if isinstance(a, indexedarray) and isinstance(b, indexedarray):
+    if is_array(a) and type(a) is type(b):
         return True
     else:
         return are_equal(a, b)
@@ -1093,18 +1465,18 @@ class array(Array):
         dtype: DType = Any,
         shape=None,
         strides=None,
-        length=0,
+        length=None,
         ptr=None,
-        grad_ptr=None,
-        capacity=0,
+        capacity=None,
         device=None,
+        pinned=False,
         copy=True,
-        owner=True,
+        owner=True,  # TODO: replace with deleter=None
         ndim=None,
+        grad=None,
         requires_grad=False,
-        pinned=False,
     ):
-        """Constructs a new Warp array object from existing data.
+        """Constructs a new Warp array object
         When the ``data`` argument is a valid list, tuple, or ndarray the array will be constructed from this object's data.
         For objects that are not stored sequentially in memory (e.g.: a list), then the data will first
@@ -1115,39 +1487,38 @@ class array(Array):
         allocation should reside on the same device given by the device argument, and the user should set the length
         and dtype parameter appropriately.
+        If neither ``data`` nor ``ptr`` are specified, the ``shape`` or ``length`` arguments are checked next.
+        This construction path can be used to create new uninitialized arrays, but users are encouraged to call
+        ``wp.empty()``, ``wp.zeros()``, or ``wp.full()`` instead to create new arrays.
+        If none of the above arguments are specified, a simple type annotation is constructed.  This is used when annotating
+        kernel arguments or struct members (e.g.,``arr: wp.array(dtype=float)``).  In this case, only ``dtype`` and ``ndim``
+        are taken into account and no memory is allocated for the array.
         Args:
             data (Union[list, tuple, ndarray]) An object to construct the array from, can be a Tuple, List, or generally any type convertible to an np.array
             dtype (Union): One of the built-in types, e.g.: :class:`warp.mat33`, if dtype is Any and data an ndarray then it will be inferred from the array data type
             shape (tuple): Dimensions of the array
             strides (tuple): Number of bytes in each dimension between successive elements of the array
-            length (int): Number of elements (rows) of the data type (deprecated, users should use `shape` argument)
+            length (int): Number of elements of the data type (deprecated, users should use `shape` argument)
             ptr (uint64): Address of an external memory address to alias (data should be None)
-            grad_ptr (uint64): Address of an external memory address to alias for the gradient array
             capacity (int): Maximum size in bytes of the ptr allocation (data should be None)
             device (Devicelike): Device the array lives on
             copy (bool): Whether the incoming data will be copied or aliased, this is only possible when the incoming `data` already lives on the device specified and types match
             owner (bool): Should the array object try to deallocate memory when it is deleted
             requires_grad (bool): Whether or not gradients will be tracked for this array, see :class:`warp.Tape` for details
+            grad (array): The gradient array to use
             pinned (bool): Whether to allocate pinned host memory, which allows asynchronous host-device transfers (only applicable with device="cpu")
         """
         self.owner = False
-        # convert shape to Tuple
-        if shape is None:
-            shape = tuple(length for _ in range(ndim or 1))
-        elif isinstance(shape, int):
-            shape = (shape,)
-        elif isinstance(shape, List):
-            shape = tuple(shape)
-        self.shape = shape
-        if len(shape) > ARRAY_MAX_DIMS:
-            raise RuntimeError(
-                f"Arrays may only have {ARRAY_MAX_DIMS} dimensions maximum, trying to create array with {len(shape)} dims."
-            )
+        self.ctype = None
+        self._requires_grad = False
+        self._grad = None
+        # __array_interface__ or __cuda_array_interface__, evaluated lazily and cached
+        self._array_interface = None
+        self.is_transposed = False
         # canonicalize dtype
         if dtype == int:
@@ -1155,20 +1526,78 @@ class array(Array):
         elif dtype == float:
             dtype = float32
-        if data is not None or ptr is not None:
-            from .context import runtime
-            device = runtime.get_device(device)
+        # convert shape to tuple (or leave shape=None if neither shape nor length were specified)
+        if shape is not None:
+            if isinstance(shape, int):
+                shape = (shape,)
+            else:
+                shape = tuple(shape)
+                if len(shape) > ARRAY_MAX_DIMS:
+                    raise RuntimeError(
+                        f"Failed to create array with shape {shape}, the maximum number of dimensions is {ARRAY_MAX_DIMS}"
+                    )
+        elif length is not None:
+            # backward compatibility
+            shape = (length,)
+        # determine the construction path from the given arguments
         if data is not None:
-            if device.is_capturing:
-                raise RuntimeError(f"Cannot allocate memory on device {device} while graph capture is active")
+            # data or ptr, not both
             if ptr is not None:
-                # data or ptr, not both
-                raise RuntimeError("Should only construct arrays with either data or ptr arguments, not both")
+                raise RuntimeError("Can only construct arrays with either `data` or `ptr` arguments, not both")
+            self._init_from_data(data, dtype, shape, device, copy, pinned)
+        elif ptr is not None:
+            self._init_from_ptr(ptr, dtype, shape, strides, capacity, device, owner, pinned)
+        elif shape is not None:
+            self._init_new(dtype, shape, strides, device, pinned)
+        else:
+            self._init_annotation(dtype, ndim or 1)
-            if isinstance(dtype, warp.codegen.Struct):
+        # initialize gradient, if needed
+        if self.device is not None:
+            if grad is not None:
+                # this will also check whether the gradient array is compatible
+                self.grad = grad
+            else:
+                # allocate gradient if needed
+                self._requires_grad = requires_grad
+                if requires_grad:
+                    with warp.ScopedStream(self.device.null_stream):
+                        self._alloc_grad()
+    def _init_from_data(self, data, dtype, shape, device, copy, pinned):
+        if not hasattr(data, "__len__"):
+            raise RuntimeError(f"Data must be a sequence or array, got scalar {data}")
+        if hasattr(dtype, "_wp_scalar_type_"):
+            dtype_shape = dtype._shape_
+            dtype_ndim = len(dtype_shape)
+            scalar_dtype = dtype._wp_scalar_type_
+        else:
+            dtype_shape = ()
+            dtype_ndim = 0
+            scalar_dtype = dtype
+        # convert input data to ndarray (handles lists, tuples, etc.) and determine dtype
+        if dtype == Any:
+            # infer dtype from data
+            try:
+                arr = np.array(data, copy=False, ndmin=1)
+            except Exception as e:
+                raise RuntimeError(f"Failed to convert input data to an array: {e}")
+            dtype = np_dtype_to_warp_type.get(arr.dtype)
+            if dtype is None:
+                raise RuntimeError(f"Unsupported input data dtype: {arr.dtype}")
+        elif isinstance(dtype, warp.codegen.Struct):
+            if isinstance(data, np.ndarray):
+                # construct from numpy structured array
+                if data.dtype != dtype.numpy_dtype():
+                    raise RuntimeError(
+                        f"Invalid source data type for array of structs, expected {dtype.numpy_dtype()}, got {data.dtype}"
+                    )
+                arr = data
+            elif isinstance(data, (list, tuple)):
+                # construct from a sequence of structs
                 try:
                     # convert each struct instance to its corresponding ctype
                     ctype_list = [v.__ctype__() for v in data]
@@ -1176,156 +1605,227 @@ class array(Array):
                     ctype_arr = (dtype.ctype * len(ctype_list))(*ctype_list)
                     # convert to numpy
                     arr = np.frombuffer(ctype_arr, dtype=dtype.ctype)
-                    #arr = np.array(ctype_arr, copy=False)
-                except Exception as e:
-                    raise RuntimeError(
-                        "Error while trying to construct Warp array from a Python list of Warp structs." + str(e))
-            else:
-                try:
-                    # convert tuples and lists of numeric types to ndarray
-                    arr = np.array(data, copy=False)
                 except Exception as e:
                     raise RuntimeError(
-                        "When constructing an array the data argument must be convertible to ndarray type type. Encountered an error while converting:"
-                        + str(e)
-                    )
-            if dtype == Any:
-                # infer dtype from the source data array
-                dtype = np_dtype_to_warp_type[arr.dtype]
-            # try to convert numeric src array to destination type
-            if not isinstance(dtype, warp.codegen.Struct):
-                try:
-                    arr = arr.astype(dtype=type_typestr(dtype), copy=False)
-                except:
-                    raise RuntimeError(
-                        f"Could not convert input data with type {arr.dtype} to array with type {dtype._type_}"
+                        f"Error while trying to construct Warp array from a sequence of Warp structs: {e}"
                     )
+            else:
+                raise RuntimeError(
+                    "Invalid data argument for array of structs, expected a sequence of structs or a NumPy structured array"
+                )
+        else:
+            # convert input data to the given dtype
+            npdtype = warp_type_to_np_dtype.get(scalar_dtype)
+            if npdtype is None:
+                raise RuntimeError(
+                    f"Failed to convert input data to an array with Warp type {warp.context.type_str(dtype)}"
+                )
+            try:
+                arr = np.array(data, dtype=npdtype, copy=False, ndmin=1)
+            except Exception as e:
+                raise RuntimeError(f"Failed to convert input data to an array with type {npdtype}: {e}")
+        # determine whether the input needs reshaping
+        target_npshape = None
+        if shape is not None:
+            target_npshape = (*shape, *dtype_shape)
+        elif dtype_ndim > 0:
+            # prune inner dimensions of length 1
+            while arr.ndim > 1 and arr.shape[-1] == 1:
+                arr = np.squeeze(arr, axis=-1)
+            # if the inner dims don't match exactly, check if the innermost dim is a multiple of type length
+            if arr.ndim < dtype_ndim or arr.shape[-dtype_ndim:] != dtype_shape:
+                if arr.shape[-1] == dtype._length_:
+                    target_npshape = (*arr.shape[:-1], *dtype_shape)
+                elif arr.shape[-1] % dtype._length_ == 0:
+                    target_npshape = (*arr.shape[:-1], arr.shape[-1] // dtype._length_, *dtype_shape)
+                else:
+                    if dtype_ndim == 1:
+                        raise RuntimeError(
+                            f"The inner dimensions of the input data are not compatible with the requested vector type {warp.context.type_str(dtype)}: expected an inner dimension that is a multiple of {dtype._length_}"
+                        )
+                    else:
+                        raise RuntimeError(
+                            f"The inner dimensions of the input data are not compatible with the requested matrix type {warp.context.type_str(dtype)}: expected inner dimensions {dtype._shape_} or a multiple of {dtype._length_}"
+                        )
-            # ensure contiguous
-            arr = np.ascontiguousarray(arr)
+        if target_npshape is not None:
+            try:
+                arr = arr.reshape(target_npshape)
+            except Exception as e:
+                raise RuntimeError(
+                    f"Failed to reshape the input data to the given shape {shape} and type {warp.context.type_str(dtype)}: {e}"
+                )
-            # remove any trailing dimensions of length 1
-            if arr.ndim > 1 and arr.shape[-1] == 1:
-                arr = np.squeeze(arr, axis=len(arr.shape) - 1)
+        # determine final shape and strides
+        if dtype_ndim > 0:
+            # make sure the inner dims are contiguous for vector/matrix types
+            scalar_size = type_size_in_bytes(dtype._wp_scalar_type_)
+            inner_contiguous = arr.strides[-1] == scalar_size
+            if inner_contiguous and dtype_ndim > 1:
+                inner_contiguous = arr.strides[-2] == scalar_size * dtype_shape[-1]
-            ptr = arr.__array_interface__["data"][0]
-            shape = arr.__array_interface__["shape"]
-            strides = arr.__array_interface__.get("strides", None)
+            if not inner_contiguous:
+                arr = np.ascontiguousarray(arr)
-            # Convert input shape to Warp
-            if type_length(dtype) > 1:
-                # if we are constructing an array of vectors/matrices, but input
-                # is one dimensional (i.e.: flattened) then try and reshape to
-                # to match target dtype, inferring the first dimension
-                if arr.ndim == 1:
-                    arr = arr.reshape((-1, *dtype._shape_))
+            shape = arr.shape[:-dtype_ndim] or (1,)
+            strides = arr.strides[:-dtype_ndim] or (type_size_in_bytes(dtype),)
+        else:
+            shape = arr.shape or (1,)
+            strides = arr.strides or (type_size_in_bytes(dtype),)
-                # last dimension should match dtype shape when using vector types,
-                # e.g.: array of mat22 objects should have shape (n, 2, 2)
-                dtype_ndim = len(dtype._shape_)
+        device = warp.get_device(device)
-                trailing_shape = arr.shape[-dtype_ndim:]
-                leading_shape = arr.shape[0:-dtype_ndim]
+        if device.is_cpu and not copy and not pinned:
+            # reference numpy memory directly
+            self._init_from_ptr(arr.ctypes.data, dtype, shape, strides, None, device, False, False)
+            # keep a ref to the source array to keep allocation alive
+            self._ref = arr
+        else:
+            # copy data into a new array
+            self._init_new(dtype, shape, None, device, pinned)
+            src = array(
+                ptr=arr.ctypes.data,
+                dtype=dtype,
+                shape=shape,
+                strides=strides,
+                device="cpu",
+                copy=False,
+                owner=False,
+            )
+            warp.copy(self, src)
-                if dtype._shape_ != trailing_shape:
-                    raise RuntimeError(
-                        f"Last dimensions of input array should match the specified data type, given shape {arr.shape}, expected last dimensions to match dtype shape {dtype._shape_}"
-                    )
+    def _init_from_ptr(self, ptr, dtype, shape, strides, capacity, device, owner, pinned):
+        if dtype == Any:
+            raise RuntimeError("A concrete data type is required to create the array")
-                shape = leading_shape
+        device = warp.get_device(device)
-                if strides is not None:
-                    strides = strides[0:-dtype_ndim]
+        size = 1
+        for d in shape:
+            size *= d
-            if device.is_cpu and copy is False:
-                # ref numpy memory directly
-                self.shape = shape
-                self.ptr = ptr
-                self.grad_ptr = grad_ptr
-                self.dtype = dtype
-                self.strides = strides
-                self.capacity = arr.size * type_size_in_bytes(dtype)
-                self.device = device
-                self.owner = False
-                self.pinned = False
+        contiguous_strides = strides_from_shape(shape, dtype)
-                # keep a ref to source array to keep allocation alive
-                self.ref = arr
+        if strides is None:
+            strides = contiguous_strides
+            is_contiguous = True
+            if capacity is None:
+                capacity = size * type_size_in_bytes(dtype)
+        else:
+            is_contiguous = strides == contiguous_strides
+            if capacity is None:
+                capacity = shape[0] * strides[0]
+        self.dtype = dtype
+        self.ndim = len(shape)
+        self.size = size
+        self.capacity = capacity
+        self.shape = shape
+        self.strides = strides
+        self.ptr = ptr
+        self.device = device
+        self.owner = owner
+        self.pinned = pinned if device.is_cpu else False
+        self.is_contiguous = is_contiguous
-            else:
-                # otherwise, we must transfer to device memory
-                # create a host wrapper around the numpy array
-                # and a new destination array to copy it to
-                src = array(
-                    dtype=dtype,
-                    shape=shape,
-                    strides=strides,
-                    capacity=arr.size * type_size_in_bytes(dtype),
-                    ptr=ptr,
-                    device="cpu",
-                    copy=False,
-                    owner=False,
-                )
-                dest = warp.empty(shape, dtype=dtype, device=device, requires_grad=requires_grad, pinned=pinned)
-                dest.owner = False
+    def _init_new(self, dtype, shape, strides, device, pinned):
+        if dtype == Any:
+            raise RuntimeError("A concrete data type is required to create the array")
-                # copy data using the CUDA default stream for synchronous behaviour with other streams
-                warp.copy(dest, src, stream=device.null_stream)
+        device = warp.get_device(device)
-                # object copy to self and transfer data ownership, would probably be cleaner to have _empty, _zero, etc as class methods
-                from copy import copy as shallowcopy
+        size = 1
+        for d in shape:
+            size *= d
-                self.__dict__ = shallowcopy(dest.__dict__)
-                self.owner = True
+        contiguous_strides = strides_from_shape(shape, dtype)
+        if strides is None:
+            strides = contiguous_strides
+            is_contiguous = True
+            capacity = size * type_size_in_bytes(dtype)
         else:
-            # explicit construction from ptr to external memory
-            self.shape = shape
-            self.strides = strides
-            self.capacity = capacity
-            self.dtype = dtype
-            self.ptr = ptr
-            self.grad_ptr = grad_ptr
-            self.device = device
-            self.owner = owner
-            if device is not None and device.is_cpu:
-                self.pinned = pinned
-            else:
-                self.pinned = False
+            is_contiguous = strides == contiguous_strides
+            capacity = shape[0] * strides[0]
-            self.__name__ = "array<" + type.__name__ + ">"
-        # update ndim
-        if ndim is None:
-            self.ndim = len(self.shape)
+        if capacity > 0:
+            ptr = device.allocator.alloc(capacity, pinned=pinned)
+            if ptr is None:
+                raise RuntimeError(f"Array allocation failed on device: {device} for {capacity} bytes")
         else:
-            self.ndim = ndim
+            ptr = None
-        # update size (num elements)
-        self.size = 1
-        for d in self.shape:
-            self.size *= d
+        self.dtype = dtype
+        self.ndim = len(shape)
+        self.size = size
+        self.capacity = capacity
+        self.shape = shape
+        self.strides = strides
+        self.ptr = ptr
+        self.device = device
+        self.owner = True
+        self.pinned = pinned if device.is_cpu else False
+        self.is_contiguous = is_contiguous
+    def _init_annotation(self, dtype, ndim):
+        self.dtype = dtype
+        self.ndim = ndim
+        self.size = 0
+        self.capacity = 0
+        self.shape = (0,) * ndim
+        self.strides = (0,) * ndim
+        self.ptr = None
+        self.device = None
+        self.owner = False
+        self.pinned = False
+        self.is_contiguous = False
-        self._grad = None
+    @property
+    def __array_interface__(self):
+        # raising an AttributeError here makes hasattr() return False
+        if self.device is None or not self.device.is_cpu:
+            raise AttributeError(f"__array_interface__ not supported because device is {self.device}")
-        # set up array interface access so we can treat this object as a numpy array
-        if self.ptr:
-            # update byte strides and contiguous flag
-            contiguous_strides = strides_from_shape(self.shape, self.dtype)
-            if strides is None:
-                self.strides = contiguous_strides
-                self.is_contiguous = True
+        if self._array_interface is None:
+            # get flat shape (including type shape)
+            if isinstance(self.dtype, warp.codegen.Struct):
+                # struct
+                arr_shape = self.shape
+                arr_strides = self.strides
+                descr = self.dtype.numpy_dtype()
+            elif issubclass(self.dtype, ctypes.Array):
+                # vector type, flatten the dimensions into one tuple
+                arr_shape = (*self.shape, *self.dtype._shape_)
+                dtype_strides = strides_from_shape(self.dtype._shape_, self.dtype._type_)
+                arr_strides = (*self.strides, *dtype_strides)
+                descr = None
             else:
-                self.strides = strides
-                self.is_contiguous = strides[:ndim] == contiguous_strides[:ndim]
+                # scalar type
+                arr_shape = self.shape
+                arr_strides = self.strides
+                descr = None
+            self._array_interface = {
+                "data": (self.ptr if self.ptr is not None else 0, False),
+                "shape": tuple(arr_shape),
+                "strides": tuple(arr_strides),
+                "typestr": type_typestr(self.dtype),
+                "descr": descr,  # optional description of structured array layout
+                "version": 3,
+            }
-            # store flat shape (including type shape)
+        return self._array_interface
-            if isinstance(dtype, type) and issubclass(dtype, ctypes.Array):
+    @property
+    def __cuda_array_interface__(self):
+        # raising an AttributeError here makes hasattr() return False
+        if self.device is None or not self.device.is_cuda:
+            raise AttributeError(f"__cuda_array_interface__ is not supported because device is {self.device}")
+        if self._array_interface is None:
+            # get flat shape (including type shape)
+            if issubclass(self.dtype, ctypes.Array):
                 # vector type, flatten the dimensions into one tuple
                 arr_shape = (*self.shape, *self.dtype._shape_)
                 dtype_strides = strides_from_shape(self.dtype._shape_, self.dtype._type_)
@@ -1335,44 +1835,18 @@ class array(Array):
                 arr_shape = self.shape
                 arr_strides = self.strides
-            if device.is_cpu:
-                self.__array_interface__ = {
-                    "data": (self.ptr, False),
-                    "shape": tuple(arr_shape),
-                    "strides": tuple(arr_strides),
-                    "typestr": type_typestr(self.dtype),
-                    "version": 3,
-                }
-            # set up cuda array interface access so we can treat this object as a Torch tensor
-            elif device.is_cuda:
-                self.__cuda_array_interface__ = {
-                    "data": (self.ptr, False),
-                    "shape": tuple(arr_shape),
-                    "strides": tuple(arr_strides),
-                    "typestr": type_typestr(self.dtype),
-                    "version": 2,
-                }
-            # controls if gradients will be computed by wp.Tape
-            # this will trigger allocation of a gradient array if it doesn't exist already
-            self.requires_grad = requires_grad
-        else:
-            # array has no data
-            self.strides = (0,) * self.ndim
-            self.is_contiguous = False
-            self.requires_grad = False
+            self._array_interface = {
+                "data": (self.ptr if self.ptr is not None else 0, False),
+                "shape": tuple(arr_shape),
+                "strides": tuple(arr_strides),
+                "typestr": type_typestr(self.dtype),
+                "version": 2,
+            }
-        self.ctype = None
+        return self._array_interface
     def __del__(self):
-        if self.owner and self.device is not None and self.ptr is not None:
-            # TODO: ill-timed gc could trigger superfluous context switches here
-            #       Delegate to a separate thread? (e.g., device_free_async)
-            if self.device.is_capturing:
-                raise RuntimeError(f"Cannot free memory on device {self.device} while graph capture is active")
+        if self.owner:
             # use CUDA context guard to avoid side effects during garbage collection
             with self.device.context_guard:
                 self.device.allocator.free(self.ptr, self.capacity, self.pinned)
@@ -1385,7 +1859,7 @@ class array(Array):
             # for 'empty' arrays we just return the type information, these are used in kernel function signatures
             return f"array{self.dtype}"
         else:
-            return str(self.to("cpu").numpy())
+            return str(self.numpy())
     def __getitem__(self, key):
         if isinstance(key, int):
@@ -1436,7 +1910,7 @@ class array(Array):
                 if stop < 0:
                     stop = self.shape[idx] + stop
-                if start < 0 or start > self.shape[idx] - 1:
+                if start < 0 or start >= self.shape[idx]:
                     raise RuntimeError(f"Invalid indexing in slice: {start}:{stop}:{step}")
                 if stop < 1 or stop > self.shape[idx]:
                     raise RuntimeError(f"Invalid indexing in slice: {start}:{stop}:{step}")
@@ -1460,23 +1934,37 @@ class array(Array):
                 start = k
                 if start < 0:
                     start = self.shape[idx] + start
-                if start < 0 or start > self.shape[idx] - 1:
+                if start < 0 or start >= self.shape[idx]:
                     raise RuntimeError(f"Invalid indexing in slice: {k}")
                 new_dim -= 1
                 ptr_offset += self.strides[idx] * start
+        # handle grad
+        if self.grad is not None:
+            new_grad = array(
+                ptr=self.grad.ptr + ptr_offset if self.grad.ptr is not None else None,
+                dtype=self.grad.dtype,
+                shape=tuple(new_shape),
+                strides=tuple(new_strides),
+                device=self.grad.device,
+                pinned=self.grad.pinned,
+                owner=False,
+            )
+            # store back-ref to stop data being destroyed
+            new_grad._ref = self.grad
+        else:
+            new_grad = None
         a = array(
+            ptr=self.ptr + ptr_offset if self.ptr is not None else None,
             dtype=self.dtype,
             shape=tuple(new_shape),
             strides=tuple(new_strides),
-            ptr=self.ptr + ptr_offset,
-            grad_ptr=(self.grad_ptr + ptr_offset if self.grad_ptr is not None else None),
-            capacity=self.capacity,
             device=self.device,
+            pinned=self.pinned,
             owner=False,
-            ndim=new_dim,
-            requires_grad=self.requires_grad,
+            grad=new_grad,
         )
         # store back-ref to stop data being destroyed
@@ -1494,7 +1982,7 @@ class array(Array):
     def __ctype__(self):
         if self.ctype is None:
             data = 0 if self.ptr is None else ctypes.c_uint64(self.ptr)
-            grad = 0 if self.grad_ptr is None else ctypes.c_uint64(self.grad_ptr)
+            grad = 0 if self.grad is None or self.grad.ptr is None else ctypes.c_uint64(self.grad.ptr)
             self.ctype = array_t(data=data, grad=grad, ndim=self.ndim, shape=self.shape, strides=self.strides)
         return self.ctype
@@ -1522,25 +2010,31 @@ class array(Array):
         return self._grad
     @grad.setter
-    def grad(self, value):
-        # trigger re-creation of C-representation
-        self.ctype = None
-        if value is None:
-            self.grad_ptr = None
+    def grad(self, grad):
+        if grad is None:
             self._grad = None
-            return
-        if self._grad is None:
-            self.grad_ptr = value.ptr
-            self._grad = value
+            self._requires_grad = False
         else:
-            self._grad.assign(value)
+            # make sure the given gradient array is compatible
+            if (
+                grad.dtype != self.dtype
+                or grad.shape != self.shape
+                or grad.strides != self.strides
+                or grad.device != self.device
+            ):
+                raise ValueError("The given gradient array is incompatible")
+            self._grad = grad
+            self._requires_grad = True
+        # trigger re-creation of C-representation
+        self.ctype = None
     @property
     def requires_grad(self):
         return self._requires_grad
     @requires_grad.setter
-    def requires_grad(self, value: bool):
+    def requires_grad(self, value: builtins.bool):
         if value and self._grad is None:
             self._alloc_grad()
         elif not value:
@@ -1548,18 +2042,15 @@ class array(Array):
         self._requires_grad = value
-    def _alloc_grad(self):
-        if self.grad_ptr is None:
-            num_bytes = self.size * type_size_in_bytes(self.dtype)
-            self.grad_ptr = self.device.allocator.alloc(num_bytes, pinned=self.pinned)
-            if self.grad_ptr is None:
-                raise RuntimeError("Memory allocation failed on device: {} for {} bytes".format(self.device, num_bytes))
-            with warp.ScopedStream(self.device.null_stream):
-                self.device.memset(self.grad_ptr, 0, num_bytes)
+        # trigger re-creation of C-representation
+        self.ctype = None
+    def _alloc_grad(self):
         self._grad = array(
-            ptr=self.grad_ptr, shape=self.shape, dtype=self.dtype, device=self.device, requires_grad=False, owner=False
+            dtype=self.dtype, shape=self.shape, strides=self.strides, device=self.device, pinned=self.pinned
         )
+        self._grad.zero_()
         # trigger re-creation of C-representation
         self.ctype = None
@@ -1568,171 +2059,195 @@ class array(Array):
         # member attributes available during code-gen (e.g.: d = array.shape[0])
         # Note: we use a shared dict for all array instances
         if array._vars is None:
-            from warp.codegen import Var
-            array._vars = {"shape": Var("shape", shape_t)}
+            array._vars = {"shape": warp.codegen.Var("shape", shape_t)}
         return array._vars
     def zero_(self):
-        if not self.is_contiguous:
-            raise RuntimeError("Assigning to non-contiguous arrays is unsupported.")
-        if self.device is not None and self.ptr is not None:
-            self.device.memset(
-                ctypes.c_void_p(self.ptr), ctypes.c_int(0), ctypes.c_size_t(self.size * type_size_in_bytes(self.dtype))
-            )
+        """Zeroes-out the array entries."""
+        if self.is_contiguous:
+            # simple memset is usually faster than generic fill
+            self.device.memset(self.ptr, 0, self.size * type_size_in_bytes(self.dtype))
+        else:
+            self.fill_(0)
     def fill_(self, value):
-        if not self.is_contiguous:
-            raise RuntimeError("Assigning to non-contiguous arrays is unsupported.")
-        if self.device is not None and self.ptr is not None:
-            if isinstance(value, ctypes.Array):
-                # in this case we're filling the array with a vector or
-                # something similar, eg arr.fill_(wp.vec3(1.0,2.0,3.0)).
-                # check input type:
-                value_type_ok = False
-                if issubclass(self.dtype, ctypes.Array):
-                    value_type_ok = (self.dtype._length_ == value._length_) and (self.dtype._type_ == value._type_)
-                if not value_type_ok:
-                    raise RuntimeError(
-                        "wp.array has Array type elements (eg vec, mat etc). Value type must match element type in wp.array.fill_() method"
-                    )
-                src = ctypes.cast(value, ctypes.POINTER(ctypes.c_void_p))
-                srcsize = value._length_ * ctypes.sizeof(value._type_)
-                dst = ctypes.cast(self.ptr, ctypes.POINTER(ctypes.c_int))
-                self.device.memtile(dst, src, srcsize, self.size)
+        """Set all array entries to `value`
+        args:
+            value: The value to set every array entry to. Must be convertible to the array's ``dtype``.
+        Raises:
+            ValueError: If `value` cannot be converted to the array's ``dtype``.
+        Examples:
+            ``fill_()`` can take lists or other sequences when filling arrays of vectors or matrices.
+            >>> arr = wp.zeros(2, dtype=wp.mat22)
+            >>> arr.numpy()
+            array([[[0., 0.],
+                    [0., 0.]],
+            <BLANKLINE>
+                   [[0., 0.],
+                    [0., 0.]]], dtype=float32)
+            >>> arr.fill_([[1, 2], [3, 4]])
+            >>> arr.numpy()
+            array([[[1., 2.],
+                    [3., 4.]],
+            <BLANKLINE>
+                   [[1., 2.],
+                    [3., 4.]]], dtype=float32)
+        """
+        if self.size == 0:
+            return
-            else:
-                # In this case we're just filling the array with a scalar,
-                # eg arr.fill_(1.0). If the elements are scalars, we need to
-                # set them all to "value", otherwise we need to set all the
-                # components of all the vector elements to "value":
-                # work out array element type:
-                elem_type = self.dtype._type_ if issubclass(self.dtype, ctypes.Array) else type_ctype(self.dtype)
-                elem_size = ctypes.sizeof(elem_type)
-                # convert value to array type
-                # we need a special case for float16 because it's annoying...
-                if types_equal(self.dtype, float16) or (
-                    hasattr(self.dtype, "_wp_scalar_type_") and types_equal(self.dtype._wp_scalar_type_, float16)
-                ):
-                    # special case for float16:
-                    # If you just do elem_type(value), it'll just convert "value"
-                    # to uint16 then interpret the bits as float16, which will
-                    # mess the data up. Instead, we use float_to_half_bits() to
-                    # convert "value" to a float16 and return its bits in a uint16:
-                    from warp.context import runtime
-                    src_value = elem_type(runtime.core.float_to_half_bits(ctypes.c_float(value)))
+        # try to convert the given value to the array dtype
+        try:
+            if isinstance(self.dtype, warp.codegen.Struct):
+                if isinstance(value, self.dtype.cls):
+                    cvalue = value.__ctype__()
+                elif value == 0:
+                    # allow zero-initializing structs using default constructor
+                    cvalue = self.dtype().__ctype__()
                 else:
-                    src_value = elem_type(value)
-                # use memset for these special cases because it's quicker (probably...):
-                total_bytes = self.size * type_size_in_bytes(self.dtype)
-                if elem_size in [1, 2, 4] and (total_bytes % 4 == 0):
-                    # interpret as a 4 byte integer:
-                    dest_value = ctypes.cast(ctypes.pointer(src_value), ctypes.POINTER(ctypes.c_int)).contents
-                    if elem_size == 1:
-                        # need to repeat the bits, otherwise we'll get an array interleaved with zeros:
-                        dest_value.value = dest_value.value & 0x000000FF
-                        dest_value.value = (
-                            dest_value.value
-                            + (dest_value.value << 8)
-                            + (dest_value.value << 16)
-                            + (dest_value.value << 24)
-                        )
-                    elif elem_size == 2:
-                        # need to repeat the bits, otherwise we'll get an array interleaved with zeros:
-                        dest_value.value = dest_value.value & 0x0000FFFF
-                        dest_value.value = dest_value.value + (dest_value.value << 16)
-                    self.device.memset(
-                        ctypes.cast(self.ptr, ctypes.POINTER(ctypes.c_int)), dest_value, ctypes.c_size_t(total_bytes)
+                    raise ValueError(
+                        f"Invalid initializer value for struct {self.dtype.cls.__name__}, expected struct instance or 0"
                     )
+            elif issubclass(self.dtype, ctypes.Array):
+                # vector/matrix
+                cvalue = self.dtype(value)
+            else:
+                # scalar
+                if type(value) in warp.types.scalar_types:
+                    value = value.value
+                if self.dtype == float16:
+                    cvalue = self.dtype._type_(float_to_half_bits(value))
                 else:
-                    num_elems = self.size * self.dtype._length_ if issubclass(self.dtype, ctypes.Array) else self.size
-                    dst = ctypes.cast(self.ptr, ctypes.POINTER(ctypes.c_int))
-                    self.device.memtile(dst, ctypes.pointer(src_value), elem_size, num_elems)
+                    cvalue = self.dtype._type_(value)
+        except Exception as e:
+            raise ValueError(f"Failed to convert the value to the array data type: {e}")
+        cvalue_ptr = ctypes.pointer(cvalue)
+        cvalue_size = ctypes.sizeof(cvalue)
+        # prefer using memtile for contiguous arrays, because it should be faster than generic fill
+        if self.is_contiguous:
+            self.device.memtile(self.ptr, cvalue_ptr, cvalue_size, self.size)
+        else:
+            carr = self.__ctype__()
+            carr_ptr = ctypes.pointer(carr)
+            if self.device.is_cuda:
+                warp.context.runtime.core.array_fill_device(
+                    self.device.context, carr_ptr, ARRAY_TYPE_REGULAR, cvalue_ptr, cvalue_size
+                )
+            else:
+                warp.context.runtime.core.array_fill_host(carr_ptr, ARRAY_TYPE_REGULAR, cvalue_ptr, cvalue_size)
-    # equivalent to wrapping src data in an array and copying to self
     def assign(self, src):
-        if isinstance(src, array):
+        """Wraps ``src`` in an :class:`warp.array` if it is not already one and copies the contents to ``self``."""
+        if is_array(src):
             warp.copy(self, src)
         else:
-            warp.copy(self, array(src, dtype=self.dtype, copy=False, device="cpu"))
+            warp.copy(self, array(data=src, dtype=self.dtype, copy=False, device="cpu"))
-    # convert array to ndarray (alias memory through array interface)
     def numpy(self):
-        # use the CUDA default stream for synchronous behaviour with other streams
-        with warp.ScopedStream(self.device.null_stream):
-            if self.ptr is None:
-                return np.empty(shape=self.shape, dtype=self.dtype)
+        """Converts the array to a :class:`numpy.ndarray` (aliasing memory through the array interface protocol)
+        If the array is on the GPU, a synchronous device-to-host copy (on the CUDA default stream) will be
+        automatically performed to ensure that any outstanding work is completed.
+        """
+        if self.ptr:
+            # use the CUDA default stream for synchronous behaviour with other streams
+            with warp.ScopedStream(self.device.null_stream):
+                a = self.to("cpu", requires_grad=False)
+            # convert through __array_interface__
+            # Note: this handles arrays of structs using `descr`, so the result will be a structured NumPy array
+            return np.array(a, copy=False)
+        else:
+            # return an empty numpy array with the correct dtype and shape
+            if isinstance(self.dtype, warp.codegen.Struct):
+                npdtype = self.dtype.numpy_dtype()
+                npshape = self.shape
+            elif issubclass(self.dtype, ctypes.Array):
+                npdtype = warp_type_to_np_dtype[self.dtype._wp_scalar_type_]
+                npshape = (*self.shape, *self.dtype._shape_)
             else:
-                a = self.to("cpu")
+                npdtype = warp_type_to_np_dtype[self.dtype]
+                npshape = self.shape
+            return np.empty(npshape, dtype=npdtype)
-                if isinstance(self.dtype, warp.codegen.Struct):
-                    # Note: cptr holds a backref to the source array to avoid it being deallocated
-                    p = a.cptr()
-                    return np.ctypeslib.as_array(p, self.shape)
-                else:
-                    # convert through array interface
-                    return np.array(a, copy=False)
-    # return a ctypes cast of the array address
-    # note that accesses to this object are *not* bounds checked
     def cptr(self):
-        if self.device != "cpu":
-            raise RuntimeError("Accessing array memory through a ctypes ptr is only supported for CPU arrays.")
-        p = ctypes.cast(self.ptr, ctypes.POINTER(self.dtype.ctype))
+        """Return a ctypes cast of the array address.
+        Notes:
+        #. Only CPU arrays support this method.
+        #. The array must be contiguous.
+        #. Accesses to this object are **not** bounds checked.
+        #. For ``float16`` types, a pointer to the internal ``uint16`` representation is returned.
+        """
+        if not self.ptr:
+            return None
+        if self.device != "cpu" or not self.is_contiguous:
+            raise RuntimeError(
+                "Accessing array memory through a ctypes ptr is only supported for contiguous CPU arrays."
+            )
+        if isinstance(self.dtype, warp.codegen.Struct):
+            p = ctypes.cast(self.ptr, ctypes.POINTER(self.dtype.ctype))
+        else:
+            p = ctypes.cast(self.ptr, ctypes.POINTER(self.dtype._type_))
         # store backref to the underlying array to avoid it being deallocated
         p._ref = self
         return p
-    # returns a flattened list of items in the array as a Python list
     def list(self):
-        a = self.to("cpu").flatten()
-        # Note: cptr holds a backref to the source array to avoid it being deallocated
-        p = a.cptr()
+        """Returns a flattened list of items in the array as a Python list."""
+        a = self.numpy()
+        if isinstance(self.dtype, warp.codegen.Struct):
+            # struct
+            a = a.flatten()
+            data = a.ctypes.data
+            stride = a.strides[0]
+            return [self.dtype.from_ptr(data + i * stride) for i in range(self.size)]
+        elif issubclass(self.dtype, ctypes.Array):
+            # vector/matrix - flatten, but preserve inner vector/matrix dimensions
+            a = a.reshape((self.size, *self.dtype._shape_))
+            data = a.ctypes.data
+            stride = a.strides[0]
+            return [self.dtype.from_ptr(data + i * stride) for i in range(self.size)]
+        else:
+            # scalar
+            return list(a.flatten())
-        return p[:a.size]
-    # convert data from one device to another, nop if already on device
-    def to(self, device):
+    def to(self, device, requires_grad=None):
+        """Returns a Warp array with this array's data moved to the specified device, no-op if already on device."""
         device = warp.get_device(device)
         if self.device == device:
             return self
         else:
-            dest = warp.empty(shape=self.shape, dtype=self.dtype, device=device, requires_grad=self.requires_grad)
-            # to copy between devices, array must be contiguous
-            warp.copy(dest, self.contiguous())
-            return dest
+            return warp.clone(self, device=device, requires_grad=requires_grad)
     def flatten(self):
+        """Returns a zero-copy view of the array collapsed to 1-D. Only supported for contiguous arrays."""
+        if self.ndim == 1:
+            return self
         if not self.is_contiguous:
             raise RuntimeError("Flattening non-contiguous arrays is unsupported.")
         a = array(
+            ptr=self.ptr,
             dtype=self.dtype,
             shape=(self.size,),
-            strides=(type_size_in_bytes(self.dtype),),
-            ptr=self.ptr,
-            grad_ptr=self.grad_ptr,
-            capacity=self.capacity,
             device=self.device,
+            pinned=self.pinned,
             copy=False,
             owner=False,
-            ndim=1,
-            requires_grad=self.requires_grad,
+            grad=None if self.grad is None else self.grad.flatten(),
         )
         # store back-ref to stop data being destroyed
@@ -1740,6 +2255,11 @@ class array(Array):
         return a
     def reshape(self, shape):
+        """Returns a reshaped array. Only supported for contiguous arrays.
+        Args:
+            shape : An int or tuple of ints specifying the shape of the returned array.
+        """
         if not self.is_contiguous:
             raise RuntimeError("Reshaping non-contiguous arrays is unsupported.")
@@ -1748,7 +2268,7 @@ class array(Array):
             raise RuntimeError("shape parameter is required.")
         if isinstance(shape, int):
             shape = (shape,)
-        elif isinstance(shape, List):
+        elif not isinstance(shape, tuple):
             shape = tuple(shape)
         if len(shape) > ARRAY_MAX_DIMS:
@@ -1756,6 +2276,23 @@ class array(Array):
                 f"Arrays may only have {ARRAY_MAX_DIMS} dimensions maximum, trying to create array with {len(shape)} dims."
             )
+        # check for -1 dimension and reformat
+        if -1 in shape:
+            idx = self.size
+            denom = 1
+            minus_one_count = 0
+            for i, d in enumerate(shape):
+                if d == -1:
+                    idx = i
+                    minus_one_count += 1
+                else:
+                    denom *= d
+            if minus_one_count > 1:
+                raise RuntimeError("Cannot infer shape if more than one index is -1.")
+            new_shape = list(shape)
+            new_shape[idx] = int(self.size / denom)
+            shape = tuple(new_shape)
         size = 1
         for d in shape:
             size *= d
@@ -1764,17 +2301,15 @@ class array(Array):
             raise RuntimeError("Reshaped array must have the same total size as the original.")
         a = array(
+            ptr=self.ptr,
             dtype=self.dtype,
             shape=shape,
             strides=None,
-            ptr=self.ptr,
-            grad_ptr=self.grad_ptr,
-            capacity=self.capacity,
             device=self.device,
+            pinned=self.pinned,
             copy=False,
             owner=False,
-            ndim=len(shape),
-            requires_grad=self.requires_grad,
+            grad=None if self.grad is None else self.grad.reshape(shape),
         )
         # store back-ref to stop data being destroyed
@@ -1782,49 +2317,55 @@ class array(Array):
         return a
     def view(self, dtype):
+        """Returns a zero-copy view of this array's memory with a different data type.
+        ``dtype`` must have the same byte size as the array's native ``dtype``.
+        """
         if type_size_in_bytes(dtype) != type_size_in_bytes(self.dtype):
-            raise RuntimeError("cannot reinterpret cast dtypes of unequal byte size")
-        else:
-            # return an alias of the array memory with different type information
-            a = array(
-                data=None,
-                dtype=dtype,
-                shape=self.shape,
-                strides=self.strides,
-                ptr=self.ptr,
-                grad_ptr=self.grad_ptr,
-                capacity=self.capacity,
-                device=self.device,
-                copy=False,
-                owner=False,
-                ndim=self.ndim,
-                requires_grad=self.requires_grad,
-            )
+            raise RuntimeError("Cannot cast dtypes of unequal byte size")
-            a._ref = self
-            return a
+        # return an alias of the array memory with different type information
+        a = array(
+            ptr=self.ptr,
+            dtype=dtype,
+            shape=self.shape,
+            strides=self.strides,
+            device=self.device,
+            pinned=self.pinned,
+            copy=False,
+            owner=False,
+            grad=None if self.grad is None else self.grad.view(dtype),
+        )
+        a._ref = self
+        return a
     def contiguous(self):
+        """Returns a contiguous array with this array's data. No-op if array is already contiguous."""
         if self.is_contiguous:
             return self
         a = warp.empty_like(self)
         warp.copy(a, self)
         return a
-    # note: transpose operation will return an array with a non-contiguous access pattern
     def transpose(self, axes=None):
+        """Returns a zero-copy view of the array with axes transposed.
+        Note: The transpose operation will return an array with a non-contiguous access pattern.
+        Args:
+            axes (optional): Specifies how the axes are permuted. If not specified, the axes order will be reversed.
+        """
         # noop if 1d array
-        if len(self.shape) == 1:
+        if self.ndim == 1:
             return self
         if axes is None:
             # reverse the order of the axes
             axes = range(self.ndim)[::-1]
-        if len(axes) != len(self.shape):
+        elif len(axes) != len(self.shape):
             raise RuntimeError("Length of parameter axes must be equal in length to array shape")
         shape = []
         strides = []
         for a in axes:
@@ -1836,20 +2377,19 @@ class array(Array):
             strides.append(self.strides[a])
         a = array(
-            data=None,
+            ptr=self.ptr,
             dtype=self.dtype,
             shape=tuple(shape),
             strides=tuple(strides),
-            ptr=self.ptr,
-            grad_ptr=self.grad_ptr,
-            capacity=self.capacity,
             device=self.device,
+            pinned=self.pinned,
             copy=False,
             owner=False,
-            ndim=self.ndim,
-            requires_grad=self.requires_grad,
+            grad=None if self.grad is None else self.grad.transpose(axes=axes),
         )
+        a.is_transposed = not self.is_transposed
         a._ref = self
         return a
@@ -1878,12 +2418,13 @@ def array4d(*args, **kwargs):
     return array(*args, **kwargs)
+# TODO: Rewrite so that we take only shape, not length and optional shape
 def from_ptr(ptr, length, dtype=None, shape=None, device=None):
     return array(
         dtype=dtype,
         length=length,
         capacity=length * type_size_in_bytes(dtype),
-        ptr=ctypes.cast(ptr, ctypes.POINTER(ctypes.c_size_t)).contents.value,
+        ptr=0 if ptr == 0 else ctypes.cast(ptr, ctypes.POINTER(ctypes.c_size_t)).contents.value,
         shape=shape,
         device=device,
         owner=False,
@@ -1891,12 +2432,113 @@ def from_ptr(ptr, length, dtype=None, shape=None, device=None):
     )
-class indexedarray(Generic[T]):
+# A base class for non-contiguous arrays, providing the implementation of common methods like
+# contiguous(), to(), numpy(), list(), assign(), zero_(), and fill_().
+class noncontiguous_array_base(Generic[T]):
+    """Shared implementation of common array methods for non-contiguous array types.
+    The methods below read ``self.device``, ``self.dtype`` and ``self.size`` and call ``self.__ctype__()``.
+    None of these are set in this class, so the concrete array class has to supply them.
+    """
+    def __init__(self, array_type_id):
+        # array type identifier; passed to the native fill routines in fill_()
+        self.type_id = array_type_id
+        self.is_contiguous = False
+    def contiguous(self):
+        """Returns a contiguous copy: allocates with ``warp.empty_like()`` and copies this array into it."""
+        a = warp.empty_like(self)
+        warp.copy(a, self)
+        return a
+    def to(self, device):
+        """Returns this array on ``device``.
+        Returns ``self`` unchanged if it already lives on that device, otherwise the result of ``warp.clone()``.
+        """
+        device = warp.get_device(device)
+        if self.device == device:
+            return self
+        else:
+            return warp.clone(self, device=device)
+    def numpy(self):
+        """Returns the result of calling ``numpy()`` on a contiguous copy of this array."""
+        # use the CUDA default stream for synchronous behaviour with other streams
+        with warp.ScopedStream(self.device.null_stream):
+            return self.contiguous().numpy()
+    def list(self):
+        """Returns a flattened list of items in the array as a Python list (via a contiguous copy)."""
+        # use the CUDA default stream for synchronous behaviour with other streams
+        with warp.ScopedStream(self.device.null_stream):
+            return self.contiguous().list()
+    def assign(self, src):
+        """Copies ``src`` into this array.
+        If ``src`` is not already an array, it is first wrapped in a CPU array of this array's ``dtype``.
+        """
+        if is_array(src):
+            warp.copy(self, src)
+        else:
+            warp.copy(self, array(data=src, dtype=self.dtype, copy=False, device="cpu"))
+    def zero_(self):
+        """Fills the array with zeros; equivalent to ``fill_(0)``."""
+        self.fill_(0)
+    def fill_(self, value):
+        """Sets every element of the array to ``value``. Does nothing if the array is empty.
+        Args:
+            value: For struct dtypes, an instance of the struct class or ``0`` (a default-constructed struct is used).
+                For vector/matrix dtypes, any value accepted by the dtype constructor.
+                For scalar dtypes, a value accepted by the underlying ctypes type, or a Warp scalar instance.
+        Raises:
+            ValueError: If ``value`` cannot be converted to the array's ``dtype``.
+        """
+        if self.size == 0:
+            return
+        # try to convert the given value to the array dtype
+        try:
+            if isinstance(self.dtype, warp.codegen.Struct):
+                if isinstance(value, self.dtype.cls):
+                    cvalue = value.__ctype__()
+                elif value == 0:
+                    # allow zero-initializing structs using default constructor
+                    cvalue = self.dtype().__ctype__()
+                else:
+                    raise ValueError(
+                        f"Invalid initializer value for struct {self.dtype.cls.__name__}, expected struct instance or 0"
+                    )
+            elif issubclass(self.dtype, ctypes.Array):
+                # vector/matrix
+                cvalue = self.dtype(value)
+            else:
+                # scalar
+                if type(value) in warp.types.scalar_types:
+                    # unwrap Warp scalar instances to their plain value
+                    value = value.value
+                if self.dtype == float16:
+                    # half-precision values are passed as their bit pattern
+                    cvalue = self.dtype._type_(float_to_half_bits(value))
+                else:
+                    cvalue = self.dtype._type_(value)
+        except Exception as e:
+            # NOTE: this also catches the ValueError raised in the struct branch above and re-wraps it
+            raise ValueError(f"Failed to convert the value to the array data type: {e}")
+        cvalue_ptr = ctypes.pointer(cvalue)
+        cvalue_size = ctypes.sizeof(cvalue)
+        # C-representation of this array, passed by pointer to the native fill routine
+        ctype = self.__ctype__()
+        ctype_ptr = ctypes.pointer(ctype)
+        if self.device.is_cuda:
+            warp.context.runtime.core.array_fill_device(
+                self.device.context, ctype_ptr, self.type_id, cvalue_ptr, cvalue_size
+            )
+        else:
+            warp.context.runtime.core.array_fill_host(ctype_ptr, self.type_id, cvalue_ptr, cvalue_size)
+# helper to check index array properties
+def check_index_array(indices, expected_device):
+    """Validates an index array used to build an indexed array.
+    Args:
+        indices: The candidate index array.
+        expected_device: The device the index array has to live on (the data array's device).
+    Raises:
+        ValueError: If ``indices`` is not a Warp ``array``, is not one-dimensional,
+            does not have dtype ``int32``, or lives on a device other than ``expected_device``.
+    """
+    if not isinstance(indices, array):
+        raise ValueError(f"Indices must be a Warp array, got {type(indices)}")
+    if indices.ndim != 1:
+        raise ValueError(f"Index array must be one-dimensional, got {indices.ndim}")
+    if indices.dtype != int32:
+        raise ValueError(f"Index array must use int32, got dtype {indices.dtype}")
+    if indices.device != expected_device:
+        # parentheses in the message are balanced around each device name
+        raise ValueError(f"Index array device ({indices.device}) does not match data array device ({expected_device})")
+class indexedarray(noncontiguous_array_base[T]):
     # member attributes available during code-gen (e.g.: d = arr.shape[0])
     # (initialized when needed)
     _vars = None
     def __init__(self, data: array = None, indices: Union[array, List[array]] = None, dtype=None, ndim=None):
+        super().__init__(ARRAY_TYPE_INDEXED)
         # canonicalize types
         if dtype is not None:
             if dtype == int:
@@ -1926,17 +2568,6 @@ class indexedarray(Generic[T]):
             shape = list(data.shape)
             if indices is not None:
-                # helper to check index array properties
-                def check_index_array(inds, data):
-                    if inds.ndim != 1:
-                        raise ValueError(f"Index array must be one-dimensional, got {inds.ndim}")
-                    if inds.dtype != int32:
-                        raise ValueError(f"Index array must use int32, got dtype {inds.dtype}")
-                    if inds.device != data.device:
-                        raise ValueError(
-                            f"Index array device ({inds.device} does not match data array device ({data.device}))"
-                        )
                 if isinstance(indices, (list, tuple)):
                     if len(indices) > self.ndim:
                         raise ValueError(
@@ -1944,16 +2575,14 @@ class indexedarray(Generic[T]):
                         )
                     for i in range(len(indices)):
-                        if isinstance(indices[i], array):
-                            check_index_array(indices[i], data)
+                        if indices[i] is not None:
+                            check_index_array(indices[i], data.device)
                             self.indices[i] = indices[i]
                             shape[i] = len(indices[i])
-                        elif indices[i] is not None:
-                            raise TypeError(f"Invalid index array type: {type(indices[i])}")
                 elif isinstance(indices, array):
                     # only a single index array was provided
-                    check_index_array(indices, data)
+                    check_index_array(indices, data.device)
                     self.indices[0] = indices
                     shape[0] = len(indices)
@@ -1975,13 +2604,15 @@ class indexedarray(Generic[T]):
         for d in self.shape:
             self.size *= d
-        self.is_contiguous = False
     def __len__(self):
         return self.shape[0]
     def __str__(self):
-        return f"indexedarray{self.dtype}"
+        if self.device is None:
+            # type annotation
+            return f"indexedarray{self.dtype}"
+        else:
+            return str(self.numpy())
     # construct a C-representation of the array for passing to kernels
     def __ctype__(self):
@@ -1992,48 +2623,9 @@ class indexedarray(Generic[T]):
         # member attributes available during code-gen (e.g.: d = arr.shape[0])
         # Note: we use a shared dict for all indexedarray instances
         if indexedarray._vars is None:
-            from warp.codegen import Var
-            indexedarray._vars = {"shape": Var("shape", shape_t)}
+            indexedarray._vars = {"shape": warp.codegen.Var("shape", shape_t)}
         return indexedarray._vars
-    def contiguous(self):
-        a = warp.empty_like(self)
-        warp.copy(a, self)
-        return a
-    # convert data from one device to another, nop if already on device
-    def to(self, device):
-        device = warp.get_device(device)
-        if self.device == device:
-            return self
-        else:
-            dest = warp.empty(shape=self.shape, dtype=self.dtype, device=device)
-            # to copy between devices, array must be contiguous
-            warp.copy(dest, self.contiguous())
-            return dest
-    # convert array to ndarray (alias memory through array interface)
-    def numpy(self):
-        # use the CUDA default stream for synchronous behaviour with other streams
-        with warp.ScopedStream(self.device.null_stream):
-            a = self.contiguous().to("cpu")
-            if isinstance(self.dtype, warp.codegen.Struct):
-                p = ctypes.cast(a.ptr, ctypes.POINTER(a.dtype.ctype))
-                np.ctypeslib.as_array(p, self.shape)
-            else:
-                # convert through array interface
-                return np.array(a, copy=False)
-    # returns a flattened list of items in the array as a Python list
-    def list(self):
-        a = self.flatten()
-        p = ctypes.cast(a.ptr, ctypes.POINTER(a.dtype.ctype))
-        return p[:a.size]
 # aliases for indexedarrays with small dimensions
 def indexedarray1d(*args, **kwargs):
@@ -2059,7 +2651,22 @@ def indexedarray4d(*args, **kwargs):
     return indexedarray(*args, **kwargs)
-array_types = (array, indexedarray)
+from warp.fabric import fabricarray, indexedfabricarray  # noqa: E402
+array_types = (array, indexedarray, fabricarray, indexedfabricarray)
+def array_type_id(a):
+    """Returns the ``ARRAY_TYPE_*`` constant that corresponds to the array class of ``a``.
+    Raises:
+        ValueError: If ``a`` is not an instance of any class in ``array_types``.
+    """
+    # guard clauses: the first matching class wins
+    if isinstance(a, array):
+        return ARRAY_TYPE_REGULAR
+    if isinstance(a, indexedarray):
+        return ARRAY_TYPE_INDEXED
+    if isinstance(a, fabricarray):
+        return ARRAY_TYPE_FABRIC
+    if isinstance(a, indexedfabricarray):
+        return ARRAY_TYPE_FABRIC_INDEXED
+    raise ValueError("Invalid array type")
 class Bvh:
@@ -2117,11 +2724,11 @@ class Bvh:
                 with self.device.context_guard:
                     runtime.core.bvh_destroy_device(self.id)
-        except:
+        except Exception:
             pass
     def refit(self):
-        """Refit the Bvh. This should be called after users modify the `lowers` and `uppers` arrays."""
+        """Refit the BVH. This should be called after users modify the `lowers` and `uppers` arrays."""
         from warp.context import runtime
@@ -2141,7 +2748,7 @@ class Mesh:
         "indices": Var("indices", array(dtype=int32)),
     }
-    def __init__(self, points=None, indices=None, velocities=None):
+    def __init__(self, points=None, indices=None, velocities=None, support_winding_number=False):
         """Class representing a triangle mesh.
         Attributes:
@@ -2152,6 +2759,7 @@ class Mesh:
             points (:class:`warp.array`): Array of vertex positions of type :class:`warp.vec3`
             indices (:class:`warp.array`): Array of triangle indices of type :class:`warp.int32`, should be a 1d array with shape (num_tris, 3)
             velocities (:class:`warp.array`): Array of vertex velocities of type :class:`warp.vec3` (optional)
+            support_winding_number (bool): If true, the mesh will build additional data structures to support `wp.mesh_query_point_sign_winding_number()` queries
         """
         if points.device != indices.device:
@@ -2183,6 +2791,7 @@ class Mesh:
                 indices.__ctype__(),
                 int(len(points)),
                 int(indices.size / 3),
+                int(support_winding_number),
             )
         else:
             self.id = runtime.core.mesh_create_device(
@@ -2192,6 +2801,7 @@ class Mesh:
                 indices.__ctype__(),
                 int(len(points)),
                 int(indices.size / 3),
+                int(support_winding_number),
             )
     def __del__(self):
@@ -2204,7 +2814,7 @@ class Mesh:
                 # use CUDA context guard to avoid side effects during garbage collection
                 with self.device.context_guard:
                     runtime.core.mesh_destroy_device(self.id)
-        except:
+        except Exception:
             pass
     def refit(self):
@@ -2220,16 +2830,14 @@ class Mesh:
 class Volume:
+    #: Enum value to specify nearest-neighbor interpolation during sampling
     CLOSEST = constant(0)
+    #: Enum value to specify trilinear interpolation during sampling
     LINEAR = constant(1)
     def __init__(self, data: array):
         """Class representing a sparse grid.
-        Attributes:
-            CLOSEST (int): Enum value to specify nearest-neighbor interpolation during sampling
-            LINEAR (int): Enum value to specify trilinear interpolation during sampling
         Args:
             data (:class:`warp.array`): Array of bytes representing the volume in NanoVDB format
         """
@@ -2271,19 +2879,20 @@ class Volume:
                 with self.device.context_guard:
                     runtime.core.volume_destroy_device(self.id)
-        except:
+        except Exception:
             pass
-    def array(self):
+    def array(self) -> array:
+        """Returns the raw memory buffer of the Volume as an array"""
         buf = ctypes.c_void_p(0)
         size = ctypes.c_uint64(0)
         if self.device.is_cpu:
             self.context.core.volume_get_buffer_info_host(self.id, ctypes.byref(buf), ctypes.byref(size))
         else:
             self.context.core.volume_get_buffer_info_device(self.id, ctypes.byref(buf), ctypes.byref(size))
-        return array(ptr=buf.value, dtype=uint8, length=size.value, device=self.device, owner=False)
+        return array(ptr=buf.value, dtype=uint8, shape=size.value, device=self.device, owner=False)
-    def get_tiles(self):
+    def get_tiles(self) -> array:
         if self.id == 0:
             raise RuntimeError("Invalid Volume")
@@ -2294,11 +2903,9 @@ class Volume:
         else:
             self.context.core.volume_get_tiles_device(self.id, ctypes.byref(buf), ctypes.byref(size))
         num_tiles = size.value // (3 * 4)
-        return array(
-            ptr=buf.value, dtype=int32, shape=(num_tiles, 3), length=size.value, device=self.device, owner=True
-        )
+        return array(ptr=buf.value, dtype=int32, shape=(num_tiles, 3), device=self.device, owner=True)
-    def get_voxel_size(self):
+    def get_voxel_size(self) -> Tuple[float, float, float]:
         if self.id == 0:
             raise RuntimeError("Invalid Volume")
@@ -2307,7 +2914,13 @@ class Volume:
         return (dx.value, dy.value, dz.value)
     @classmethod
-    def load_from_nvdb(cls, file_or_buffer, device=None):
+    def load_from_nvdb(cls, file_or_buffer, device=None) -> Volume:
+        """Creates a Volume object from a NanoVDB file or in-memory buffer.
+        Returns:
+            A ``warp.Volume`` object.
+        """
         try:
             data = file_or_buffer.read()
         except AttributeError:
@@ -2336,6 +2949,90 @@ class Volume:
         data_array = array(np.frombuffer(grid_data, dtype=np.byte), device=device)
         return cls(data_array)
+    @classmethod
+    def load_from_numpy(
+        cls, ndarray: np.ndarray, min_world=(0.0, 0.0, 0.0), voxel_size=1.0, bg_value=0.0, device=None
+    ) -> Volume:
+        """Creates a Volume object from a dense 3D NumPy array.
+        This function is only supported for CUDA devices.
+        The three leading dimensions of the input are padded with ``bg_value`` up to the next multiple of 8
+        before the data is copied into the newly allocated volume.
+        Args:
+            ndarray: Dense voxel data with three leading spatial dimensions, plus a trailing dimension of size 3
+                when ``bg_value`` is array-like.
+            min_world: The 3D coordinate of the lower corner of the volume.
+            voxel_size: The size of each voxel in spatial coordinates.
+            bg_value: Background value, used for the padding and passed on to :meth:`allocate`.
+                An array-like value selects the vec3 copy kernel, an ``int`` the int32 kernel,
+                and anything else the float32 kernel.
+            device: The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".
+        Returns:
+            A ``warp.Volume`` object.
+        """
+        import math
+        is_vector = hasattr(bg_value, "__len__")
+        # round each of the three leading extents up to the next multiple of 8
+        target_shape = (
+            math.ceil(ndarray.shape[0] / 8) * 8,
+            math.ceil(ndarray.shape[1] / 8) * 8,
+            math.ceil(ndarray.shape[2] / 8) * 8,
+        )
+        if is_vector:
+            # vec3, assuming the numpy array is 4D
+            # np.empty() takes a shape; np.array() would instead build a 4-element array holding the shape values
+            padded_array = np.empty((target_shape[0], target_shape[1], target_shape[2], 3), dtype=np.single)
+            padded_array[:, :, :, :] = np.array(bg_value)
+            padded_array[0 : ndarray.shape[0], 0 : ndarray.shape[1], 0 : ndarray.shape[2], :] = ndarray
+        else:
+            padded_amount = (
+                target_shape[0] - ndarray.shape[0],
+                target_shape[1] - ndarray.shape[1],
+                target_shape[2] - ndarray.shape[2],
+            )
+            padded_array = np.pad(
+                ndarray,
+                ((0, padded_amount[0]), (0, padded_amount[1]), (0, padded_amount[2])),
+                mode="constant",
+                constant_values=bg_value,
+            )
+        shape = padded_array.shape
+        volume = warp.Volume.allocate(
+            min_world,
+            [
+                min_world[0] + (shape[0] - 1) * voxel_size,
+                min_world[1] + (shape[1] - 1) * voxel_size,
+                min_world[2] + (shape[2] - 1) * voxel_size,
+            ],
+            voxel_size,
+            bg_value=bg_value,
+            points_in_world_space=True,
+            translation=min_world,
+            device=device,
+        )
+        # Populate volume
+        if is_vector:
+            # launch over the three spatial dimensions only; the trailing axis holds the vec3 components
+            warp.launch(
+                warp.utils.copy_dense_volume_to_nano_vdb_v,
+                dim=(shape[0], shape[1], shape[2]),
+                inputs=[volume.id, warp.array(padded_array, dtype=warp.vec3, device=device)],
+                device=device,
+            )
+        elif isinstance(bg_value, int):
+            warp.launch(
+                warp.utils.copy_dense_volume_to_nano_vdb_i,
+                dim=shape,
+                inputs=[volume.id, warp.array(padded_array, dtype=warp.int32, device=device)],
+                device=device,
+            )
+        else:
+            warp.launch(
+                warp.utils.copy_dense_volume_to_nano_vdb_f,
+                dim=shape,
+                inputs=[volume.id, warp.array(padded_array, dtype=warp.float32, device=device)],
+                device=device,
+            )
+        return volume
     @classmethod
     def allocate(
         cls,
@@ -2346,9 +3043,11 @@ class Volume:
         translation=(0.0, 0.0, 0.0),
         points_in_world_space=False,
         device=None,
-    ):
+    ) -> Volume:
         """Allocate a new Volume based on the bounding box defined by min and max.
+        This function is only supported for CUDA devices.
         Allocate a volume that is large enough to contain voxels [min[0], min[1], min[2]] - [max[0], max[1], max[2]], inclusive.
         If points_in_world_space is true, then min and max are first converted to index space with the given voxel size and
         translation, and the volume is allocated with those.
@@ -2357,12 +3056,12 @@ class Volume:
         the resulting tiles will be available in the new volume.
         Args:
-            min (array-like): Lower 3D-coordinates of the bounding box in index space or world space, inclusive
-            max (array-like): Upper 3D-coordinates of the bounding box in index space or world space, inclusive
-            voxel_size (float): Voxel size of the new volume
+            min (array-like): Lower 3D coordinates of the bounding box in index space or world space, inclusive.
+            max (array-like): Upper 3D coordinates of the bounding box in index space or world space, inclusive.
+            voxel_size (float): Voxel size of the new volume.
             bg_value (float or array-like): Value of unallocated voxels of the volume, also defines the volume's type, a :class:`warp.vec3` volume is created if this is `array-like`, otherwise a float volume is created
-            translation (array-like): translation between the index and world spaces
-            device (Devicelike): Device the array lives on
+            translation (array-like): Translation between the index and world spaces.
+            device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".
         """
         if points_in_world_space:
@@ -2387,9 +3086,11 @@ class Volume:
     @classmethod
     def allocate_by_tiles(
         cls, tile_points: array, voxel_size: float, bg_value=0.0, translation=(0.0, 0.0, 0.0), device=None
-    ):
+    ) -> Volume:
         """Allocate a new Volume with active tiles for each point tile_points.
+        This function is only supported for CUDA devices.
         The smallest unit of allocation is a dense tile of 8x8x8 voxels.
         This is the primary method for allocating sparse volumes. It uses an array of points indicating the tiles that must be allocated.
@@ -2399,13 +3100,13 @@ class Volume:
         Args:
             tile_points (:class:`warp.array`): Array of positions that define the tiles to be allocated.
-                The array can be a 2d, N-by-3 array of :class:`warp.int32` values, indicating index space positions,
+                The array can be a 2D, N-by-3 array of :class:`warp.int32` values, indicating index space positions,
                 or can be a 1D array of :class:`warp.vec3` values, indicating world space positions.
                 Repeated points per tile are allowed and will be efficiently deduplicated.
-            voxel_size (float): Voxel size of the new volume
+            voxel_size (float): Voxel size of the new volume.
             bg_value (float or array-like): Value of unallocated voxels of the volume, also defines the volume's type, a :class:`warp.vec3` volume is created if this is `array-like`, otherwise a float volume is created
-            translation (array-like): translation between the index and world spaces
-            device (Devicelike): Device the array lives on
+            translation (array-like): Translation between the index and world spaces.
+            device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".
         """
         from warp.context import runtime
@@ -2442,7 +3143,7 @@ class Volume:
                 translation[2],
                 in_world_space,
             )
-        elif type(bg_value) == int:
+        elif isinstance(bg_value, int):
             volume.id = volume.context.core.volume_i_from_tiles_device(
                 volume.device.context,
                 ctypes.c_void_p(tile_points.ptr),
@@ -2473,6 +3174,67 @@ class Volume:
         return volume
+# definition just for kernel type (cannot be a parameter), see mesh.h
+# NOTE: its layout must match the corresponding struct defined in C.
+# NOTE: it needs to be defined after `indexedarray` to workaround a circular import issue.
+class mesh_query_point_t:
+    """Output for the mesh query point functions.
+    Attributes:
+        result (bool): Whether a point is found within the given constraints.
+        sign (float32): A value < 0 if query point is inside the mesh, >=0 otherwise.
+                        Note that the mesh must be watertight for this to be robust.
+        face (int32): Index of the closest face.
+        u (float32): Barycentric u coordinate of the closest point.
+        v (float32): Barycentric v coordinate of the closest point.
+    See Also:
+        :func:`mesh_query_point`, :func:`mesh_query_point_no_sign`,
+        :func:`mesh_query_furthest_point_no_sign`,
+        :func:`mesh_query_point_sign_normal`,
+        and :func:`mesh_query_point_sign_winding_number`.
+    """
+    # imported at class scope, so ``Var`` is only resolved when this class body executes
+    from warp.codegen import Var
+    # field name -> Var(name, type), one entry per documented attribute
+    # NOTE(review): presumably consumed by code generation like ``indexedarray._vars`` -- confirm
+    vars = {
+        "result": Var("result", bool),
+        "sign": Var("sign", float32),
+        "face": Var("face", int32),
+        "u": Var("u", float32),
+        "v": Var("v", float32),
+    }
+# definition just for kernel type (cannot be a parameter), see mesh.h
+# NOTE: its layout must match the corresponding struct defined in C.
+class mesh_query_ray_t:
+    """Output for the mesh query ray functions.
+    Attributes:
+        result (bool): Whether a hit is found within the given constraints.
+        sign (float32): A value > 0 if the ray hit in front of the face, returns < 0 otherwise.
+        face (int32): Index of the closest face.
+        t (float32): Distance of the closest hit along the ray.
+        u (float32): Barycentric u coordinate of the closest hit.
+        v (float32): Barycentric v coordinate of the closest hit.
+        normal (vec3f): Face normal.
+    See Also:
+        :func:`mesh_query_ray`.
+    """
+    # imported at class scope, so ``Var`` is only resolved when this class body executes
+    from warp.codegen import Var
+    # field name -> Var(name, type), one entry per documented attribute
+    # NOTE(review): presumably consumed by code generation like ``indexedarray._vars`` -- confirm
+    vars = {
+        "result": Var("result", bool),
+        "sign": Var("sign", float32),
+        "face": Var("face", int32),
+        "t": Var("t", float32),
+        "u": Var("u", float32),
+        "v": Var("v", float32),
+        "normal": Var("normal", vec3),
+    }
 def matmul(
     a: array2d,
     b: array2d,
@@ -2480,7 +3242,7 @@ def matmul(
     d: array2d,
     alpha: float = 1.0,
     beta: float = 0.0,
-    allow_tf32x3_arith: bool = False,
+    allow_tf32x3_arith: builtins.bool = False,
     device=None,
 ):
     """Computes a generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
@@ -2509,6 +3271,11 @@ def matmul(
             "wp.matmul currently only supports operation between {A, B, C, D} matrices of the same type."
         )
+    if (not a.is_contiguous and not a.is_transposed) or (not b.is_contiguous and not b.is_transposed) or (not c.is_contiguous) or (not d.is_contiguous):
+        raise RuntimeError(
+            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B may be transposed."
+        )
     m = a.shape[0]
     n = b.shape[1]
     k = a.shape[1]
@@ -2543,13 +3310,13 @@ def matmul(
         ctypes.c_void_p(d.ptr),
         alpha,
         beta,
-        True,
-        True,
+        not a.is_transposed,
+        not b.is_transposed,
         allow_tf32x3_arith,
         1,
     )
     if not ret:
-        raise RuntimeError("Matmul failed.")
+        raise RuntimeError("matmul failed.")
 def adj_matmul(
@@ -2562,7 +3329,7 @@ def adj_matmul(
     adj_d: array2d,
     alpha: float = 1.0,
     beta: float = 0.0,
-    allow_tf32x3_arith: bool = False,
+    allow_tf32x3_arith: builtins.bool = False,
     device=None,
 ):
     """Computes the adjoint of a generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
@@ -2613,6 +3380,19 @@ def adj_matmul(
             "wp.adj_matmul currently only supports operation between {A, B, C, adj_D, adj_A, adj_B, adj_C} matrices of the same type."
         )
+    if (
+        (not a.is_contiguous and not a.is_transposed)
+        or (not b.is_contiguous and not b.is_transposed)
+        or (not c.is_contiguous)
+        or (not adj_a.is_contiguous and not adj_a.is_transposed)
+        or (not adj_b.is_contiguous and not adj_b.is_transposed)
+        or (not adj_c.is_contiguous)
+        or (not adj_d.is_contiguous)
+    ):
+        raise RuntimeError(
+            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B and their associated adjoints may be transposed."
+        )
     m = a.shape[0]
     n = b.shape[1]
     k = a.shape[1]
@@ -2633,75 +3413,105 @@ def adj_matmul(
     # cpu fallback if no cuda devices found
     if device == "cpu":
-        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose()))
-        adj_b.assign(alpha * (a.numpy().transpose() @ adj_d.numpy()))
-        adj_c.assign(beta * adj_d.numpy())
+        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose()) + adj_a.numpy())
+        adj_b.assign(alpha * (a.numpy().transpose() @ adj_d.numpy()) + adj_b.numpy())
+        adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
         return
     cc = device.arch
     # adj_a
-    ret = runtime.core.cutlass_gemm(
-        cc,
-        m,
-        k,
-        n,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(adj_d.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(adj_a.ptr),
-        alpha,
-        0.0,
-        True,
-        False,
-        allow_tf32x3_arith,
-        1,
-    )
-    if not ret:
-        raise RuntimeError("adj_matmul failed.")
+    if not a.is_transposed:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            m,
+            k,
+            n,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(adj_d.ptr),
+            ctypes.c_void_p(b.ptr),
+            ctypes.c_void_p(adj_a.ptr),
+            ctypes.c_void_p(adj_a.ptr),
+            alpha,
+            1.0,
+            True,
+            b.is_transposed,
+            allow_tf32x3_arith,
+            1,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
+    else:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            k,
+            m,
+            n,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(b.ptr),
+            ctypes.c_void_p(adj_d.ptr),
+            ctypes.c_void_p(adj_a.ptr),
+            ctypes.c_void_p(adj_a.ptr),
+            alpha,
+            1.0,
+            not b.is_transposed,
+            False,
+            allow_tf32x3_arith,
+            1,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
     # adj_b
-    ret = runtime.core.cutlass_gemm(
-        cc,
-        k,
-        n,
-        m,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(adj_d.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(adj_b.ptr),
-        alpha,
-        0.0,
-        False,
-        True,
-        allow_tf32x3_arith,
-        1,
-    )
-    if not ret:
-        raise RuntimeError("adj_matmul failed.")
+    if not b.is_transposed:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            k,
+            n,
+            m,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(a.ptr),
+            ctypes.c_void_p(adj_d.ptr),
+            ctypes.c_void_p(adj_b.ptr),
+            ctypes.c_void_p(adj_b.ptr),
+            alpha,
+            1.0,
+            a.is_transposed,
+            True,
+            allow_tf32x3_arith,
+            1,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
+    else:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            n,
+            k,
+            m,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(adj_d.ptr),
+            ctypes.c_void_p(a.ptr),
+            ctypes.c_void_p(adj_b.ptr),
+            ctypes.c_void_p(adj_b.ptr),
+            alpha,
+            1.0,
+            False,
+            not a.is_transposed,
+            allow_tf32x3_arith,
+            1,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
     # adj_c
-    ret = runtime.core.cutlass_gemm(
-        cc,
-        m,
-        n,
-        k,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(adj_d.ptr),
-        ctypes.c_void_p(adj_c.ptr),
-        0.0,
-        beta,
-        True,
-        True,
-        allow_tf32x3_arith,
-        1,
+    warp.launch(
+        kernel=warp.utils.add_kernel_2d,
+        dim=adj_c.shape,
+        inputs=[adj_c, adj_d, adj_d.dtype(beta)],
+        device=device,
+        record_tape=False
     )
-    if not ret:
-        raise RuntimeError("adj_matmul failed.")
 def batched_matmul(
@@ -2711,7 +3521,7 @@ def batched_matmul(
     d: array3d,
     alpha: float = 1.0,
     beta: float = 0.0,
-    allow_tf32x3_arith: bool = False,
+    allow_tf32x3_arith: builtins.bool = False,
     device=None,
 ):
     """Computes a batched generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
@@ -2740,6 +3550,11 @@ def batched_matmul(
             "wp.batched_matmul currently only supports operation between {A, B, C, D} matrices of the same type."
         )
+    if (not a.is_contiguous and not a.is_transposed) or (not b.is_contiguous and not b.is_transposed) or (not c.is_contiguous) or (not d.is_contiguous):
+        raise RuntimeError(
+            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B may be transposed."
+        )
     m = a.shape[1]
     n = b.shape[2]
     k = a.shape[2]
@@ -2751,7 +3566,7 @@ def batched_matmul(
     if runtime.tape:
         runtime.tape.record_func(
-            backward=lambda: adj_matmul(
+            backward=lambda: adj_batched_matmul(
                 a, b, c, a.grad, b.grad, c.grad, d.grad, alpha, beta, allow_tf32x3_arith, device
             ),
             arrays=[a, b, c, d],
@@ -2762,26 +3577,55 @@ def batched_matmul(
         d.assign(alpha * np.matmul(a.numpy(), b.numpy()) + beta * c.numpy())
         return
+    # handle case in which batch_count exceeds max_batch_count, which is a CUDA array size maximum
+    max_batch_count = 65535
+    iters = int(batch_count / max_batch_count)
+    remainder = batch_count % max_batch_count
     cc = device.arch
+    for i in range(iters):
+        idx_start = i * max_batch_count
+        idx_end = (i + 1) * max_batch_count if i < iters - 1 else batch_count
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            m,
+            n,
+            k,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
+            ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
+            ctypes.c_void_p(c[idx_start:idx_end,:,:].ptr),
+            ctypes.c_void_p(d[idx_start:idx_end,:,:].ptr),
+            alpha,
+            beta,
+            not a.is_transposed,
+            not b.is_transposed,
+            allow_tf32x3_arith,
+            max_batch_count,
+        )
+        if not ret:
+            raise RuntimeError("Batched matmul failed.")
+    idx_start = iters * max_batch_count
     ret = runtime.core.cutlass_gemm(
         cc,
         m,
         n,
         k,
         type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(c.ptr),
-        ctypes.c_void_p(d.ptr),
+        ctypes.c_void_p(a[idx_start:,:,:].ptr),
+        ctypes.c_void_p(b[idx_start:,:,:].ptr),
+        ctypes.c_void_p(c[idx_start:,:,:].ptr),
+        ctypes.c_void_p(d[idx_start:,:,:].ptr),
         alpha,
         beta,
-        True,
-        True,
+        not a.is_transposed,
+        not b.is_transposed,
         allow_tf32x3_arith,
-        batch_count,
+        remainder,
     )
     if not ret:
-        raise RuntimeError("Batched matmul failed.")
+        raise RuntimeError("Batched matmul failed.")
 def adj_batched_matmul(
@@ -2794,7 +3638,7 @@ def adj_batched_matmul(
     adj_d: array3d,
     alpha: float = 1.0,
     beta: float = 0.0,
-    allow_tf32x3_arith: bool = False,
+    allow_tf32x3_arith: builtins.bool = False,
     device=None,
 ):
     """Computes a batched generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
@@ -2861,78 +3705,215 @@ def adj_batched_matmul(
             )
         )
+    if (
+        (not a.is_contiguous and not a.is_transposed)
+        or (not b.is_contiguous and not b.is_transposed)
+        or (not c.is_contiguous)
+        or (not adj_a.is_contiguous and not adj_a.is_transposed)
+        or (not adj_b.is_contiguous and not adj_b.is_transposed)
+        or (not adj_c.is_contiguous)
+        or (not adj_d.is_contiguous)
+    ):
+        raise RuntimeError(
+            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B and their associated adjoints may be transposed."
+        )
     # cpu fallback if no cuda devices found
     if device == "cpu":
-        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose((0, 2, 1))))
-        adj_b.assign(alpha * np.matmul(a.numpy().transpose((0, 2, 1)), adj_d.numpy()))
-        adj_c.assign(beta * adj_d.numpy())
+        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose((0, 2, 1))) + adj_a.numpy())
+        adj_b.assign(alpha * np.matmul(a.numpy().transpose((0, 2, 1)), adj_d.numpy()) + adj_b.numpy())
+        adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
         return
+    # handle case in which batch_count exceeds max_batch_count, which is a CUDA array size maximum
+    max_batch_count = 65535
+    iters = int(batch_count / max_batch_count)
+    remainder = batch_count % max_batch_count
     cc = device.arch
+    for i in range(iters):
+        idx_start = i * max_batch_count
+        idx_end = (i + 1) * max_batch_count if i < iters - 1 else batch_count
+        # adj_a
+        if not a.is_transposed:
+            ret = runtime.core.cutlass_gemm(
+                cc,
+                m,
+                k,
+                n,
+                type_typestr(a.dtype).encode(),
+                ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
+                alpha,
+                1.0,
+                True,
+                b.is_transposed,
+                allow_tf32x3_arith,
+                max_batch_count,
+            )
+            if not ret:
+                raise RuntimeError("adj_matmul failed.")
+        else:
+            ret = runtime.core.cutlass_gemm(
+                cc,
+                k,
+                m,
+                n,
+                type_typestr(a.dtype).encode(),
+                ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
+                alpha,
+                1.0,
+                not b.is_transposed,
+                False,
+                allow_tf32x3_arith,
+                max_batch_count,
+            )
+            if not ret:
+                raise RuntimeError("adj_matmul failed.")
+        # adj_b
+        if not b.is_transposed:
+            ret = runtime.core.cutlass_gemm(
+                cc,
+                k,
+                n,
+                m,
+                type_typestr(a.dtype).encode(),
+                ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
+                alpha,
+                1.0,
+                a.is_transposed,
+                True,
+                allow_tf32x3_arith,
+                max_batch_count,
+            )
+            if not ret:
+                raise RuntimeError("adj_matmul failed.")
+        else:
+            ret = runtime.core.cutlass_gemm(
+                cc,
+                n,
+                k,
+                m,
+                type_typestr(a.dtype).encode(),
+                ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
+                ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
+                alpha,
+                1.0,
+                False,
+                not a.is_transposed,
+                allow_tf32x3_arith,
+                max_batch_count,
+            )
+            if not ret:
+                raise RuntimeError("adj_matmul failed.")
+    idx_start = iters * max_batch_count
     # adj_a
-    ret = runtime.core.cutlass_gemm(
-        cc,
-        m,
-        k,
-        n,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(adj_d.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(adj_a.ptr),
-        alpha,
-        0.0,
-        True,
-        False,
-        allow_tf32x3_arith,
-        batch_count,
-    )
-    if not ret:
-        raise RuntimeError("adj_matmul failed.")
+    if not a.is_transposed:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            m,
+            k,
+            n,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
+            ctypes.c_void_p(b[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
+            alpha,
+            1.0,
+            True,
+            b.is_transposed,
+            allow_tf32x3_arith,
+            remainder,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
+    else:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            k,
+            m,
+            n,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(b[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
+            alpha,
+            1.0,
+            not b.is_transposed,
+            False,
+            allow_tf32x3_arith,
+            remainder,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
     # adj_b
-    ret = runtime.core.cutlass_gemm(
-        cc,
-        k,
-        n,
-        m,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(adj_d.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(adj_b.ptr),
-        alpha,
-        0.0,
-        False,
-        True,
-        allow_tf32x3_arith,
-        batch_count,
-    )
-    if not ret:
-        raise RuntimeError("adj_matmul failed.")
+    if not b.is_transposed:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            k,
+            n,
+            m,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(a[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
+            alpha,
+            1.0,
+            a.is_transposed,
+            True,
+            allow_tf32x3_arith,
+            remainder,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
+    else:
+        ret = runtime.core.cutlass_gemm(
+            cc,
+            n,
+            k,
+            m,
+            type_typestr(a.dtype).encode(),
+            ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
+            ctypes.c_void_p(a[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
+            ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
+            alpha,
+            1.0,
+            False,
+            not a.is_transposed,
+            allow_tf32x3_arith,
+            remainder,
+        )
+        if not ret:
+            raise RuntimeError("adj_matmul failed.")
     # adj_c
-    ret = runtime.core.cutlass_gemm(
-        cc,
-        m,
-        n,
-        k,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(adj_d.ptr),
-        ctypes.c_void_p(adj_c.ptr),
-        0.0,
-        beta,
-        True,
-        True,
-        allow_tf32x3_arith,
-        batch_count,
+    warp.launch(
+        kernel=warp.utils.add_kernel_3d,
+        dim=adj_c.shape,
+        inputs=[adj_c, adj_d, adj_d.dtype(beta)],
+        device=device,
+        record_tape=False
     )
-    if not ret:
-        raise RuntimeError("adj_matmul failed.")
 class HashGrid:
     def __init__(self, dim_x, dim_y, dim_z, device=None):
@@ -3001,7 +3982,7 @@ class HashGrid:
                 with self.device.context_guard:
                     runtime.core.hash_grid_destroy_device(self.id)
-        except:
+        except Exception:
             pass
@@ -3075,7 +4056,7 @@ class MarchingCubes:
         if error:
             raise RuntimeError(
-                "Error occured buffers may not be large enough, marching cubes required at least {num_verts} vertices, and {num_tris} triangles."
+                "Buffers may not be large enough, marching cubes required at least {num_verts} vertices, and {num_tris} triangles."
             )
         # resize the geometry arrays
@@ -3131,7 +4112,7 @@ def type_matches_template(arg_type, template_type):
         return True
     elif is_array(template_type):
         # ensure the argument type is a non-generic array with matching dtype and dimensionality
-        if type(arg_type) != type(template_type):
+        if type(arg_type) is not type(template_type):
             return False
         if not type_matches_template(arg_type.dtype, template_type.dtype):
             return False
@@ -3160,9 +4141,53 @@ def type_matches_template(arg_type, template_type):
     return True
+def infer_argument_types(args, template_types, arg_names=None):
+    """Infer a Warp type for each runtime argument.
+    Args:
+        args: Sequence of argument values whose types should be inferred.
+        template_types: Sequence of template types, one per argument. It is
+            only consulted when an argument is ``None``, in which case the
+            template must be an array type.
+        arg_names: Optional sequence of argument names used in error
+            messages. When omitted (or empty), the positional index is used.
+    Returns:
+        A list with one inferred type per entry of ``args``.
+    Raises:
+        RuntimeError: If ``args`` and ``template_types`` differ in length.
+        TypeError: If the type of an argument cannot be inferred.
+    """
+    if len(args) != len(template_types):
+        raise RuntimeError("Number of arguments must match number of template types.")
+    inferred = []
+    for index, value in enumerate(args):
+        value_type = type(value)
+        # label used by the error messages below
+        label = arg_names[index] if arg_names else str(index)
+        if value_type in warp.types.array_types:
+            # build an array type instance carrying only the dtype and rank
+            inferred.append(value_type(dtype=value.dtype, ndim=value.ndim))
+            continue
+        if value_type in warp.types.scalar_types:
+            inferred.append(value_type)
+            continue
+        if value_type in (int, float):
+            # canonicalize Python numbers through type_to_warp()
+            inferred.append(warp.types.type_to_warp(value_type))
+            continue
+        if hasattr(value_type, "_wp_scalar_type_"):
+            # vector/matrix type
+            inferred.append(value_type)
+            continue
+        if issubclass(value_type, warp.codegen.StructInstance):
+            # a struct instance is represented by its `_cls` attribute
+            inferred.append(value._cls)
+            continue
+        # NOTE: launch_bounds_t, shape_t, range_t and the query types
+        # (hash_grid_query_t, mesh_query_*_t, bvh_query_t) have no dedicated
+        # branch here.
+        if value is not None:
+            # TODO: attempt to figure out if it's a vector/matrix type given as a numpy array, list, etc.
+            raise TypeError(f"Unable to infer the type of argument '{label}', got {value_type}")
+        # allow passing None for arrays: fall back to the template's array type
+        template = template_types[index]
+        if not warp.types.is_array(template):
+            raise TypeError(f"Unable to infer the type of argument '{label}', got None")
+        inferred.append(type(template)(dtype=template.dtype, ndim=template.ndim))
+    return inferred
 simple_type_codes = {
     int: "i4",
     float: "f4",
+    builtins.bool: "b",
     bool: "b",
     str: "str",  # accepted by print()
     int8: "i1",
@@ -3181,6 +4206,8 @@ simple_type_codes = {
     launch_bounds_t: "lb",
     hash_grid_query_t: "hgq",
     mesh_query_aabb_t: "mqa",
+    mesh_query_point_t: "mqp",
+    mesh_query_ray_t: "mqr",
     bvh_query_t: "bvhq",
 }
@@ -3197,14 +4224,14 @@ def get_type_code(arg_type):
             # check for "special" vector/matrix subtypes
             if hasattr(arg_type, "_wp_generic_type_str_"):
                 type_str = arg_type._wp_generic_type_str_
-                if type_str == "quaternion":
+                if type_str == "quat_t":
                     return f"q{dtype_code}"
                 elif type_str == "transform_t":
                     return f"t{dtype_code}"
-                elif type_str == "spatial_vector_t":
-                    return f"sv{dtype_code}"
-                elif type_str == "spatial_matrix_t":
-                    return f"sm{dtype_code}"
+                # elif type_str == "spatial_vector_t":
+                #     return f"sv{dtype_code}"
+                # elif type_str == "spatial_matrix_t":
+                #     return f"sm{dtype_code}"
             # generic vector/matrix
             ndim = len(arg_type._shape_)
             if ndim == 1:
@@ -3227,6 +4254,10 @@ def get_type_code(arg_type):
         return f"a{arg_type.ndim}{get_type_code(arg_type.dtype)}"
     elif isinstance(arg_type, indexedarray):
         return f"ia{arg_type.ndim}{get_type_code(arg_type.dtype)}"
+    elif isinstance(arg_type, fabricarray):
+        return f"fa{arg_type.ndim}{get_type_code(arg_type.dtype)}"
+    elif isinstance(arg_type, indexedfabricarray):
+        return f"ifa{arg_type.ndim}{get_type_code(arg_type.dtype)}"
     elif isinstance(arg_type, warp.codegen.Struct):
         return warp.codegen.make_full_qualified_name(arg_type.cls)
     elif arg_type == Scalar: