warp-lang 1.2.2-py3-none-manylinux2014_aarch64.whl → 1.3.0-py3-none-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of warp-lang has been flagged as a potentially problematic release; see the registry's advisory for details.

Files changed (193)
  1. warp/__init__.py +8 -6
  2. warp/autograd.py +823 -0
  3. warp/bin/warp.so +0 -0
  4. warp/build.py +6 -2
  5. warp/builtins.py +1410 -886
  6. warp/codegen.py +503 -166
  7. warp/config.py +48 -18
  8. warp/context.py +400 -198
  9. warp/dlpack.py +8 -0
  10. warp/examples/assets/bunny.usd +0 -0
  11. warp/examples/benchmarks/benchmark_cloth_warp.py +1 -1
  12. warp/examples/benchmarks/benchmark_interop_torch.py +158 -0
  13. warp/examples/benchmarks/benchmark_launches.py +1 -1
  14. warp/examples/core/example_cupy.py +78 -0
  15. warp/examples/fem/example_apic_fluid.py +17 -36
  16. warp/examples/fem/example_burgers.py +9 -18
  17. warp/examples/fem/example_convection_diffusion.py +7 -17
  18. warp/examples/fem/example_convection_diffusion_dg.py +27 -47
  19. warp/examples/fem/example_deformed_geometry.py +11 -22
  20. warp/examples/fem/example_diffusion.py +7 -18
  21. warp/examples/fem/example_diffusion_3d.py +24 -28
  22. warp/examples/fem/example_diffusion_mgpu.py +7 -14
  23. warp/examples/fem/example_magnetostatics.py +190 -0
  24. warp/examples/fem/example_mixed_elasticity.py +111 -80
  25. warp/examples/fem/example_navier_stokes.py +30 -34
  26. warp/examples/fem/example_nonconforming_contact.py +290 -0
  27. warp/examples/fem/example_stokes.py +17 -32
  28. warp/examples/fem/example_stokes_transfer.py +12 -21
  29. warp/examples/fem/example_streamlines.py +350 -0
  30. warp/examples/fem/utils.py +936 -0
  31. warp/fabric.py +5 -2
  32. warp/fem/__init__.py +13 -3
  33. warp/fem/cache.py +161 -11
  34. warp/fem/dirichlet.py +37 -28
  35. warp/fem/domain.py +105 -14
  36. warp/fem/field/__init__.py +14 -3
  37. warp/fem/field/field.py +454 -11
  38. warp/fem/field/nodal_field.py +33 -18
  39. warp/fem/geometry/deformed_geometry.py +50 -15
  40. warp/fem/geometry/hexmesh.py +12 -24
  41. warp/fem/geometry/nanogrid.py +106 -31
  42. warp/fem/geometry/quadmesh_2d.py +6 -11
  43. warp/fem/geometry/tetmesh.py +103 -61
  44. warp/fem/geometry/trimesh_2d.py +98 -47
  45. warp/fem/integrate.py +231 -186
  46. warp/fem/operator.py +14 -9
  47. warp/fem/quadrature/pic_quadrature.py +35 -9
  48. warp/fem/quadrature/quadrature.py +119 -32
  49. warp/fem/space/basis_space.py +98 -22
  50. warp/fem/space/collocated_function_space.py +3 -1
  51. warp/fem/space/function_space.py +7 -2
  52. warp/fem/space/grid_2d_function_space.py +3 -3
  53. warp/fem/space/grid_3d_function_space.py +4 -4
  54. warp/fem/space/hexmesh_function_space.py +3 -2
  55. warp/fem/space/nanogrid_function_space.py +12 -14
  56. warp/fem/space/partition.py +45 -47
  57. warp/fem/space/restriction.py +19 -16
  58. warp/fem/space/shape/cube_shape_function.py +91 -3
  59. warp/fem/space/shape/shape_function.py +7 -0
  60. warp/fem/space/shape/square_shape_function.py +32 -0
  61. warp/fem/space/shape/tet_shape_function.py +11 -7
  62. warp/fem/space/shape/triangle_shape_function.py +10 -1
  63. warp/fem/space/topology.py +116 -42
  64. warp/fem/types.py +8 -1
  65. warp/fem/utils.py +301 -83
  66. warp/native/array.h +16 -0
  67. warp/native/builtin.h +0 -15
  68. warp/native/cuda_util.cpp +14 -6
  69. warp/native/exports.h +1348 -1308
  70. warp/native/quat.h +79 -0
  71. warp/native/rand.h +27 -4
  72. warp/native/sparse.cpp +83 -81
  73. warp/native/sparse.cu +381 -453
  74. warp/native/vec.h +64 -0
  75. warp/native/volume.cpp +40 -49
  76. warp/native/volume_builder.cu +2 -3
  77. warp/native/volume_builder.h +12 -17
  78. warp/native/warp.cu +3 -3
  79. warp/native/warp.h +69 -59
  80. warp/render/render_opengl.py +17 -9
  81. warp/sim/articulation.py +117 -17
  82. warp/sim/collide.py +35 -29
  83. warp/sim/model.py +123 -18
  84. warp/sim/render.py +3 -1
  85. warp/sparse.py +867 -203
  86. warp/stubs.py +312 -541
  87. warp/tape.py +29 -1
  88. warp/tests/disabled_kinematics.py +1 -1
  89. warp/tests/test_adam.py +1 -1
  90. warp/tests/test_arithmetic.py +1 -1
  91. warp/tests/test_array.py +58 -1
  92. warp/tests/test_array_reduce.py +1 -1
  93. warp/tests/test_async.py +1 -1
  94. warp/tests/test_atomic.py +1 -1
  95. warp/tests/test_bool.py +1 -1
  96. warp/tests/test_builtins_resolution.py +1 -1
  97. warp/tests/test_bvh.py +6 -1
  98. warp/tests/test_closest_point_edge_edge.py +1 -1
  99. warp/tests/test_codegen.py +66 -1
  100. warp/tests/test_compile_consts.py +1 -1
  101. warp/tests/test_conditional.py +1 -1
  102. warp/tests/test_copy.py +1 -1
  103. warp/tests/test_ctypes.py +1 -1
  104. warp/tests/test_dense.py +1 -1
  105. warp/tests/test_devices.py +1 -1
  106. warp/tests/test_dlpack.py +1 -1
  107. warp/tests/test_examples.py +33 -4
  108. warp/tests/test_fabricarray.py +5 -2
  109. warp/tests/test_fast_math.py +1 -1
  110. warp/tests/test_fem.py +213 -6
  111. warp/tests/test_fp16.py +1 -1
  112. warp/tests/test_func.py +1 -1
  113. warp/tests/test_future_annotations.py +90 -0
  114. warp/tests/test_generics.py +1 -1
  115. warp/tests/test_grad.py +1 -1
  116. warp/tests/test_grad_customs.py +1 -1
  117. warp/tests/test_grad_debug.py +247 -0
  118. warp/tests/test_hash_grid.py +6 -1
  119. warp/tests/test_implicit_init.py +354 -0
  120. warp/tests/test_import.py +1 -1
  121. warp/tests/test_indexedarray.py +1 -1
  122. warp/tests/test_intersect.py +1 -1
  123. warp/tests/test_jax.py +1 -1
  124. warp/tests/test_large.py +1 -1
  125. warp/tests/test_launch.py +1 -1
  126. warp/tests/test_lerp.py +1 -1
  127. warp/tests/test_linear_solvers.py +1 -1
  128. warp/tests/test_lvalue.py +1 -1
  129. warp/tests/test_marching_cubes.py +5 -2
  130. warp/tests/test_mat.py +34 -35
  131. warp/tests/test_mat_lite.py +2 -1
  132. warp/tests/test_mat_scalar_ops.py +1 -1
  133. warp/tests/test_math.py +1 -1
  134. warp/tests/test_matmul.py +20 -16
  135. warp/tests/test_matmul_lite.py +1 -1
  136. warp/tests/test_mempool.py +1 -1
  137. warp/tests/test_mesh.py +5 -2
  138. warp/tests/test_mesh_query_aabb.py +1 -1
  139. warp/tests/test_mesh_query_point.py +1 -1
  140. warp/tests/test_mesh_query_ray.py +1 -1
  141. warp/tests/test_mlp.py +1 -1
  142. warp/tests/test_model.py +1 -1
  143. warp/tests/test_module_hashing.py +77 -1
  144. warp/tests/test_modules_lite.py +1 -1
  145. warp/tests/test_multigpu.py +1 -1
  146. warp/tests/test_noise.py +1 -1
  147. warp/tests/test_operators.py +1 -1
  148. warp/tests/test_options.py +1 -1
  149. warp/tests/test_overwrite.py +542 -0
  150. warp/tests/test_peer.py +1 -1
  151. warp/tests/test_pinned.py +1 -1
  152. warp/tests/test_print.py +1 -1
  153. warp/tests/test_quat.py +15 -1
  154. warp/tests/test_rand.py +1 -1
  155. warp/tests/test_reload.py +1 -1
  156. warp/tests/test_rounding.py +1 -1
  157. warp/tests/test_runlength_encode.py +1 -1
  158. warp/tests/test_scalar_ops.py +95 -0
  159. warp/tests/test_sim_grad.py +1 -1
  160. warp/tests/test_sim_kinematics.py +1 -1
  161. warp/tests/test_smoothstep.py +1 -1
  162. warp/tests/test_sparse.py +82 -15
  163. warp/tests/test_spatial.py +1 -1
  164. warp/tests/test_special_values.py +2 -11
  165. warp/tests/test_streams.py +11 -1
  166. warp/tests/test_struct.py +1 -1
  167. warp/tests/test_tape.py +1 -1
  168. warp/tests/test_torch.py +194 -1
  169. warp/tests/test_transient_module.py +1 -1
  170. warp/tests/test_types.py +1 -1
  171. warp/tests/test_utils.py +1 -1
  172. warp/tests/test_vec.py +15 -63
  173. warp/tests/test_vec_lite.py +2 -1
  174. warp/tests/test_vec_scalar_ops.py +65 -1
  175. warp/tests/test_verify_fp.py +1 -1
  176. warp/tests/test_volume.py +28 -2
  177. warp/tests/test_volume_write.py +1 -1
  178. warp/tests/unittest_serial.py +1 -1
  179. warp/tests/unittest_suites.py +9 -1
  180. warp/tests/walkthrough_debug.py +1 -1
  181. warp/thirdparty/unittest_parallel.py +2 -5
  182. warp/torch.py +103 -41
  183. warp/types.py +341 -224
  184. warp/utils.py +11 -2
  185. {warp_lang-1.2.2.dist-info → warp_lang-1.3.0.dist-info}/METADATA +99 -46
  186. warp_lang-1.3.0.dist-info/RECORD +368 -0
  187. warp/examples/fem/bsr_utils.py +0 -378
  188. warp/examples/fem/mesh_utils.py +0 -133
  189. warp/examples/fem/plot_utils.py +0 -292
  190. warp_lang-1.2.2.dist-info/RECORD +0 -359
  191. {warp_lang-1.2.2.dist-info → warp_lang-1.3.0.dist-info}/LICENSE.md +0 -0
  192. {warp_lang-1.2.2.dist-info → warp_lang-1.3.0.dist-info}/WHEEL +0 -0
  193. {warp_lang-1.2.2.dist-info → warp_lang-1.3.0.dist-info}/top_level.txt +0 -0
warp/types.py CHANGED
@@ -12,7 +12,7 @@ import ctypes
 import inspect
 import struct
 import zlib
-from typing import Any, Callable, Generic, List, NamedTuple, Optional, Tuple, TypeVar, Union
+from typing import Any, Callable, Generic, List, NamedTuple, Optional, Sequence, Tuple, TypeVar, Union
 
 import numpy as np
 
@@ -50,6 +50,15 @@ class Array(Generic[DType]):
     pass
 
 
+int_tuple_type_hints = {
+    Tuple[int]: 1,
+    Tuple[int, int]: 2,
+    Tuple[int, int, int]: 3,
+    Tuple[int, int, int, int]: 4,
+    Tuple[int, ...]: -1,
+}
+
+
 def constant(x):
     """Function to declare compile-time constants accessible from Warp kernels
 
@@ -99,6 +108,7 @@ def vector(length, dtype):
         # warp scalar type:
         _wp_scalar_type_ = dtype
         _wp_type_params_ = [length, dtype]
+        _wp_type_args_ = {"length": length, "dtype": dtype}
         _wp_generic_type_str_ = "vec_t"
         _wp_generic_type_hint_ = Vector
         _wp_constructor_ = "vector"
@@ -282,6 +292,7 @@ def matrix(shape, dtype):
         # used in type checking and when writing out c++ code for constructors:
         _wp_scalar_type_ = dtype
         _wp_type_params_ = [shape[0], shape[1], dtype]
+        _wp_type_args_ = {"shape": (shape[0], shape[1]), "dtype": dtype}
         _wp_generic_type_str_ = "mat_t"
         _wp_generic_type_hint_ = Matrix
         _wp_constructor_ = "matrix"
@@ -471,233 +482,130 @@ class void:
     pass
 
 
-class bool:
-    _length_ = 1
-    _type_ = ctypes.c_bool
-
-    def __init__(self, x=False):
+class scalar_base:
+    def __init__(self, x=0):
         self.value = x
 
     def __bool__(self) -> builtins.bool:
         return self.value != 0
 
-    def __float__(self) -> float:
-        return float(self.value != 0)
-
-    def __int__(self) -> int:
-        return int(self.value != 0)
-
-
-class float16:
-    _length_ = 1
-    _type_ = ctypes.c_uint16
-
-    def __init__(self, x=0.0):
-        self.value = x
-
-    def __bool__(self) -> bool:
-        return self.value != 0.0
-
     def __float__(self) -> float:
         return float(self.value)
 
     def __int__(self) -> int:
         return int(self.value)
 
+    def __add__(self, y):
+        return warp.add(self, y)
 
-class float32:
-    _length_ = 1
-    _type_ = ctypes.c_float
+    def __radd__(self, y):
+        return warp.add(y, self)
 
-    def __init__(self, x=0.0):
-        self.value = x
+    def __sub__(self, y):
+        return warp.sub(self, y)
 
-    def __bool__(self) -> bool:
-        return self.value != 0.0
+    def __rsub__(self, y):
+        return warp.sub(y, self)
 
-    def __float__(self) -> float:
-        return float(self.value)
+    def __mul__(self, y):
+        return warp.mul(self, y)
 
-    def __int__(self) -> int:
-        return int(self.value)
+    def __rmul__(self, x):
+        return warp.mul(x, self)
 
+    def __truediv__(self, y):
+        return warp.div(self, y)
 
-class float64:
-    _length_ = 1
-    _type_ = ctypes.c_double
+    def __rtruediv__(self, x):
+        return warp.div(x, self)
 
-    def __init__(self, x=0.0):
-        self.value = x
+    def __pos__(self):
+        return warp.pos(self)
 
-    def __bool__(self) -> bool:
-        return self.value != 0.0
+    def __neg__(self):
+        return warp.neg(self)
 
-    def __float__(self) -> float:
-        return float(self.value)
 
-    def __int__(self) -> int:
-        return int(self.value)
-
-
-class int8:
-    _length_ = 1
-    _type_ = ctypes.c_int8
-
-    def __init__(self, x=0):
-        self.value = x
-
-    def __bool__(self) -> bool:
-        return self.value != 0
+class float_base(scalar_base):
+    pass
 
-    def __float__(self) -> float:
-        return float(self.value)
-
-    def __int__(self) -> int:
-        return int(self.value)
 
+class int_base(scalar_base):
     def __index__(self) -> int:
         return int(self.value)
 
 
-class uint8:
+class bool:
     _length_ = 1
-    _type_ = ctypes.c_uint8
+    _type_ = ctypes.c_bool
 
-    def __init__(self, x=0):
+    def __init__(self, x=False):
         self.value = x
 
-    def __bool__(self) -> bool:
+    def __bool__(self) -> builtins.bool:
         return self.value != 0
 
     def __float__(self) -> float:
-        return float(self.value)
+        return float(self.value != 0)
 
     def __int__(self) -> int:
-        return int(self.value)
-
-    def __index__(self) -> int:
-        return int(self.value)
+        return int(self.value != 0)
 
 
-class int16:
+class float16(float_base):
     _length_ = 1
-    _type_ = ctypes.c_int16
-
-    def __init__(self, x=0):
-        self.value = x
-
-    def __bool__(self) -> bool:
-        return self.value != 0
-
-    def __float__(self) -> float:
-        return float(self.value)
+    _type_ = ctypes.c_uint16
 
-    def __int__(self) -> int:
-        return int(self.value)
 
-    def __index__(self) -> int:
-        return int(self.value)
+class float32(float_base):
+    _length_ = 1
+    _type_ = ctypes.c_float
 
 
-class uint16:
+class float64(float_base):
     _length_ = 1
-    _type_ = ctypes.c_uint16
+    _type_ = ctypes.c_double
 
-    def __init__(self, x=0):
-        self.value = x
 
-    def __bool__(self) -> bool:
-        return self.value != 0
-
-    def __float__(self) -> float:
-        return float(self.value)
+class int8(int_base):
+    _length_ = 1
+    _type_ = ctypes.c_int8
 
-    def __int__(self) -> int:
-        return int(self.value)
 
-    def __index__(self) -> int:
-        return int(self.value)
+class uint8(int_base):
+    _length_ = 1
+    _type_ = ctypes.c_uint8
 
 
-class int32:
+class int16(int_base):
     _length_ = 1
-    _type_ = ctypes.c_int32
+    _type_ = ctypes.c_int16
 
-    def __init__(self, x=0):
-        self.value = x
 
-    def __bool__(self) -> bool:
-        return self.value != 0
+class uint16(int_base):
+    _length_ = 1
+    _type_ = ctypes.c_uint16
 
-    def __float__(self) -> float:
-        return float(self.value)
 
-    def __int__(self) -> int:
-        return int(self.value)
-
-    def __index__(self) -> int:
-        return int(self.value)
+class int32(int_base):
+    _length_ = 1
+    _type_ = ctypes.c_int32
 
 
-class uint32:
+class uint32(int_base):
     _length_ = 1
     _type_ = ctypes.c_uint32
 
-    def __init__(self, x=0):
-        self.value = x
-
-    def __bool__(self) -> bool:
-        return self.value != 0
-
-    def __float__(self) -> float:
-        return float(self.value)
-
-    def __int__(self) -> int:
-        return int(self.value)
-
-    def __index__(self) -> int:
-        return int(self.value)
 
-
-class int64:
+class int64(int_base):
     _length_ = 1
     _type_ = ctypes.c_int64
 
-    def __init__(self, x=0):
-        self.value = x
-
-    def __bool__(self) -> bool:
-        return self.value != 0
 
-    def __float__(self) -> float:
-        return float(self.value)
-
-    def __int__(self) -> int:
-        return int(self.value)
-
-    def __index__(self) -> int:
-        return int(self.value)
-
-
-class uint64:
+class uint64(int_base):
     _length_ = 1
     _type_ = ctypes.c_uint64
 
-    def __init__(self, x=0):
-        self.value = x
-
-    def __bool__(self) -> bool:
-        return self.value != 0
-
-    def __float__(self) -> float:
-        return float(self.value)
-
-    def __int__(self) -> int:
-        return int(self.value)
-
-    def __index__(self) -> int:
-        return int(self.value)
-
 
 
 def quaternion(dtype=Any):
     class quat_t(vector(length=4, dtype=dtype)):
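
The hunk above collapses the per-type boilerplate into a scalar_base hierarchy whose Python operators forward to the corresponding Warp builtins (warp.add, warp.sub, warp.mul, warp.div, warp.pos, warp.neg). A minimal sketch of what this enables on the host side, assuming the usual top-level re-exports of the scalar types:

    import warp as wp

    x = wp.float32(2.0)
    y = wp.float32(3.5)

    # arithmetic on scalar values now dispatches to the Warp builtins
    z = x * y + wp.float32(1.0)
    print(float(z))  # 8.0

    # integer scalars inherit the same operators via int_base -> scalar_base
    i = wp.int32(7) - wp.int32(2)
    print(int(i))  # 5
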
@@ -707,6 +615,7 @@ def quaternion(dtype=Any):
 
     ret = quat_t
     ret._wp_type_params_ = [dtype]
+    ret._wp_type_args_ = {"dtype": dtype}
     ret._wp_generic_type_str_ = "quat_t"
     ret._wp_generic_type_hint_ = Quaternion
     ret._wp_constructor_ = "quaternion"
@@ -743,6 +652,7 @@ def transformation(dtype=Any):
             ),
         )
         _wp_type_params_ = [dtype]
+        _wp_type_args_ = {"dtype": dtype}
         _wp_generic_type_str_ = "transform_t"
         _wp_generic_type_hint_ = Transformation
         _wp_constructor_ = "transformation"
@@ -1150,6 +1060,9 @@ class bvh_query_t:
     pass
 
 
+BvhQuery = bvh_query_t
+
+
 # definition just for kernel type (cannot be a parameter), see mesh.h
 class mesh_query_aabb_t:
     """Object used to track state during mesh traversal."""
@@ -1158,6 +1071,9 @@ class mesh_query_aabb_t:
     pass
 
 
+MeshQueryAABB = mesh_query_aabb_t
+
+
 # definition just for kernel type (cannot be a parameter), see hash_grid.h
 class hash_grid_query_t:
     """Object used to track state during neighbor traversal."""
@@ -1166,6 +1082,9 @@ class hash_grid_query_t:
     pass
 
 
+HashGridQuery = hash_grid_query_t
+
+
 # maximum number of dimensions, must match array.h
 ARRAY_MAX_DIMS = 4
 LAUNCH_MAX_DIMS = 4
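
The kernel-side query types gain PascalCase aliases alongside the original snake_case names. A quick sketch of the equivalence; the assignments above are module level, so they are reachable through warp.types (whether they are also re-exported at the top level depends on the warp/__init__.py changes in this release):

    from warp import types as wpt

    assert wpt.BvhQuery is wpt.bvh_query_t
    assert wpt.MeshQueryAABB is wpt.mesh_query_aabb_t
    assert wpt.HashGridQuery is wpt.hash_grid_query_t

MeshQueryPoint and MeshQueryRay receive the same treatment further down in this file.
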
@@ -1378,7 +1297,8 @@ def type_repr(t):
     if t in scalar_types:
         return t.__name__
 
-    return t.__module__ + "." + t.__qualname__
+    name = getattr(t, "__qualname__", t.__name__)
+    return t.__module__ + "." + name
 
 
 def type_is_int(t):
@@ -1400,6 +1320,11 @@ def type_is_vector(t):
     return getattr(t, "_wp_generic_type_hint_", None) is Vector
 
 
+# returns True if the passed *type* is a quaternion
+def type_is_quaternion(t):
+    return getattr(t, "_wp_generic_type_hint_", None) is Quaternion
+
+
 # returns True if the passed *type* is a matrix
 def type_is_matrix(t):
     return getattr(t, "_wp_generic_type_hint_", None) is Matrix
@@ -1432,9 +1357,30 @@ def is_array(a):
 
 
 def scalars_equal(a, b, match_generic):
+    # convert to canonical types
+    if a == float:
+        a = float32
+    elif a == int:
+        a = int32
+    elif a == builtins.bool:
+        a = bool
+
+    if b == float:
+        b = float32
+    elif b == int:
+        b = int32
+    elif b == builtins.bool:
+        b = bool
+
     if match_generic:
         if a == Any or b == Any:
             return True
+        if a == Int and b in int_types:
+            return True
+        if b == Int and a in int_types:
+            return True
+        if a == Int and b == Int:
+            return True
         if a == Scalar and b in scalar_and_bool_types:
             return True
         if b == Scalar and a in scalar_and_bool_types:
@@ -1448,25 +1394,29 @@ def scalars_equal(a, b, match_generic):
         if a == Float and b == Float:
             return True
 
-    # convert to canonical types
-    if a == float:
-        a = float32
-    elif a == int:
-        a = int32
-    elif a == builtins.bool:
-        a = bool
-
-    if b == float:
-        b = float32
-    elif b == int:
-        b = int32
-    elif b == builtins.bool:
-        b = bool
-
     return a == b
 
 
 def types_equal(a, b, match_generic=False):
+    if match_generic:
+        if a in int_tuple_type_hints and isinstance(b, Sequence):
+            a_length = int_tuple_type_hints[a]
+            if (a_length == -1 or a_length == len(b)) and all(
+                scalars_equal(x, Int, match_generic=match_generic) for x in b
+            ):
+                return True
+        if b in int_tuple_type_hints and isinstance(a, Sequence):
+            b_length = int_tuple_type_hints[b]
+            if (b_length == -1 or b_length == len(a)) and all(
+                scalars_equal(x, Int, match_generic=match_generic) for x in a
+            ):
+                return True
+        if a in int_tuple_type_hints and b in int_tuple_type_hints:
+            a_length = int_tuple_type_hints[a]
+            b_length = int_tuple_type_hints[b]
+            if a_length is None or b_length is None or a_length == b_length:
+                return True
+
     # convert to canonical types
     if a == float:
         a = float32
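
Together with the int_tuple_type_hints table added near the top of the file, types_equal can now match Tuple[int, ...]-style annotations against concrete integer sequences. A rough illustration of the behavior, inferred from the diff (these are internal warp.types helpers, not official API):

    from typing import Tuple

    import warp as wp
    from warp.types import types_equal

    # a fixed-length hint matches any same-length sequence of integer types
    print(types_equal(Tuple[int, int], (wp.int32, wp.int64), match_generic=True))  # True

    # the variadic hint matches a sequence of any length
    print(types_equal(Tuple[int, ...], (wp.int32,), match_generic=True))  # True
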
@@ -1522,6 +1472,61 @@ def check_array_shape(shape: Tuple):
         )
 
 
+def array_ctype_from_interface(interface: dict, dtype=None, owner=None):
+    """Get native array descriptor (array_t) from __array_interface__ or __cuda_array_interface__ dictionary"""
+
+    ptr = interface.get("data")[0]
+    shape = interface.get("shape")
+    strides = interface.get("strides")
+    typestr = interface.get("typestr")
+
+    element_dtype = dtype_from_numpy(np.dtype(typestr))
+
+    if strides is None:
+        strides = strides_from_shape(shape, element_dtype)
+
+    if dtype is None:
+        # accept verbatum
+        pass
+    elif hasattr(dtype, "_shape_"):
+        # vector/matrix types, ensure element dtype matches
+        if element_dtype != dtype._wp_scalar_type_:
+            raise RuntimeError(
+                f"Could not convert array interface with typestr='{typestr}' to Warp array with dtype={dtype}"
+            )
+        dtype_shape = dtype._shape_
+        dtype_dims = len(dtype._shape_)
+        ctype_size = ctypes.sizeof(dtype._type_)
+        # ensure inner shape matches
+        if dtype_dims > len(shape) or dtype_shape != shape[-dtype_dims:]:
+            raise RuntimeError(
+                f"Could not convert array interface with shape {shape} to Warp array with dtype={dtype}, ensure that source inner shape is {dtype_shape}"
+            )
+        # ensure inner strides are contiguous
+        if strides[-1] != ctype_size or (dtype_dims > 1 and strides[-2] != ctype_size * dtype_shape[-1]):
+            raise RuntimeError(
+                f"Could not convert array interface with shape {shape} to Warp array with dtype={dtype}, because the source inner strides are not contiguous"
+            )
+        # trim shape and strides
+        shape = tuple(shape[:-dtype_dims]) or (1,)
+        strides = tuple(strides[:-dtype_dims]) or (ctype_size,)
+    else:
+        # scalar types, ensure dtype matches
+        if element_dtype != dtype:
+            raise RuntimeError(
+                f"Could not convert array interface with typestr='{typestr}' to Warp array with dtype={dtype}"
+            )
+
+    # create array descriptor
+    array_ctype = array_t(ptr, 0, len(shape), shape, strides)
+
+    # keep owner alive
+    if owner is not None:
+        array_ctype._ref = owner
+
+    return array_ctype
+
+
 class array(Array):
     # member attributes available during code-gen (e.g.: d = array.shape[0])
     # (initialized when needed)
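
array_ctype_from_interface builds a native array_t descriptor straight from the standard __array_interface__ / __cuda_array_interface__ protocol, presumably what the updated dlpack/torch interop paths in this release build on. A hedged sketch of direct use with a NumPy source:

    import numpy as np
    import warp as wp
    from warp.types import array_ctype_from_interface

    src = np.arange(12, dtype=np.float32).reshape(3, 4)

    # passing the source as `owner` keeps its memory alive while the descriptor is in use
    desc = array_ctype_from_interface(src.__array_interface__, dtype=wp.float32, owner=src)
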
@@ -1631,6 +1636,9 @@ class array(Array):
         else:
             self._init_annotation(dtype, ndim or 1)
 
+        # initialize read flag
+        self.mark_init()
+
         # initialize gradient, if needed
         if self.device is not None:
             if grad is not None:
@@ -1642,6 +1650,9 @@ class array(Array):
             if requires_grad:
                 self._alloc_grad()
 
+        # reference to other array
+        self._ref = None
+
     def _init_from_data(self, data, dtype, shape, device, copy, pinned):
         if not hasattr(data, "__len__"):
             raise RuntimeError(f"Data must be a sequence or array, got scalar {data}")
@@ -2164,6 +2175,9 @@ class array(Array):
         """
         Enables A @ B syntax for matrix multiplication
         """
+        if not is_array(other):
+            return NotImplemented
+
         if self.ndim != 2 or other.ndim != 2:
             raise RuntimeError(
                 "A has dim = {}, B has dim = {}. If multiplying with @, A and B must have dim = 2.".format(
@@ -2234,6 +2248,33 @@ class array(Array):
         array._vars = {"shape": warp.codegen.Var("shape", shape_t)}
         return array._vars
 
+    def mark_init(self):
+        """Resets this array's read flag"""
+        self._is_read = False
+
+    def mark_read(self):
+        """Marks this array as having been read from in a kernel or recorded function on the tape."""
+        # no additional checks required: it is always safe to set an array to READ
+        self._is_read = True
+
+        # recursively update all parent arrays
+        parent = self._ref
+        while parent is not None:
+            parent._is_read = True
+            parent = parent._ref
+
+    def mark_write(self, **kwargs):
+        """Detect if we are writing to an array that has already been read from"""
+        if self._is_read:
+            if "arg_name" and "kernel_name" and "filename" and "lineno" in kwargs:
+                print(
+                    f"Warning: Array {self} passed to argument {kwargs['arg_name']} in kernel {kwargs['kernel_name']} at {kwargs['filename']}:{kwargs['lineno']} is being written to but has already been read from in a previous launch. This may corrupt gradient computation in the backward pass."
+                )
+            else:
+                print(
+                    f"Warning: Array {self} is being written to but has already been read from in a previous launch. This may corrupt gradient computation in the backward pass."
+                )
+
     def zero_(self):
         """Zeroes-out the array entries."""
         if self.is_contiguous:
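
These mark_* hooks drive the new wp.config.verify_autograd_array_access check (exercised by the new test_overwrite.py in the file list): writing to an array that an earlier launch read from can corrupt the backward pass, so it is reported. A minimal sketch of the failure mode it is meant to flag, assuming the flag is enabled before the launches are recorded:

    import warp as wp

    wp.config.verify_autograd_array_access = True

    @wp.kernel
    def square(x: wp.array(dtype=float), y: wp.array(dtype=float)):
        tid = wp.tid()
        y[tid] = x[tid] * x[tid]

    x = wp.ones(8, dtype=float, requires_grad=True)
    y = wp.zeros(8, dtype=float, requires_grad=True)

    tape = wp.Tape()
    with tape:
        wp.launch(square, dim=8, inputs=[x], outputs=[y])  # x is marked as read
        wp.launch(square, dim=8, inputs=[y], outputs=[x])  # x written after being read: warning expected
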
@@ -2241,6 +2282,7 @@ class array(Array):
             self.device.memset(self.ptr, 0, self.size * type_size_in_bytes(self.dtype))
         else:
             self.fill_(0)
+        self.mark_init()
 
     def fill_(self, value):
         """Set all array entries to `value`
@@ -2315,6 +2357,8 @@ class array(Array):
         else:
             warp.context.runtime.core.array_fill_host(carr_ptr, ARRAY_TYPE_REGULAR, cvalue_ptr, cvalue_size)
 
+        self.mark_init()
+
     def assign(self, src):
         """Wraps ``src`` in an :class:`warp.array` if it is not already one and copies the contents to ``self``."""
         if is_array(src):
@@ -2421,6 +2465,9 @@ class array(Array):
             grad=None if self.grad is None else self.grad.flatten(),
         )
 
+        # transfer read flag
+        a._is_read = self._is_read
+
         # store back-ref to stop data being destroyed
         a._ref = self
         return a
@@ -2482,6 +2529,9 @@ class array(Array):
             grad=None if self.grad is None else self.grad.reshape(shape),
         )
 
+        # transfer read flag
+        a._is_read = self._is_read
+
         # store back-ref to stop data being destroyed
         a._ref = self
         return a
@@ -2505,6 +2555,9 @@ class array(Array):
             grad=None if self.grad is None else self.grad.view(dtype),
        )
 
+        # transfer read flag
+        a._is_read = self._is_read
+
         a._ref = self
         return a
 
@@ -2558,6 +2611,9 @@ class array(Array):
 
         a.is_transposed = not self.is_transposed
 
+        # transfer read flag
+        a._is_read = self._is_read
+
         a._ref = self
         return a
 
@@ -2841,6 +2897,11 @@ def array_type_id(a):
 
 
 class Bvh:
+    def __new__(cls, *args, **kwargs):
+        instance = super(Bvh, cls).__new__(cls)
+        instance.id = None
+        return instance
+
     def __init__(self, lowers, uppers):
         """Class representing a bounding volume hierarchy.
 
@@ -2853,8 +2914,6 @@ class Bvh:
             uppers (:class:`warp.array`): Array of upper bounds :class:`warp.vec3`
         """
 
-        self.id = 0
-
         if len(lowers) != len(uppers):
             raise RuntimeError("Bvh the same number of lower and upper bounds must be provided")
 
@@ -2916,6 +2975,11 @@ class Mesh:
         "indices": Var("indices", array(dtype=int32)),
     }
 
+    def __new__(cls, *args, **kwargs):
+        instance = super(Mesh, cls).__new__(cls)
+        instance.id = None
+        return instance
+
     def __init__(self, points=None, indices=None, velocities=None, support_winding_number=False):
         """Class representing a triangle mesh.
 
@@ -2930,8 +2994,6 @@ class Mesh:
             support_winding_number (bool): If true the mesh will build additional datastructures to support `wp.mesh_query_point_sign_winding_number()` queries
         """
 
-        self.id = 0
-
         if points.device != indices.device:
             raise RuntimeError("Mesh points and indices must live on the same device")
 
@@ -3001,6 +3063,11 @@ class Volume:
     #: Enum value to specify trilinear interpolation during sampling
     LINEAR = constant(1)
 
+    def __new__(cls, *args, **kwargs):
+        instance = super(Volume, cls).__new__(cls)
+        instance.id = None
+        return instance
+
     def __init__(self, data: array, copy: bool = True):
         """Class representing a sparse grid.
 
@@ -3009,8 +3076,6 @@ class Volume:
            copy (bool): Whether the incoming data will be copied or aliased
         """
 
-        self.id = 0
-
         # keep a runtime reference for orderly destruction
         self.runtime = warp.context.runtime
 
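The recurring __new__ override in Bvh, Mesh, Volume (and HashGrid/MarchingCubes below) is an implicit-initialization guard, matching the new test_implicit_init.py: id is set to None before __init__ runs, so a constructor that raises partway through still leaves an object whose destructor can bail out safely. The generic shape of the idiom, with hypothetical allocate/free helpers standing in for Warp's native calls:

    class Resource:
        def __new__(cls, *args, **kwargs):
            # guarantee `id` exists even if __init__ raises
            instance = super().__new__(cls)
            instance.id = None
            return instance

        def __init__(self, size):
            if size <= 0:
                raise ValueError("size must be positive")  # __del__ still sees id == None
            self.id = allocate_native_resource(size)  # hypothetical native allocation

        def __del__(self):
            if self.id is not None:
                free_native_resource(self.id)  # hypothetical native release
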
@@ -3568,9 +3633,39 @@ class Volume:
 
         return cls.allocate_by_tiles(tile_points, voxel_size, bg_value, translation, device)
 
+    @staticmethod
+    def _fill_transform_buffers(
+        voxel_size: Union[float, List[float]],
+        translation,
+        transform,
+    ):
+        if transform is None:
+            if voxel_size is None:
+                raise ValueError("Either 'voxel_size' or 'transform' must be provided")
+
+            if isinstance(voxel_size, float):
+                voxel_size = (voxel_size, voxel_size, voxel_size)
+            transform = mat33f(voxel_size[0], 0.0, 0.0, 0.0, voxel_size[1], 0.0, 0.0, 0.0, voxel_size[2])
+        else:
+            if voxel_size is not None:
+                raise ValueError("Only one of 'voxel_size' or 'transform' must be provided")
+
+            if not isinstance(transform, mat33f):
+                transform = mat33f(transform)
+
+        transform_buf = (ctypes.c_float * 9).from_buffer_copy(transform)
+        translation_buf = (ctypes.c_float * 3)(translation[0], translation[1], translation[2])
+        return transform_buf, translation_buf
+
     @classmethod
     def allocate_by_tiles(
-        cls, tile_points: array, voxel_size: float, bg_value=0.0, translation=(0.0, 0.0, 0.0), device=None
+        cls,
+        tile_points: array,
+        voxel_size: Union[float, List[float]] = None,
+        bg_value=0.0,
+        translation=(0.0, 0.0, 0.0),
+        device=None,
+        transform=None,
     ) -> Volume:
         """Allocate a new Volume with active tiles for each point tile_points.
 
@@ -3588,16 +3683,15 @@ class Volume:
                 The array may use an integer scalar type (2D N-by-3 array of :class:`warp.int32` or 1D array of `warp.vec3i` values), indicating index space positions,
                 or a floating point scalar type (2D N-by-3 array of :class:`warp.float32` or 1D array of `warp.vec3f` values), indicating world space positions.
                 Repeated points per tile are allowed and will be efficiently deduplicated.
-            voxel_size (float): Voxel size of the new volume.
+            voxel_size (float or array-like): Voxel size(s) of the new volume. Ignored if `transform` is given.
             bg_value (array-like, float, int or None): Value of unallocated voxels of the volume, also defines the volume's type. A :class:`warp.vec3` volume is created if this is `array-like`, an index volume will be created if `bg_value` is ``None``.
             translation (array-like): Translation between the index and world spaces.
+            transform (array-like): Linear transform between the index and world spaces. If ``None``, deduced from `voxel_size`.
             device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".
 
         """
         device = warp.get_device(device)
 
-        if voxel_size <= 0.0:
-            raise RuntimeError(f"Voxel size must be positive! Got {voxel_size}")
         if not device.is_cuda:
             raise RuntimeError("Only CUDA devices are supported for allocate_by_tiles")
         if not _is_contiguous_vec_like_array(tile_points, vec_length=3, scalar_types=(float32, int32)):
@@ -3610,15 +3704,16 @@ class Volume:
         volume = cls(data=None)
         volume.device = device
         in_world_space = type_scalar_type(tile_points.dtype) == float32
+
+        transform_buf, translation_buf = Volume._fill_transform_buffers(voxel_size, translation, transform)
+
         if bg_value is None:
             volume.id = volume.runtime.core.volume_index_from_tiles_device(
                 volume.device.context,
                 ctypes.c_void_p(tile_points.ptr),
                 tile_points.shape[0],
-                voxel_size,
-                translation[0],
-                translation[1],
-                translation[2],
+                transform_buf,
+                translation_buf,
                 in_world_space,
             )
         elif hasattr(bg_value, "__len__"):
@@ -3626,38 +3721,30 @@ class Volume:
                 volume.device.context,
                 ctypes.c_void_p(tile_points.ptr),
                 tile_points.shape[0],
-                voxel_size,
-                bg_value[0],
-                bg_value[1],
-                bg_value[2],
-                translation[0],
-                translation[1],
-                translation[2],
+                transform_buf,
+                translation_buf,
                 in_world_space,
+                (ctypes.c_float * 3)(bg_value[0], bg_value[1], bg_value[2]),
             )
         elif isinstance(bg_value, int):
             volume.id = volume.runtime.core.volume_i_from_tiles_device(
                 volume.device.context,
                 ctypes.c_void_p(tile_points.ptr),
                 tile_points.shape[0],
-                voxel_size,
-                bg_value,
-                translation[0],
-                translation[1],
-                translation[2],
+                transform_buf,
+                translation_buf,
                 in_world_space,
+                bg_value,
             )
         else:
             volume.id = volume.runtime.core.volume_f_from_tiles_device(
                 volume.device.context,
                 ctypes.c_void_p(tile_points.ptr),
                 tile_points.shape[0],
-                voxel_size,
-                float(bg_value),
-                translation[0],
-                translation[1],
-                translation[2],
+                transform_buf,
+                translation_buf,
                 in_world_space,
+                float(bg_value),
             )
 
         if volume.id == 0:
@@ -3667,7 +3754,12 @@ class Volume:
 
     @classmethod
     def allocate_by_voxels(
-        cls, voxel_points: array, voxel_size: float, translation=(0.0, 0.0, 0.0), device=None
+        cls,
+        voxel_points: array,
+        voxel_size: Union[float, List[float]] = None,
+        translation=(0.0, 0.0, 0.0),
+        device=None,
+        transform=None,
     ) -> Volume:
         """Allocate a new Volume with active voxel for each point voxel_points.
 
@@ -3682,19 +3774,16 @@ class Volume:
                 The array may use an integer scalar type (2D N-by-3 array of :class:`warp.int32` or 1D array of `warp.vec3i` values), indicating index space positions,
                 or a floating point scalar type (2D N-by-3 array of :class:`warp.float32` or 1D array of `warp.vec3f` values), indicating world space positions.
                 Repeated points per tile are allowed and will be efficiently deduplicated.
-            voxel_size (float): Voxel size of the new volume.
+            voxel_size (float or array-like): Voxel size(s) of the new volume. Ignored if `transform` is given.
             translation (array-like): Translation between the index and world spaces.
+            transform (array-like): Linear transform between the index and world spaces. If ``None``, deduced from `voxel_size`.
             device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".
 
         """
         device = warp.get_device(device)
 
-        if voxel_size <= 0.0:
-            raise RuntimeError(f"Voxel size must be positive! Got {voxel_size}")
         if not device.is_cuda:
             raise RuntimeError("Only CUDA devices are supported for allocate_by_tiles")
-        if not (is_array(voxel_points) and voxel_points.is_contiguous):
-            raise RuntimeError("tile_points must be a contiguous array")
         if not _is_contiguous_vec_like_array(voxel_points, vec_length=3, scalar_types=(float32, int32)):
             raise RuntimeError(
                 "voxel_points must be contiguous and either a 1D warp array of vec3f or vec3i or a 2D n-by-3 array of int32 or float32."
@@ -3706,14 +3795,14 @@ class Volume:
         volume = cls(data=None)
         volume.device = device
         in_world_space = type_scalar_type(voxel_points.dtype) == float32
+        transform_buf, translation_buf = Volume._fill_transform_buffers(voxel_size, translation, transform)
+
         volume.id = volume.runtime.core.volume_from_active_voxels_device(
             volume.device.context,
             ctypes.c_void_p(voxel_points.ptr),
             voxel_points.shape[0],
-            voxel_size,
-            translation[0],
-            translation[1],
-            translation[2],
+            transform_buf,
+            translation_buf,
             in_world_space,
         )
 
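With the scalar voxel_size checks replaced by _fill_transform_buffers, both allocators accept per-axis voxel sizes or a full 3x3 index-to-world transform (the two options are mutually exclusive). A hedged usage sketch, assuming a CUDA device and integer tile coordinates:

    import numpy as np
    import warp as wp

    # two active tiles given as index-space coordinates (2D N-by-3 int32 form)
    tiles = wp.array(np.array([[0, 0, 0], [16, 0, 0]], dtype=np.int32), device="cuda:0")

    # anisotropic voxels via an array-like voxel_size...
    vol = wp.Volume.allocate_by_tiles(tiles, voxel_size=[0.1, 0.1, 0.5], bg_value=0.0, device="cuda:0")

    # ...or an explicit linear transform instead of voxel_size
    xform = wp.mat33f(0.1, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.5)
    vol2 = wp.Volume.allocate_by_tiles(tiles, transform=xform, bg_value=0.0, device="cuda:0")
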
@@ -3765,6 +3854,9 @@ class mesh_query_point_t:
     }
 
 
+MeshQueryPoint = mesh_query_point_t
+
+
 # definition just for kernel type (cannot be a parameter), see mesh.h
 # NOTE: its layout must match the corresponding struct defined in C.
 class mesh_query_ray_t:
@@ -3796,6 +3888,9 @@ class mesh_query_ray_t:
     }
 
 
+MeshQueryRay = mesh_query_ray_t
+
+
 def matmul(
     a: array2d,
     b: array2d,
@@ -3852,10 +3947,16 @@ def matmul(
             backward=lambda: adj_matmul(a, b, c, a.grad, b.grad, c.grad, d.grad, alpha, beta, allow_tf32x3_arith),
             arrays=[a, b, c, d],
         )
+        if warp.config.verify_autograd_array_access:
+            d.mark_write()
+            a.mark_read()
+            b.mark_read()
+            c.mark_read()
 
     # cpu fallback if no cuda devices found
     if device == "cpu":
-        d.assign(alpha * (a.numpy() @ b.numpy()) + beta * c.numpy())
+        np_dtype = warp_type_to_np_dtype[a.dtype]
+        d.assign(alpha * np.matmul(a.numpy(), b.numpy(), dtype=np_dtype) + beta * c.numpy())
         return
 
     cc = device.arch
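
matmul now participates in the same read/write tracking as kernel launches when wp.config.verify_autograd_array_access is on, and its CPU fallback pins the NumPy accumulation dtype to the operands' own dtype. A short usage sketch, assuming float32 operands:

    import warp as wp

    wp.config.verify_autograd_array_access = True

    a = wp.zeros((4, 8), dtype=wp.float32, requires_grad=True)
    b = wp.zeros((8, 4), dtype=wp.float32, requires_grad=True)
    c = wp.zeros((4, 4), dtype=wp.float32, requires_grad=True)
    d = wp.zeros((4, 4), dtype=wp.float32, requires_grad=True)

    tape = wp.Tape()
    with tape:
        wp.matmul(a, b, c, d)  # marks a, b, c as read and d as written, per the code above
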
@@ -3971,8 +4072,9 @@ def adj_matmul(
 
     # cpu fallback if no cuda devices found
     if device == "cpu":
-        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose()) + adj_a.numpy())
-        adj_b.assign(alpha * (a.numpy().transpose() @ adj_d.numpy()) + adj_b.numpy())
+        np_dtype = warp_type_to_np_dtype[a.dtype]
+        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose(), dtype=np_dtype) + adj_a.numpy())
+        adj_b.assign(alpha * np.matmul(a.numpy().transpose(), adj_d.numpy(), dtype=np_dtype) + adj_b.numpy())
         adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
         return
 
@@ -4135,10 +4237,16 @@ def batched_matmul(
             ),
             arrays=[a, b, c, d],
         )
+        if warp.config.verify_autograd_array_access:
+            d.mark_write()
+            a.mark_read()
+            b.mark_read()
+            c.mark_read()
 
     # cpu fallback if no cuda devices found
     if device == "cpu":
-        d.assign(alpha * np.matmul(a.numpy(), b.numpy()) + beta * c.numpy())
+        np_dtype = warp_type_to_np_dtype[a.dtype]
+        d.assign(alpha * np.matmul(a.numpy(), b.numpy(), dtype=np_dtype) + beta * c.numpy())
         return
 
     # handle case in which batch_count exceeds max_batch_count, which is a CUDA array size maximum
@@ -4282,8 +4390,9 @@ def adj_batched_matmul(
 
     # cpu fallback if no cuda devices found
     if device == "cpu":
-        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose((0, 2, 1))) + adj_a.numpy())
-        adj_b.assign(alpha * np.matmul(a.numpy().transpose((0, 2, 1)), adj_d.numpy()) + adj_b.numpy())
+        np_dtype = warp_type_to_np_dtype[a.dtype]
+        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose((0, 2, 1)), dtype=np_dtype) + adj_a.numpy())
+        adj_b.assign(alpha * np.matmul(a.numpy().transpose((0, 2, 1)), adj_d.numpy(), dtype=np_dtype) + adj_b.numpy())
         adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
         return
 
@@ -4487,6 +4596,11 @@ def adj_batched_matmul(
 
 
 class HashGrid:
+    def __new__(cls, *args, **kwargs):
+        instance = super(HashGrid, cls).__new__(cls)
+        instance.id = None
+        return instance
+
     def __init__(self, dim_x, dim_y, dim_z, device=None):
         """Class representing a hash grid object for accelerated point queries.
 
@@ -4500,8 +4614,6 @@ class HashGrid:
             dim_z (int): Number of cells in z-axis
         """
 
-        self.id = 0
-
         self.runtime = warp.context.runtime
 
         self.device = self.runtime.get_device(device)
@@ -4559,6 +4671,11 @@ class HashGrid:
 
 
 class MarchingCubes:
+    def __new__(cls, *args, **kwargs):
+        instance = super(MarchingCubes, cls).__new__(cls)
+        instance.id = None
+        return instance
+
     def __init__(self, nx: int, ny: int, nz: int, max_verts: int, max_tris: int, device=None):
         """CUDA-based Marching Cubes algorithm to extract a 2D surface mesh from a 3D volume.