warp-lang 1.6.2__py3-none-macosx_10_13_universal2.whl → 1.7.0__py3-none-macosx_10_13_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- warp/__init__.py +7 -1
- warp/bin/libwarp-clang.dylib +0 -0
- warp/bin/libwarp.dylib +0 -0
- warp/build.py +410 -0
- warp/build_dll.py +6 -14
- warp/builtins.py +452 -362
- warp/codegen.py +179 -119
- warp/config.py +42 -6
- warp/context.py +490 -271
- warp/dlpack.py +8 -6
- warp/examples/assets/nonuniform.usd +0 -0
- warp/examples/assets/nvidia_logo.png +0 -0
- warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
- warp/examples/core/example_sample_mesh.py +300 -0
- warp/examples/fem/example_apic_fluid.py +1 -1
- warp/examples/fem/example_burgers.py +2 -2
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_distortion_energy.py +1 -1
- warp/examples/fem/example_magnetostatics.py +6 -6
- warp/examples/fem/utils.py +9 -3
- warp/examples/interop/example_jax_callable.py +116 -0
- warp/examples/interop/example_jax_ffi_callback.py +132 -0
- warp/examples/interop/example_jax_kernel.py +205 -0
- warp/examples/optim/example_fluid_checkpoint.py +497 -0
- warp/examples/tile/example_tile_matmul.py +2 -4
- warp/fem/__init__.py +11 -1
- warp/fem/adaptivity.py +4 -4
- warp/fem/field/nodal_field.py +22 -68
- warp/fem/field/virtual.py +62 -23
- warp/fem/geometry/adaptive_nanogrid.py +9 -10
- warp/fem/geometry/closest_point.py +1 -1
- warp/fem/geometry/deformed_geometry.py +5 -2
- warp/fem/geometry/geometry.py +5 -0
- warp/fem/geometry/grid_2d.py +12 -12
- warp/fem/geometry/grid_3d.py +12 -15
- warp/fem/geometry/hexmesh.py +5 -7
- warp/fem/geometry/nanogrid.py +9 -11
- warp/fem/geometry/quadmesh.py +13 -13
- warp/fem/geometry/tetmesh.py +3 -4
- warp/fem/geometry/trimesh.py +3 -8
- warp/fem/integrate.py +262 -93
- warp/fem/linalg.py +5 -5
- warp/fem/quadrature/pic_quadrature.py +37 -22
- warp/fem/quadrature/quadrature.py +194 -25
- warp/fem/space/__init__.py +1 -1
- warp/fem/space/basis_function_space.py +4 -2
- warp/fem/space/basis_space.py +25 -18
- warp/fem/space/hexmesh_function_space.py +2 -2
- warp/fem/space/partition.py +6 -2
- warp/fem/space/quadmesh_function_space.py +8 -8
- warp/fem/space/shape/cube_shape_function.py +23 -23
- warp/fem/space/shape/square_shape_function.py +12 -12
- warp/fem/space/shape/triangle_shape_function.py +1 -1
- warp/fem/space/tetmesh_function_space.py +3 -3
- warp/fem/space/trimesh_function_space.py +2 -2
- warp/fem/utils.py +12 -6
- warp/jax.py +14 -1
- warp/jax_experimental/__init__.py +16 -0
- warp/{jax_experimental.py → jax_experimental/custom_call.py} +14 -27
- warp/jax_experimental/ffi.py +698 -0
- warp/jax_experimental/xla_ffi.py +602 -0
- warp/math.py +89 -0
- warp/native/array.h +13 -0
- warp/native/builtin.h +29 -3
- warp/native/bvh.cpp +3 -1
- warp/native/bvh.cu +42 -14
- warp/native/bvh.h +2 -1
- warp/native/clang/clang.cpp +30 -3
- warp/native/cuda_util.cpp +14 -0
- warp/native/cuda_util.h +2 -0
- warp/native/exports.h +68 -63
- warp/native/intersect.h +26 -26
- warp/native/intersect_adj.h +33 -33
- warp/native/marching.cu +1 -1
- warp/native/mat.h +513 -9
- warp/native/mesh.h +10 -10
- warp/native/quat.h +99 -11
- warp/native/rand.h +6 -0
- warp/native/sort.cpp +122 -59
- warp/native/sort.cu +152 -15
- warp/native/sort.h +8 -1
- warp/native/sparse.cpp +43 -22
- warp/native/sparse.cu +52 -17
- warp/native/svd.h +116 -0
- warp/native/tile.h +301 -105
- warp/native/tile_reduce.h +46 -3
- warp/native/vec.h +68 -7
- warp/native/volume.cpp +85 -113
- warp/native/volume_builder.cu +25 -10
- warp/native/volume_builder.h +6 -0
- warp/native/warp.cpp +5 -6
- warp/native/warp.cu +99 -10
- warp/native/warp.h +19 -10
- warp/optim/linear.py +10 -10
- warp/sim/articulation.py +4 -4
- warp/sim/collide.py +21 -10
- warp/sim/import_mjcf.py +449 -155
- warp/sim/import_urdf.py +32 -12
- warp/sim/integrator_euler.py +5 -5
- warp/sim/integrator_featherstone.py +3 -10
- warp/sim/integrator_vbd.py +207 -2
- warp/sim/integrator_xpbd.py +5 -5
- warp/sim/model.py +42 -13
- warp/sim/utils.py +2 -2
- warp/sparse.py +642 -555
- warp/stubs.py +216 -19
- warp/tests/__main__.py +0 -15
- warp/tests/cuda/__init__.py +0 -0
- warp/tests/{test_mempool.py → cuda/test_mempool.py} +39 -0
- warp/tests/{test_streams.py → cuda/test_streams.py} +71 -0
- warp/tests/geometry/__init__.py +0 -0
- warp/tests/{test_mesh_query_point.py → geometry/test_mesh_query_point.py} +66 -63
- warp/tests/{test_mesh_query_ray.py → geometry/test_mesh_query_ray.py} +1 -1
- warp/tests/{test_volume.py → geometry/test_volume.py} +41 -6
- warp/tests/interop/__init__.py +0 -0
- warp/tests/{test_dlpack.py → interop/test_dlpack.py} +28 -5
- warp/tests/sim/__init__.py +0 -0
- warp/tests/{disabled_kinematics.py → sim/disabled_kinematics.py} +9 -10
- warp/tests/{test_collision.py → sim/test_collision.py} +2 -2
- warp/tests/{test_model.py → sim/test_model.py} +40 -0
- warp/tests/{test_sim_kinematics.py → sim/test_sim_kinematics.py} +2 -1
- warp/tests/sim/test_vbd.py +597 -0
- warp/tests/test_bool.py +1 -1
- warp/tests/test_examples.py +28 -36
- warp/tests/test_fem.py +23 -4
- warp/tests/test_linear_solvers.py +0 -11
- warp/tests/test_mat.py +233 -79
- warp/tests/test_mat_scalar_ops.py +4 -4
- warp/tests/test_overwrite.py +0 -60
- warp/tests/test_quat.py +67 -46
- warp/tests/test_rand.py +44 -37
- warp/tests/test_sparse.py +47 -6
- warp/tests/test_spatial.py +75 -0
- warp/tests/test_static.py +1 -1
- warp/tests/test_utils.py +84 -4
- warp/tests/test_vec.py +46 -34
- warp/tests/tile/__init__.py +0 -0
- warp/tests/{test_tile.py → tile/test_tile.py} +136 -51
- warp/tests/{test_tile_load.py → tile/test_tile_load.py} +1 -1
- warp/tests/{test_tile_mathdx.py → tile/test_tile_mathdx.py} +9 -6
- warp/tests/{test_tile_mlp.py → tile/test_tile_mlp.py} +25 -14
- warp/tests/{test_tile_reduce.py → tile/test_tile_reduce.py} +60 -1
- warp/tests/{test_tile_view.py → tile/test_tile_view.py} +1 -1
- warp/tests/unittest_serial.py +1 -0
- warp/tests/unittest_suites.py +45 -59
- warp/tests/unittest_utils.py +2 -1
- warp/thirdparty/unittest_parallel.py +3 -1
- warp/types.py +110 -658
- warp/utils.py +137 -72
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/METADATA +29 -7
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/RECORD +172 -162
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/WHEEL +1 -1
- warp/examples/optim/example_walker.py +0 -317
- warp/native/cutlass_gemm.cpp +0 -43
- warp/native/cutlass_gemm.cu +0 -382
- warp/tests/test_matmul.py +0 -511
- warp/tests/test_matmul_lite.py +0 -411
- warp/tests/test_vbd.py +0 -386
- warp/tests/unused_test_misc.py +0 -77
- /warp/tests/{test_async.py → cuda/test_async.py} +0 -0
- /warp/tests/{test_ipc.py → cuda/test_ipc.py} +0 -0
- /warp/tests/{test_multigpu.py → cuda/test_multigpu.py} +0 -0
- /warp/tests/{test_peer.py → cuda/test_peer.py} +0 -0
- /warp/tests/{test_pinned.py → cuda/test_pinned.py} +0 -0
- /warp/tests/{test_bvh.py → geometry/test_bvh.py} +0 -0
- /warp/tests/{test_hash_grid.py → geometry/test_hash_grid.py} +0 -0
- /warp/tests/{test_marching_cubes.py → geometry/test_marching_cubes.py} +0 -0
- /warp/tests/{test_mesh.py → geometry/test_mesh.py} +0 -0
- /warp/tests/{test_mesh_query_aabb.py → geometry/test_mesh_query_aabb.py} +0 -0
- /warp/tests/{test_volume_write.py → geometry/test_volume_write.py} +0 -0
- /warp/tests/{test_jax.py → interop/test_jax.py} +0 -0
- /warp/tests/{test_paddle.py → interop/test_paddle.py} +0 -0
- /warp/tests/{test_torch.py → interop/test_torch.py} +0 -0
- /warp/tests/{flaky_test_sim_grad.py → sim/flaky_test_sim_grad.py} +0 -0
- /warp/tests/{test_coloring.py → sim/test_coloring.py} +0 -0
- /warp/tests/{test_sim_grad_bounce_linear.py → sim/test_sim_grad_bounce_linear.py} +0 -0
- /warp/tests/{test_tile_shared_memory.py → tile/test_tile_shared_memory.py} +0 -0
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info/licenses}/LICENSE.md +0 -0
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/top_level.txt +0 -0
warp/types.py
CHANGED
@@ -20,7 +20,21 @@ import ctypes
 import inspect
 import struct
 import zlib
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Generic,
+    List,
+    Literal,
+    NamedTuple,
+    Optional,
+    Sequence,
+    Tuple,
+    TypeVar,
+    Union,
+    get_args,
+    get_origin,
+)

 import numpy as np
 import numpy.typing as npt
@@ -56,7 +70,9 @@ class Transformation(Generic[Float]):


 class Array(Generic[DType]):
-
+    device: Optional[warp.context.Device]
+    dtype: type
+    size: int


 int_tuple_type_hints = {
@@ -1139,7 +1155,7 @@ ARRAY_TYPE_FABRIC_INDEXED = 3
 class launch_bounds_t(ctypes.Structure):
     _fields_ = [("shape", ctypes.c_int32 * LAUNCH_MAX_DIMS), ("ndim", ctypes.c_int32), ("size", ctypes.c_size_t)]

-    def __init__(self, shape):
+    def __init__(self, shape: Union[int, Sequence[int]]):
         if isinstance(shape, int):
             # 1d launch
             self.ndim = 1
@@ -1260,7 +1276,7 @@ _type_size_cache = {
 }


-def type_size_in_bytes(dtype):
+def type_size_in_bytes(dtype: type) -> int:
     size = _type_size_cache.get(dtype)

     if size is None:
@@ -1279,7 +1295,7 @@ def type_size_in_bytes(dtype):
     return size


-def type_to_warp(dtype):
+def type_to_warp(dtype: type) -> type:
     if dtype == float:
         return float32
     elif dtype == int:
@@ -1290,7 +1306,7 @@ def type_to_warp(dtype):
         return dtype


-def type_typestr(dtype):
+def type_typestr(dtype: type) -> str:
     if dtype == bool:
         return "|b1"
     elif dtype == float16:
@@ -1376,29 +1392,29 @@ def type_is_transformation(t):
     return getattr(t, "_wp_generic_type_hint_", None) is Transformation


-value_types = (int, float, builtins.bool) +
+value_types = (int, float, builtins.bool) + scalar_and_bool_types


 # returns true for all value types (int, float, bool, scalars, vectors, matrices)
-def type_is_value(x):
+def type_is_value(x: Any) -> builtins.bool:
     return x in value_types or hasattr(x, "_wp_scalar_type_")


 # equivalent of the above but for values
-def is_int(x):
+def is_int(x: Any) -> builtins.bool:
     return type_is_int(type(x))


-def is_float(x):
+def is_float(x: Any) -> builtins.bool:
     return type_is_float(type(x))


-def is_value(x):
+def is_value(x: Any) -> builtins.bool:
     return type_is_value(type(x))


-
-
+def is_array(a) -> builtins.bool:
+    """Return true if the passed *instance* is one of the array types."""
     return isinstance(a, array_types)


@@ -1465,21 +1481,21 @@ def types_equal(a, b, match_generic=False):
     if a_length is None or b_length is None or a_length == b_length:
         return True

-    a_origin =
-    b_origin =
+    a_origin = get_origin(a)
+    b_origin = get_origin(b)
     if a_origin is tuple and b_origin is tuple:
-        a_args =
-        b_args =
+        a_args = get_args(a)
+        b_args = get_args(b)
         if len(a_args) == len(b_args) and all(
             scalars_equal(x, y, match_generic=match_generic) for x, y in zip(a_args, b_args)
         ):
             return True
     elif a_origin is tuple and isinstance(b, Sequence):
-        a_args =
+        a_args = get_args(a)
         if len(a_args) == len(b) and all(scalars_equal(x, y, match_generic=match_generic) for x, y in zip(a_args, b)):
             return True
     elif b_origin is tuple and isinstance(a, Sequence):
-        b_args =
+        b_args = get_args(b)
         if len(b_args) == len(a) and all(scalars_equal(x, y, match_generic=match_generic) for x, y in zip(b_args, a)):
             return True

@@ -1600,7 +1616,7 @@ def array_ctype_from_interface(interface: dict, dtype=None, owner=None):
     return array_ctype


-class array(Array):
+class array(Array[DType]):
     """A fixed-size multi-dimensional array containing values of the same type.

     Attributes:
@@ -1629,21 +1645,21 @@ class array(Array):

     def __init__(
         self,
-        data:
-        dtype:
-        shape:
+        data: Union[List, Tuple, npt.NDArray, None] = None,
+        dtype: Any = Any,
+        shape: Union[int, Tuple[int, ...], List[int], None] = None,
         strides: Optional[Tuple[int, ...]] = None,
         length: Optional[int] = None,
         ptr: Optional[int] = None,
         capacity: Optional[int] = None,
         device=None,
-        pinned: bool = False,
-        copy: bool = True,
-        owner: bool = False,  # deprecated - pass deleter instead
+        pinned: builtins.bool = False,
+        copy: builtins.bool = True,
+        owner: builtins.bool = False,  # deprecated - pass deleter instead
         deleter: Optional[Callable[[int, int], None]] = None,
         ndim: Optional[int] = None,
         grad: Optional[array] = None,
-        requires_grad: bool = False,
+        requires_grad: builtins.bool = False,
     ):
         """Constructs a new Warp array object

@@ -2939,7 +2955,7 @@ def from_ipc_handle(

 # A base class for non-contiguous arrays, providing the implementation of common methods like
 # contiguous(), to(), numpy(), list(), assign(), zero_(), and fill_().
-class noncontiguous_array_base(
+class noncontiguous_array_base(Array[T]):
     def __init__(self, array_type_id):
         self.type_id = array_type_id
         self.is_contiguous = False
@@ -3036,12 +3052,18 @@ def check_index_array(indices, expected_device):
     raise ValueError(f"Index array device ({indices.device} does not match data array device ({expected_device}))")


-class indexedarray(noncontiguous_array_base
+class indexedarray(noncontiguous_array_base):
     # member attributes available during code-gen (e.g.: d = arr.shape[0])
     # (initialized when needed)
     _vars = None

-    def __init__(
+    def __init__(
+        self,
+        data: Optional[array] = None,
+        indices: Union[array, List[array], None] = None,
+        dtype=None,
+        ndim: Optional[int] = None,
+    ):
         super().__init__(ARRAY_TYPE_INDEXED)

         # canonicalize types
@@ -3232,7 +3254,7 @@ class Tile:
             return f"wp::tile_alloc_empty<{Var.type_to_ctype(self.dtype)},wp::tile_shape_t<{','.join(map(str, self.shape))}>,{'true' if requires_grad else 'false'}>()"
         else:
             # tile will be initialized by another call, e.g.: tile_transpose()
-            return "
+            return "nullptr"

     # return total tile size in bytes
     def size_in_bytes(self):
@@ -3634,7 +3656,7 @@ class Volume:
         instance.id = None
         return instance

-    def __init__(self, data: array, copy: bool = True):
+    def __init__(self, data: array, copy: builtins.bool = True):
         """Class representing a sparse grid.

         Args:
@@ -4361,6 +4383,15 @@ class Volume:
         translation_buf = (ctypes.c_float * 3)(translation[0], translation[1], translation[2])
         return transform_buf, translation_buf

+    # nanovdb types for which we instantiate the grid builder
+    # Should be in sync with WP_VOLUME_BUILDER_INSTANTIATE_TYPES in volume_builder.h
+    _supported_allocation_types = [
+        "int32",
+        "float",
+        "Vec3f",
+        "Vec4f",
+    ]
+
     @classmethod
     def allocate_by_tiles(
         cls,
@@ -4388,7 +4419,8 @@ class Volume:
                 or a floating point scalar type (2D N-by-3 array of :class:`warp.float32` or 1D array of `warp.vec3f` values), indicating world space positions.
                 Repeated points per tile are allowed and will be efficiently deduplicated.
             voxel_size (float or array-like): Voxel size(s) of the new volume. Ignored if `transform` is given.
-            bg_value (array-like,
+            bg_value (array-like, scalar or None): Value of unallocated voxels of the volume, also defines the volume's type. An index volume will be created if `bg_value` is ``None``.
+                Other supported grid types are `int`, `float`, `vec3f`, and `vec4f`.
             translation (array-like): Translation between the index and world spaces.
             transform (array-like): Linear transform between the index and world spaces. If ``None``, deduced from `voxel_size`.
             device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".
@@ -4420,35 +4452,47 @@ class Volume:
                 translation_buf,
                 in_world_space,
             )
-        elif hasattr(bg_value, "__len__"):
-            volume.id = volume.runtime.core.volume_v_from_tiles_device(
-                volume.device.context,
-                ctypes.c_void_p(tile_points.ptr),
-                tile_points.shape[0],
-                transform_buf,
-                translation_buf,
-                in_world_space,
-                (ctypes.c_float * 3)(bg_value[0], bg_value[1], bg_value[2]),
-            )
-        elif isinstance(bg_value, int):
-            volume.id = volume.runtime.core.volume_i_from_tiles_device(
-                volume.device.context,
-                ctypes.c_void_p(tile_points.ptr),
-                tile_points.shape[0],
-                transform_buf,
-                translation_buf,
-                in_world_space,
-                bg_value,
-            )
         else:
-
+            # normalize background value type
+            grid_type = type_to_warp(type(bg_value))
+            if not (is_value(bg_value) or type_is_vector(grid_type)) and (
+                hasattr(bg_value, "__len__") and is_value(bg_value[0])
+            ):
+                # non-warp vectors are considered float, for backward compatibility
+                grid_type = vector(len(bg_value), dtype=float)
+
+            # look for corresponding nvdb type
+            try:
+                nvdb_type = next(
+                    typ
+                    for typ in Volume._supported_allocation_types
+                    if types_equal(grid_type, Volume._nvdb_type_to_dtype[typ])
+                )
+            except StopIteration as err:
+                raise TypeError(
+                    f"Unsupported bg_value type for volume allocation {type_repr(grid_type)}. Supported volume types are {', '.join(Volume._supported_allocation_types)}."
+                ) from err
+
+            # cast to ctype
+            # wrap scalar values in length-1 vectors to handle specific ctype conversion
+            if not type_is_vector(grid_type):
+                grid_type = vector(length=1, dtype=grid_type)
+
+            cvalue = grid_type(bg_value)
+            cvalue_ptr = ctypes.pointer(cvalue)
+            cvalue_size = ctypes.sizeof(cvalue)
+            cvalue_type = nvdb_type.encode("ascii")
+
+            volume.id = volume.runtime.core.volume_from_tiles_device(
                 volume.device.context,
                 ctypes.c_void_p(tile_points.ptr),
                 tile_points.shape[0],
                 transform_buf,
                 translation_buf,
                 in_world_space,
-
+                cvalue_ptr,
+                cvalue_size,
+                cvalue_type,
             )

         if volume.id == 0:
@@ -4606,6 +4650,8 @@ def matmul(
 ):
     """Computes a generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.

+    .. versionremoved:: 1.7
+
     .. deprecated:: 1.6
         Use :doc:`tile primitives </modules/tiles>` instead.

@@ -4619,80 +4665,8 @@ def matmul(
         allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
             while using Tensor Cores
     """
-    from warp.context import runtime
-
-    warp.utils.warn(
-        "wp.matmul() is deprecated and will be removed in a\nfuture version. Use tile primitives instead.",
-        category=DeprecationWarning,
-        stacklevel=2,
-    )
-
-    device = a.device
-
-    if b.device != device or c.device != device or d.device != device:
-        raise RuntimeError("Matrices A, B, C, and D must all be on the same device as the runtime device.")
-
-    if a.dtype != b.dtype or a.dtype != c.dtype or a.dtype != d.dtype:
-        raise RuntimeError(
-            "wp.matmul currently only supports operation between {A, B, C, D} matrices of the same type."
-        )
-
-    if (
-        (not a.is_contiguous and not a.is_transposed)
-        or (not b.is_contiguous and not b.is_transposed)
-        or (not c.is_contiguous)
-        or (not d.is_contiguous)
-    ):
-        raise RuntimeError(
-            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B may be transposed."
-        )

-
-    n = b.shape[1]
-    k = a.shape[1]
-    if b.shape != (k, n) or c.shape != (m, n) or d.shape != (m, n):
-        raise RuntimeError(
-            "Invalid shapes for matrices: A = {} B = {} C = {} D = {}".format(a.shape, b.shape, c.shape, d.shape)
-        )
-
-    if runtime.tape:
-        runtime.tape.record_func(
-            backward=lambda: adj_matmul(a, b, c, a.grad, b.grad, c.grad, d.grad, alpha, beta, allow_tf32x3_arith),
-            arrays=[a, b, c, d],
-        )
-    if warp.config.verify_autograd_array_access:
-        d.mark_write()
-        a.mark_read()
-        b.mark_read()
-        c.mark_read()
-
-    # cpu fallback if no cuda devices found
-    if device == "cpu":
-        np_dtype = warp_type_to_np_dtype[a.dtype]
-        d.assign(alpha * np.matmul(a.numpy(), b.numpy(), dtype=np_dtype) + beta * c.numpy())
-        return
-
-    cc = device.arch
-    ret = runtime.core.cutlass_gemm(
-        device.context,
-        cc,
-        m,
-        n,
-        k,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(a.ptr),
-        ctypes.c_void_p(b.ptr),
-        ctypes.c_void_p(c.ptr),
-        ctypes.c_void_p(d.ptr),
-        alpha,
-        beta,
-        not a.is_transposed,
-        not b.is_transposed,
-        allow_tf32x3_arith,
-        1,
-    )
-    if not ret:
-        raise RuntimeError("matmul failed.")
+    raise RuntimeError("This function has been removed. Use tile primitives instead.")


 def adj_matmul(
@@ -4724,171 +4698,8 @@ def adj_matmul(
         allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
             while using Tensor Cores
     """
-    from warp.context import runtime
-
-    device = a.device
-
-    if (
-        b.device != device
-        or c.device != device
-        or adj_a.device != device
-        or adj_b.device != device
-        or adj_c.device != device
-        or adj_d.device != device
-    ):
-        raise RuntimeError(
-            "Matrices A, B, C, D, and their adjoints must all be on the same device as the runtime device."
-        )
-
-    if (
-        a.dtype != b.dtype
-        or a.dtype != c.dtype
-        or a.dtype != adj_a.dtype
-        or a.dtype != adj_b.dtype
-        or a.dtype != adj_c.dtype
-        or a.dtype != adj_d.dtype
-    ):
-        raise RuntimeError(
-            "wp.adj_matmul currently only supports operation between {A, B, C, adj_D, adj_A, adj_B, adj_C} matrices of the same type."
-        )
-
-    if (
-        (not a.is_contiguous and not a.is_transposed)
-        or (not b.is_contiguous and not b.is_transposed)
-        or (not c.is_contiguous)
-        or (not adj_a.is_contiguous and not adj_a.is_transposed)
-        or (not adj_b.is_contiguous and not adj_b.is_transposed)
-        or (not adj_c.is_contiguous)
-        or (not adj_d.is_contiguous)
-    ):
-        raise RuntimeError(
-            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B and their associated adjoints may be transposed."
-        )

-
-    n = b.shape[1]
-    k = a.shape[1]
-    if (
-        a.shape != (m, k)
-        or b.shape != (k, n)
-        or c.shape != (m, n)
-        or adj_d.shape != (m, n)
-        or adj_a.shape != (m, k)
-        or adj_b.shape != (k, n)
-        or adj_c.shape != (m, n)
-    ):
-        raise RuntimeError(
-            "Invalid shapes for matrices: A = {} B = {} C = {} adj_D = {} adj_A = {} adj_B = {} adj_C = {}".format(
-                a.shape, b.shape, c.shape, adj_d.shape, adj_a.shape, adj_b.shape, adj_c.shape
-            )
-        )
-
-    # cpu fallback if no cuda devices found
-    if device == "cpu":
-        np_dtype = warp_type_to_np_dtype[a.dtype]
-        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose(), dtype=np_dtype) + adj_a.numpy())
-        adj_b.assign(alpha * np.matmul(a.numpy().transpose(), adj_d.numpy(), dtype=np_dtype) + adj_b.numpy())
-        adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
-        return
-
-    cc = device.arch
-
-    # adj_a
-    if not a.is_transposed:
-        ret = runtime.core.cutlass_gemm(
-            device.context,
-            cc,
-            m,
-            k,
-            n,
-            type_typestr(a.dtype).encode(),
-            ctypes.c_void_p(adj_d.ptr),
-            ctypes.c_void_p(b.ptr),
-            ctypes.c_void_p(adj_a.ptr),
-            ctypes.c_void_p(adj_a.ptr),
-            alpha,
-            1.0,
-            True,
-            b.is_transposed,
-            allow_tf32x3_arith,
-            1,
-        )
-        if not ret:
-            raise RuntimeError("adj_matmul failed.")
-    else:
-        ret = runtime.core.cutlass_gemm(
-            device.context,
-            cc,
-            k,
-            m,
-            n,
-            type_typestr(a.dtype).encode(),
-            ctypes.c_void_p(b.ptr),
-            ctypes.c_void_p(adj_d.ptr),
-            ctypes.c_void_p(adj_a.ptr),
-            ctypes.c_void_p(adj_a.ptr),
-            alpha,
-            1.0,
-            not b.is_transposed,
-            False,
-            allow_tf32x3_arith,
-            1,
-        )
-        if not ret:
-            raise RuntimeError("adj_matmul failed.")
-
-    # adj_b
-    if not b.is_transposed:
-        ret = runtime.core.cutlass_gemm(
-            device.context,
-            cc,
-            k,
-            n,
-            m,
-            type_typestr(a.dtype).encode(),
-            ctypes.c_void_p(a.ptr),
-            ctypes.c_void_p(adj_d.ptr),
-            ctypes.c_void_p(adj_b.ptr),
-            ctypes.c_void_p(adj_b.ptr),
-            alpha,
-            1.0,
-            a.is_transposed,
-            True,
-            allow_tf32x3_arith,
-            1,
-        )
-        if not ret:
-            raise RuntimeError("adj_matmul failed.")
-    else:
-        ret = runtime.core.cutlass_gemm(
-            device.context,
-            cc,
-            n,
-            k,
-            m,
-            type_typestr(a.dtype).encode(),
-            ctypes.c_void_p(adj_d.ptr),
-            ctypes.c_void_p(a.ptr),
-            ctypes.c_void_p(adj_b.ptr),
-            ctypes.c_void_p(adj_b.ptr),
-            alpha,
-            1.0,
-            False,
-            not a.is_transposed,
-            allow_tf32x3_arith,
-            1,
-        )
-        if not ret:
-            raise RuntimeError("adj_matmul failed.")
-
-    # adj_c
-    warp.launch(
-        kernel=warp.utils.add_kernel_2d,
-        dim=adj_c.shape,
-        inputs=[adj_c, adj_d, adj_d.dtype(beta)],
-        device=device,
-        record_tape=False,
-    )
+    raise RuntimeError("This function has been removed. Use tile primitives instead.")


 def batched_matmul(
@@ -4902,6 +4713,8 @@ def batched_matmul(
 ):
     """Computes a batched generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.

+    .. versionremoved:: 1.7
+
     .. deprecated:: 1.6
         Use :doc:`tile primitives </modules/tiles>` instead.

@@ -4915,107 +4728,8 @@ def batched_matmul(
         allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
             while using Tensor Cores
     """
-    from warp.context import runtime
-
-    device = a.device
-
-    if b.device != device or c.device != device or d.device != device:
-        raise RuntimeError("Matrices A, B, C, and D must all be on the same device as the runtime device.")
-
-    if a.dtype != b.dtype or a.dtype != c.dtype or a.dtype != d.dtype:
-        raise RuntimeError(
-            "wp.batched_matmul currently only supports operation between {A, B, C, D} matrices of the same type."
-        )
-
-    if (
-        (not a.is_contiguous and not a.is_transposed)
-        or (not b.is_contiguous and not b.is_transposed)
-        or (not c.is_contiguous)
-        or (not d.is_contiguous)
-    ):
-        raise RuntimeError(
-            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B may be transposed."
-        )
-
-    m = a.shape[1]
-    n = b.shape[2]
-    k = a.shape[2]
-    batch_count = a.shape[0]
-    if b.shape != (batch_count, k, n) or c.shape != (batch_count, m, n) or d.shape != (batch_count, m, n):
-        raise RuntimeError(
-            "Invalid shapes for matrices: A = {} B = {} C = {} D = {}".format(a.shape, b.shape, c.shape, d.shape)
-        )

-
-    runtime.tape.record_func(
-        backward=lambda: adj_batched_matmul(
-            a, b, c, a.grad, b.grad, c.grad, d.grad, alpha, beta, allow_tf32x3_arith
-        ),
-        arrays=[a, b, c, d],
-    )
-    if warp.config.verify_autograd_array_access:
-        d.mark_write()
-        a.mark_read()
-        b.mark_read()
-        c.mark_read()
-
-    # cpu fallback if no cuda devices found
-    if device == "cpu":
-        np_dtype = warp_type_to_np_dtype[a.dtype]
-        d.assign(alpha * np.matmul(a.numpy(), b.numpy(), dtype=np_dtype) + beta * c.numpy())
-        return
-
-    # handle case in which batch_count exceeds max_batch_count, which is a CUDA array size maximum
-    max_batch_count = 65535
-    iters = int(batch_count / max_batch_count)
-    remainder = batch_count % max_batch_count
-
-    cc = device.arch
-    for i in range(iters):
-        idx_start = i * max_batch_count
-        idx_end = (i + 1) * max_batch_count if i < iters - 1 else batch_count
-        ret = runtime.core.cutlass_gemm(
-            device.context,
-            cc,
-            m,
-            n,
-            k,
-            type_typestr(a.dtype).encode(),
-            ctypes.c_void_p(a[idx_start:idx_end, :, :].ptr),
-            ctypes.c_void_p(b[idx_start:idx_end, :, :].ptr),
-            ctypes.c_void_p(c[idx_start:idx_end, :, :].ptr),
-            ctypes.c_void_p(d[idx_start:idx_end, :, :].ptr),
-            alpha,
-            beta,
-            not a.is_transposed,
-            not b.is_transposed,
-            allow_tf32x3_arith,
-            max_batch_count,
-        )
-        if not ret:
-            raise RuntimeError("Batched matmul failed.")
-
-    idx_start = iters * max_batch_count
-    ret = runtime.core.cutlass_gemm(
-        device.context,
-        cc,
-        m,
-        n,
-        k,
-        type_typestr(a.dtype).encode(),
-        ctypes.c_void_p(a[idx_start:, :, :].ptr),
-        ctypes.c_void_p(b[idx_start:, :, :].ptr),
-        ctypes.c_void_p(c[idx_start:, :, :].ptr),
-        ctypes.c_void_p(d[idx_start:, :, :].ptr),
-        alpha,
-        beta,
-        not a.is_transposed,
-        not b.is_transposed,
-        allow_tf32x3_arith,
-        remainder,
-    )
-    if not ret:
-        raise RuntimeError("Batched matmul failed.")
+    raise RuntimeError("This function has been removed. Use tile primitives instead.")


 def adj_batched_matmul(
@@ -5045,270 +4759,8 @@ def adj_batched_matmul(
         allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
             while using Tensor Cores
     """
-    from warp.context import runtime

-
-
-    if (
-        b.device != device
-        or c.device != device
-        or adj_a.device != device
-        or adj_b.device != device
-        or adj_c.device != device
-        or adj_d.device != device
-    ):
-        raise RuntimeError(
-            "Matrices A, B, C, D, and their adjoints must all be on the same device as the runtime device."
-        )
-
-    if (
-        a.dtype != b.dtype
-        or a.dtype != c.dtype
-        or a.dtype != adj_a.dtype
-        or a.dtype != adj_b.dtype
-        or a.dtype != adj_c.dtype
-        or a.dtype != adj_d.dtype
-    ):
-        raise RuntimeError(
-            "wp.adj_batched_matmul currently only supports operation between {A, B, C, adj_D, adj_A, adj_B, adj_C} matrices of the same type."
-        )
-
-    m = a.shape[1]
-    n = b.shape[2]
-    k = a.shape[2]
-    batch_count = a.shape[0]
-    if (
-        b.shape != (batch_count, k, n)
-        or c.shape != (batch_count, m, n)
-        or adj_d.shape != (batch_count, m, n)
-        or adj_a.shape != (batch_count, m, k)
-        or adj_b.shape != (batch_count, k, n)
-        or adj_c.shape != (batch_count, m, n)
-    ):
-        raise RuntimeError(
-            "Invalid shapes for matrices: A = {} B = {} C = {} adj_D = {} adj_A = {} adj_B = {} adj_C = {}".format(
-                a.shape, b.shape, c.shape, adj_d.shape, adj_a.shape, adj_b.shape, adj_c.shape
-            )
-        )
-
-    if (
-        (not a.is_contiguous and not a.is_transposed)
-        or (not b.is_contiguous and not b.is_transposed)
-        or (not c.is_contiguous)
-        or (not adj_a.is_contiguous and not adj_a.is_transposed)
-        or (not adj_b.is_contiguous and not adj_b.is_transposed)
-        or (not adj_c.is_contiguous)
-        or (not adj_d.is_contiguous)
-    ):
-        raise RuntimeError(
-            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B and their associated adjoints may be transposed."
-        )
-
-    # cpu fallback if no cuda devices found
-    if device == "cpu":
-        np_dtype = warp_type_to_np_dtype[a.dtype]
-        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose((0, 2, 1)), dtype=np_dtype) + adj_a.numpy())
-        adj_b.assign(alpha * np.matmul(a.numpy().transpose((0, 2, 1)), adj_d.numpy(), dtype=np_dtype) + adj_b.numpy())
-        adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
-        return
-
-    # handle case in which batch_count exceeds max_batch_count, which is a CUDA array size maximum
-    max_batch_count = 65535
-    iters = int(batch_count / max_batch_count)
-    remainder = batch_count % max_batch_count
-
-    cc = device.arch
-
-    for i in range(iters):
-        idx_start = i * max_batch_count
-        idx_end = (i + 1) * max_batch_count if i < iters - 1 else batch_count
-
-        # adj_a
-        if not a.is_transposed:
-            ret = runtime.core.cutlass_gemm(
-                device.context,
-                cc,
-                m,
-                k,
-                n,
-                type_typestr(a.dtype).encode(),
-                ctypes.c_void_p(adj_d[idx_start:idx_end, :, :].ptr),
-                ctypes.c_void_p(b[idx_start:idx_end, :, :].ptr),
-                ctypes.c_void_p(adj_a[idx_start:idx_end, :, :].ptr),
-                ctypes.c_void_p(adj_a[idx_start:idx_end, :, :].ptr),
-                alpha,
-                1.0,
-                True,
-                b.is_transposed,
-                allow_tf32x3_arith,
-                max_batch_count,
-            )
-            if not ret:
-                raise RuntimeError("adj_matmul failed.")
-        else:
-            ret = runtime.core.cutlass_gemm(
-                device.context,
-                cc,
-                k,
-                m,
-                n,
-                type_typestr(a.dtype).encode(),
-                ctypes.c_void_p(b[idx_start:idx_end, :, :].ptr),
-                ctypes.c_void_p(adj_d[idx_start:idx_end, :, :].ptr),
-                ctypes.c_void_p(adj_a[idx_start:idx_end, :, :].ptr),
-                ctypes.c_void_p(adj_a[idx_start:idx_end, :, :].ptr),
-                alpha,
-                1.0,
-                not b.is_transposed,
-                False,
-                allow_tf32x3_arith,
-                max_batch_count,
-            )
-            if not ret:
-                raise RuntimeError("adj_matmul failed.")
-
-        # adj_b
-        if not b.is_transposed:
-            ret = runtime.core.cutlass_gemm(
-                device.context,
-                cc,
-                k,
-                n,
-                m,
-                type_typestr(a.dtype).encode(),
-                ctypes.c_void_p(a[idx_start:idx_end, :, :].ptr),
-                ctypes.c_void_p(adj_d[idx_start:idx_end, :, :].ptr),
-                ctypes.c_void_p(adj_b[idx_start:idx_end, :, :].ptr),
-                ctypes.c_void_p(adj_b[idx_start:idx_end, :, :].ptr),
-                alpha,
-                1.0,
-                a.is_transposed,
-                True,
-                allow_tf32x3_arith,
-                max_batch_count,
-            )
-            if not ret:
-                raise RuntimeError("adj_matmul failed.")
-        else:
-            ret = runtime.core.cutlass_gemm(
-                device.context,
-                cc,
-                n,
-                k,
-                m,
-                type_typestr(a.dtype).encode(),
-                ctypes.c_void_p(adj_d[idx_start:idx_end, :, :].ptr),
-                ctypes.c_void_p(a[idx_start:idx_end, :, :].ptr),
-                ctypes.c_void_p(adj_b[idx_start:idx_end, :, :].ptr),
-                ctypes.c_void_p(adj_b[idx_start:idx_end, :, :].ptr),
-                alpha,
-                1.0,
-                False,
-                not a.is_transposed,
-                allow_tf32x3_arith,
-                max_batch_count,
-            )
-            if not ret:
-                raise RuntimeError("adj_matmul failed.")
-
-    idx_start = iters * max_batch_count
-
-    # adj_a
-    if not a.is_transposed:
-        ret = runtime.core.cutlass_gemm(
-            device.context,
-            cc,
-            m,
-            k,
-            n,
-            type_typestr(a.dtype).encode(),
-            ctypes.c_void_p(adj_d[idx_start:, :, :].ptr),
-            ctypes.c_void_p(b[idx_start:, :, :].ptr),
-            ctypes.c_void_p(adj_a[idx_start:, :, :].ptr),
-            ctypes.c_void_p(adj_a[idx_start:, :, :].ptr),
-            alpha,
-            1.0,
-            True,
-            b.is_transposed,
-            allow_tf32x3_arith,
-            remainder,
-        )
-        if not ret:
-            raise RuntimeError("adj_matmul failed.")
-    else:
-        ret = runtime.core.cutlass_gemm(
-            device.context,
-            cc,
-            k,
-            m,
-            n,
-            type_typestr(a.dtype).encode(),
-            ctypes.c_void_p(b[idx_start:, :, :].ptr),
-            ctypes.c_void_p(adj_d[idx_start:, :, :].ptr),
-            ctypes.c_void_p(adj_a[idx_start:, :, :].ptr),
-            ctypes.c_void_p(adj_a[idx_start:, :, :].ptr),
-            alpha,
-            1.0,
-            not b.is_transposed,
-            False,
-            allow_tf32x3_arith,
-            remainder,
-        )
-        if not ret:
-            raise RuntimeError("adj_matmul failed.")
-
-    # adj_b
-    if not b.is_transposed:
-        ret = runtime.core.cutlass_gemm(
-            device.context,
-            cc,
-            k,
-            n,
-            m,
-            type_typestr(a.dtype).encode(),
-            ctypes.c_void_p(a[idx_start:, :, :].ptr),
-            ctypes.c_void_p(adj_d[idx_start:, :, :].ptr),
-            ctypes.c_void_p(adj_b[idx_start:, :, :].ptr),
-            ctypes.c_void_p(adj_b[idx_start:, :, :].ptr),
-            alpha,
-            1.0,
-            a.is_transposed,
-            True,
-            allow_tf32x3_arith,
-            remainder,
-        )
-        if not ret:
-            raise RuntimeError("adj_matmul failed.")
-    else:
-        ret = runtime.core.cutlass_gemm(
-            device.context,
-            cc,
-            n,
-            k,
-            m,
-            type_typestr(a.dtype).encode(),
-            ctypes.c_void_p(adj_d[idx_start:, :, :].ptr),
-            ctypes.c_void_p(a[idx_start:, :, :].ptr),
-            ctypes.c_void_p(adj_b[idx_start:, :, :].ptr),
-            ctypes.c_void_p(adj_b[idx_start:, :, :].ptr),
-            alpha,
-            1.0,
-            False,
-            not a.is_transposed,
-            allow_tf32x3_arith,
-            remainder,
-        )
-        if not ret:
-            raise RuntimeError("adj_matmul failed.")
-
-    # adj_c
-    warp.launch(
-        kernel=warp.utils.add_kernel_3d,
-        dim=adj_c.shape,
-        inputs=[adj_c, adj_d, adj_d.dtype(beta)],
-        device=device,
-        record_tape=False,
-    )
+    raise RuntimeError("This function has been removed. Use tile primitives instead.")


 class HashGrid:
@@ -5691,7 +5143,7 @@ simple_type_codes = {
 }


-def get_type_code(arg_type):
+def get_type_code(arg_type: type) -> str:
     if arg_type == Any:
         # special case for generics
         # note: since Python 3.11 Any is a type, so we check for it first
@@ -5755,8 +5207,8 @@ def get_type_code(arg_type):
     raise TypeError(f"Unrecognized type '{arg_type}'")


-def get_signature(arg_types, func_name=None, arg_names=None):
-    type_codes = []
+def get_signature(arg_types: List[type], func_name: Optional[str] = None, arg_names: Optional[List[str]] = None) -> str:
+    type_codes: List[str] = []
     for i, arg_type in enumerate(arg_types):
         try:
             type_codes.append(get_type_code(arg_type))
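
The wp.matmul(), adj_matmul(), batched_matmul(), and adj_batched_matmul() bodies above are reduced in 1.7.0 to a RuntimeError pointing at tile primitives. The sketch below is a migration reference only, not code taken from the package: it assumes the shape/offset keyword form of wp.tile_load()/wp.tile_store() and matrix dimensions that are multiples of the tile sizes; see warp/examples/tile/example_tile_matmul.py in this release for the canonical version and exact signatures.

import warp as wp

# tile sizes are compile-time constants
TILE_M = wp.constant(16)
TILE_N = wp.constant(16)
TILE_K = wp.constant(8)


@wp.kernel
def tile_gemm(A: wp.array2d(dtype=wp.float32), B: wp.array2d(dtype=wp.float32), C: wp.array2d(dtype=wp.float32)):
    # with wp.launch_tiled(), one block of threads cooperates on each (i, j) output tile
    i, j = wp.tid()
    acc = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=wp.float32)
    count = int(A.shape[1] / TILE_K)
    for k in range(count):
        a = wp.tile_load(A, shape=(TILE_M, TILE_K), offset=(i * TILE_M, k * TILE_K))
        b = wp.tile_load(B, shape=(TILE_K, TILE_N), offset=(k * TILE_K, j * TILE_N))
        wp.tile_matmul(a, b, acc)  # acc += a @ b
    wp.tile_store(C, acc, offset=(i * TILE_M, j * TILE_N))


# launch sketch: one block per output tile (M and N assumed divisible by the tile sizes)
# wp.launch_tiled(tile_gemm, dim=[M // TILE_M, N // TILE_N], inputs=[a_arr, b_arr, c_arr], block_dim=64, device="cuda:0")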
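The Volume.allocate_by_tiles() hunk above also collapses the separate volume_f/_v/_i_from_tiles_device entry points into a single volume_from_tiles_device call, with bg_value now selecting the grid type (int, float, vec3f, vec4f, or None for an index grid). The following usage sketch is illustrative: it assumes the keyword names shown in the docstring above and integer ijk tile coordinates, and is not copied from the package.

import numpy as np
import warp as wp

wp.init()
device = "cuda:0"

# tile origins in index space: an N-by-3 int32 array, one row per tile
tile_points = wp.array(np.array([[0, 0, 0], [8, 0, 0]], dtype=np.int32), dtype=wp.int32, device=device)

# bg_value defines both the background value and the grid type (int, float, vec3f, vec4f)
vol_f = wp.Volume.allocate_by_tiles(tile_points, voxel_size=0.1, bg_value=0.0, device=device)
vol_v = wp.Volume.allocate_by_tiles(tile_points, voxel_size=0.1, bg_value=wp.vec3f(0.0, 0.0, 0.0), device=device)

# bg_value=None creates an index volume with no stored background value
vol_idx = wp.Volume.allocate_by_tiles(tile_points, voxel_size=0.1, bg_value=None, device=device)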