warp-lang 1.7.2-py3-none-manylinux_2_34_aarch64.whl → 1.8.0-py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of warp-lang might be problematic.
Files changed (180)
  1. warp/__init__.py +3 -1
  2. warp/__init__.pyi +3489 -1
  3. warp/autograd.py +45 -122
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +241 -252
  6. warp/build_dll.py +125 -26
  7. warp/builtins.py +1907 -384
  8. warp/codegen.py +257 -101
  9. warp/config.py +12 -1
  10. warp/constants.py +1 -1
  11. warp/context.py +657 -223
  12. warp/dlpack.py +1 -1
  13. warp/examples/benchmarks/benchmark_cloth.py +2 -2
  14. warp/examples/benchmarks/benchmark_tile_sort.py +155 -0
  15. warp/examples/core/example_sample_mesh.py +1 -1
  16. warp/examples/core/example_spin_lock.py +93 -0
  17. warp/examples/core/example_work_queue.py +118 -0
  18. warp/examples/fem/example_adaptive_grid.py +5 -5
  19. warp/examples/fem/example_apic_fluid.py +1 -1
  20. warp/examples/fem/example_burgers.py +1 -1
  21. warp/examples/fem/example_convection_diffusion.py +9 -6
  22. warp/examples/fem/example_darcy_ls_optimization.py +489 -0
  23. warp/examples/fem/example_deformed_geometry.py +1 -1
  24. warp/examples/fem/example_diffusion.py +2 -2
  25. warp/examples/fem/example_diffusion_3d.py +1 -1
  26. warp/examples/fem/example_distortion_energy.py +1 -1
  27. warp/examples/fem/example_elastic_shape_optimization.py +387 -0
  28. warp/examples/fem/example_magnetostatics.py +5 -3
  29. warp/examples/fem/example_mixed_elasticity.py +5 -3
  30. warp/examples/fem/example_navier_stokes.py +11 -9
  31. warp/examples/fem/example_nonconforming_contact.py +5 -3
  32. warp/examples/fem/example_streamlines.py +8 -3
  33. warp/examples/fem/utils.py +9 -8
  34. warp/examples/interop/example_jax_ffi_callback.py +2 -2
  35. warp/examples/optim/example_drone.py +1 -1
  36. warp/examples/sim/example_cloth.py +1 -1
  37. warp/examples/sim/example_cloth_self_contact.py +48 -54
  38. warp/examples/tile/example_tile_block_cholesky.py +502 -0
  39. warp/examples/tile/example_tile_cholesky.py +2 -1
  40. warp/examples/tile/example_tile_convolution.py +1 -1
  41. warp/examples/tile/example_tile_filtering.py +1 -1
  42. warp/examples/tile/example_tile_matmul.py +1 -1
  43. warp/examples/tile/example_tile_mlp.py +2 -0
  44. warp/fabric.py +7 -7
  45. warp/fem/__init__.py +5 -0
  46. warp/fem/adaptivity.py +1 -1
  47. warp/fem/cache.py +152 -63
  48. warp/fem/dirichlet.py +2 -2
  49. warp/fem/domain.py +136 -6
  50. warp/fem/field/field.py +141 -99
  51. warp/fem/field/nodal_field.py +85 -39
  52. warp/fem/field/virtual.py +97 -52
  53. warp/fem/geometry/adaptive_nanogrid.py +91 -86
  54. warp/fem/geometry/closest_point.py +13 -0
  55. warp/fem/geometry/deformed_geometry.py +102 -40
  56. warp/fem/geometry/element.py +56 -2
  57. warp/fem/geometry/geometry.py +323 -22
  58. warp/fem/geometry/grid_2d.py +157 -62
  59. warp/fem/geometry/grid_3d.py +116 -20
  60. warp/fem/geometry/hexmesh.py +86 -20
  61. warp/fem/geometry/nanogrid.py +166 -86
  62. warp/fem/geometry/partition.py +59 -25
  63. warp/fem/geometry/quadmesh.py +86 -135
  64. warp/fem/geometry/tetmesh.py +47 -119
  65. warp/fem/geometry/trimesh.py +77 -270
  66. warp/fem/integrate.py +107 -52
  67. warp/fem/linalg.py +25 -58
  68. warp/fem/operator.py +124 -27
  69. warp/fem/quadrature/pic_quadrature.py +36 -14
  70. warp/fem/quadrature/quadrature.py +40 -16
  71. warp/fem/space/__init__.py +1 -1
  72. warp/fem/space/basis_function_space.py +66 -46
  73. warp/fem/space/basis_space.py +17 -4
  74. warp/fem/space/dof_mapper.py +1 -1
  75. warp/fem/space/function_space.py +2 -2
  76. warp/fem/space/grid_2d_function_space.py +4 -1
  77. warp/fem/space/hexmesh_function_space.py +4 -2
  78. warp/fem/space/nanogrid_function_space.py +3 -1
  79. warp/fem/space/partition.py +11 -2
  80. warp/fem/space/quadmesh_function_space.py +4 -1
  81. warp/fem/space/restriction.py +5 -2
  82. warp/fem/space/shape/__init__.py +10 -8
  83. warp/fem/space/tetmesh_function_space.py +4 -1
  84. warp/fem/space/topology.py +52 -21
  85. warp/fem/space/trimesh_function_space.py +4 -1
  86. warp/fem/utils.py +53 -8
  87. warp/jax.py +1 -2
  88. warp/jax_experimental/ffi.py +12 -17
  89. warp/jax_experimental/xla_ffi.py +37 -24
  90. warp/math.py +171 -1
  91. warp/native/array.h +99 -0
  92. warp/native/builtin.h +174 -31
  93. warp/native/coloring.cpp +1 -1
  94. warp/native/exports.h +118 -63
  95. warp/native/intersect.h +3 -3
  96. warp/native/mat.h +5 -10
  97. warp/native/mathdx.cpp +11 -5
  98. warp/native/matnn.h +1 -123
  99. warp/native/quat.h +28 -4
  100. warp/native/sparse.cpp +121 -258
  101. warp/native/sparse.cu +181 -274
  102. warp/native/spatial.h +305 -17
  103. warp/native/tile.h +583 -72
  104. warp/native/tile_radix_sort.h +1108 -0
  105. warp/native/tile_reduce.h +237 -2
  106. warp/native/tile_scan.h +240 -0
  107. warp/native/tuple.h +189 -0
  108. warp/native/vec.h +6 -16
  109. warp/native/warp.cpp +36 -4
  110. warp/native/warp.cu +574 -51
  111. warp/native/warp.h +47 -74
  112. warp/optim/linear.py +5 -1
  113. warp/paddle.py +7 -8
  114. warp/py.typed +0 -0
  115. warp/render/render_opengl.py +58 -29
  116. warp/render/render_usd.py +124 -61
  117. warp/sim/__init__.py +9 -0
  118. warp/sim/collide.py +252 -78
  119. warp/sim/graph_coloring.py +8 -1
  120. warp/sim/import_mjcf.py +4 -3
  121. warp/sim/import_usd.py +11 -7
  122. warp/sim/integrator.py +5 -2
  123. warp/sim/integrator_euler.py +1 -1
  124. warp/sim/integrator_featherstone.py +1 -1
  125. warp/sim/integrator_vbd.py +751 -320
  126. warp/sim/integrator_xpbd.py +1 -1
  127. warp/sim/model.py +265 -260
  128. warp/sim/utils.py +10 -7
  129. warp/sparse.py +303 -166
  130. warp/tape.py +52 -51
  131. warp/tests/cuda/test_conditional_captures.py +1046 -0
  132. warp/tests/cuda/test_streams.py +1 -1
  133. warp/tests/geometry/test_volume.py +2 -2
  134. warp/tests/interop/test_dlpack.py +9 -9
  135. warp/tests/interop/test_jax.py +0 -1
  136. warp/tests/run_coverage_serial.py +1 -1
  137. warp/tests/sim/disabled_kinematics.py +2 -2
  138. warp/tests/sim/{test_vbd.py → test_cloth.py} +296 -113
  139. warp/tests/sim/test_collision.py +159 -51
  140. warp/tests/sim/test_coloring.py +15 -1
  141. warp/tests/test_array.py +254 -2
  142. warp/tests/test_array_reduce.py +2 -2
  143. warp/tests/test_atomic_cas.py +299 -0
  144. warp/tests/test_codegen.py +142 -19
  145. warp/tests/test_conditional.py +47 -1
  146. warp/tests/test_ctypes.py +0 -20
  147. warp/tests/test_devices.py +8 -0
  148. warp/tests/test_fabricarray.py +4 -2
  149. warp/tests/test_fem.py +58 -25
  150. warp/tests/test_func.py +42 -1
  151. warp/tests/test_grad.py +1 -1
  152. warp/tests/test_lerp.py +1 -3
  153. warp/tests/test_map.py +481 -0
  154. warp/tests/test_mat.py +1 -24
  155. warp/tests/test_quat.py +6 -15
  156. warp/tests/test_rounding.py +10 -38
  157. warp/tests/test_runlength_encode.py +7 -7
  158. warp/tests/test_smoothstep.py +1 -1
  159. warp/tests/test_sparse.py +51 -2
  160. warp/tests/test_spatial.py +507 -1
  161. warp/tests/test_struct.py +2 -2
  162. warp/tests/test_tuple.py +265 -0
  163. warp/tests/test_types.py +2 -2
  164. warp/tests/test_utils.py +24 -18
  165. warp/tests/tile/test_tile.py +420 -1
  166. warp/tests/tile/test_tile_mathdx.py +518 -14
  167. warp/tests/tile/test_tile_reduce.py +213 -0
  168. warp/tests/tile/test_tile_shared_memory.py +130 -1
  169. warp/tests/tile/test_tile_sort.py +117 -0
  170. warp/tests/unittest_suites.py +4 -6
  171. warp/types.py +462 -308
  172. warp/utils.py +647 -86
  173. {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/METADATA +20 -6
  174. {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/RECORD +177 -165
  175. warp/stubs.py +0 -3381
  176. warp/tests/sim/test_xpbd.py +0 -399
  177. warp/tests/test_mlp.py +0 -282
  178. {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/WHEEL +0 -0
  179. {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/licenses/LICENSE.md +0 -0
  180. {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/top_level.txt +0 -0
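The heaviest rewrite in this release is warp/sparse.py (+303 -166), shown next. Two user-visible changes stand out in that diff: BsrMatrix gains a read-only `requires_grad` property that mirrors its `values` array, and `bsr_set_from_triplets()` accepts an optional single-element `count` array holding the triplet count. A minimal sketch of the new surface, written against the 1.8.0 API as it appears below (the data is illustrative only):

    import warp as wp
    from warp.sparse import bsr_from_triplets, bsr_set_from_triplets, bsr_zeros

    wp.init()

    # three 2x2 blocks of a 2x2 block matrix; values participate in autodiff
    rows = wp.array([0, 1, 1], dtype=wp.int32)
    cols = wp.array([0, 0, 1], dtype=wp.int32)
    vals = wp.zeros(3, dtype=wp.mat22, requires_grad=True)

    A = bsr_from_triplets(2, 2, rows, cols, vals, prune_numerical_zeros=False)
    print(A.requires_grad)  # True in 1.8.0: propagated from the values array

    # new in 1.8.0: the triplet count may be passed as a single-element int32 array
    count = wp.array([3], dtype=wp.int32)
    B = bsr_zeros(2, 2, block_type=wp.mat22)
    bsr_set_from_triplets(B, rows, cols, vals, count=count)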
warp/sparse.py CHANGED
@@ -28,9 +28,10 @@ from warp.types import (
     is_array,
     scalar_types,
     type_is_matrix,
-    type_length,
     type_repr,
     type_scalar_type,
+    type_size,
+    type_size_in_bytes,
     type_to_warp,
     types_equal,
 )
@@ -86,7 +87,7 @@ class BsrMatrix(Generic[_BlockType]):
     @property
     def block_size(self) -> int:
         """Size of the individual blocks, i.e. number of rows per block times number of columns per block."""
-        return type_length(self.values.dtype)
+        return type_size(self.values.dtype)
 
     @property
     def shape(self) -> Tuple[int, int]:
@@ -104,23 +105,15 @@ class BsrMatrix(Generic[_BlockType]):
         """Device on which ``offsets``, ``columns``, and ``values`` are allocated -- assumed to be the same for all three arrays."""
         return self.values.device
 
+    @property
+    def requires_grad(self) -> bool:
+        """Read-only property indicating whether the matrix participates in adjoint computations."""
+        return self.values.requires_grad
+
     @property
     def scalar_values(self) -> wp.array:
         """Accesses the ``values`` array as a 3d scalar array."""
-        if self.block_shape == (1, 1):
-            return self.values.reshape((self.nnz, 1, 1))
-
-        def _as_3d_array(arr):
-            return wp.array(
-                ptr=arr.ptr,
-                capacity=arr.capacity,
-                device=arr.device,
-                dtype=self.scalar_type,
-                shape=(self.nnz, *self.block_shape),
-                grad=None if arr.grad is None else _as_3d_array(arr.grad),
-            )
-
-        values_view = _as_3d_array(self.values)
+        values_view = _as_3d_array(self.values, self.block_shape)
         values_view._ref = self.values  # keep ref in case we're garbage collected
         return values_view
 
@@ -144,13 +137,14 @@ class BsrMatrix(Generic[_BlockType]):
         See also :meth:`copy_nnz_async`.
         """
 
-        if self._is_nnz_transfer_setup():
-            if self.device.is_cuda:
-                wp.synchronize_event(self._nnz_event)
-            self.nnz = int(self._nnz_buf.numpy()[0])
+        buf, event = self._nnz_transfer_if_any()
+        if buf is not None:
+            if event is not None:
+                wp.synchronize_event(event)
+            self.nnz = int(buf.numpy()[0])
         return self.nnz
 
-    def copy_nnz_async(self, known_nnz: int = None):
+    def copy_nnz_async(self) -> None:
         """
         Start the asynchronous transfer of the exact nnz from the device offsets array to host and records an event for completion.
@@ -158,37 +152,25 @@
 
         See also :meth:`nnz_sync`.
         """
-        if known_nnz is not None:
-            self.nnz = int(known_nnz)
-        else:
-            self._setup_nnz_transfer()
 
-        # If a transfer is already ongoing, or if the actual nnz is unknown, schedule a new transfer
-        if self._is_nnz_transfer_setup():
-            stream = wp.get_stream(self.device) if self.device.is_cuda else None
-            wp.copy(src=self.offsets, dest=self._nnz_buf, src_offset=self.nrow, count=1, stream=stream)
-            if self.device.is_cuda:
-                stream.record_event(self._nnz_event)
+        buf, event = self._setup_nnz_transfer()
+        stream = wp.get_stream(self.device) if self.device.is_cuda else None
+        wp.copy(src=self.offsets, dest=buf, src_offset=self.nrow, count=1, stream=stream)
+        if event is not None:
+            stream.record_event(event)
 
     def _setup_nnz_transfer(self):
-        if self._is_nnz_transfer_setup():
-            return
-
-        BsrMatrix.__setattr__(
-            self, "_nnz_buf", wp.empty(dtype=int, shape=(1,), device="cpu", pinned=self.device.is_cuda)
-        )
-        if self.device.is_cuda:
-            BsrMatrix.__setattr__(self, "_nnz_event", wp.Event(self.device))
-
-    def _is_nnz_transfer_setup(self):
-        return hasattr(self, "_nnz_buf")
+        buf, event = self._nnz_transfer_if_any()
+        if buf is not None or self.device.is_capturing:
+            return buf, event
 
-    def _nnz_transfer_buf_and_event(self):
-        self._setup_nnz_transfer()
+        buf = wp.empty(dtype=int, shape=(1,), device="cpu", pinned=self.device.is_cuda)
+        event = wp.Event(self.device) if self.device.is_cuda else None
+        BsrMatrix.__setattr__(self, "_nnz_transfer", (buf, event))
+        return buf, event
 
-        if not self.device.is_cuda:
-            return self._nnz_buf, ctypes.c_void_p(None)
-        return self._nnz_buf, self._nnz_event.cuda_event
+    def _nnz_transfer_if_any(self):
+        return getattr(self, "_nnz_transfer", (None, None))
 
     # Overloaded math operators
     def __add__(self, y):
@@ -303,7 +285,7 @@ def bsr_zeros(
 
     bsr.nrow = int(rows_of_blocks)
    bsr.ncol = int(cols_of_blocks)
-    bsr.nnz = int(0)
+    bsr.nnz = 0
     bsr.columns = wp.empty(shape=(0,), dtype=int, device=device)
     bsr.values = wp.empty(shape=(0,), dtype=block_type, device=device)
     bsr.offsets = wp.zeros(shape=(bsr.nrow + 1,), dtype=int, device=device)
@@ -311,7 +293,7 @@
     return bsr
 
 
-def _bsr_ensure_fits(bsr: BsrMatrix, nrow: int = None, nnz: int = None):
+def _bsr_ensure_fits(bsr: BsrMatrix, nrow: Optional[int] = None, nnz: Optional[int] = None) -> None:
     if nrow is None:
         nrow = bsr.nrow
     if nnz is None:
@@ -325,7 +307,9 @@ def _bsr_ensure_fits(bsr: BsrMatrix, nrow: int = None, nnz: int = None):
     if bsr.columns.size < nnz:
         bsr.columns = wp.empty(shape=(nnz,), dtype=int, device=bsr.columns.device)
     if bsr.values.size < nnz:
-        bsr.values = wp.empty(shape=(nnz,), dtype=bsr.values.dtype, device=bsr.values.device)
+        bsr.values = wp.empty(
+            shape=(nnz,), dtype=bsr.values.dtype, device=bsr.values.device, requires_grad=bsr.values.requires_grad
+        )
 
 
 def bsr_set_zero(
@@ -348,7 +332,64 @@ def bsr_set_zero(
 
     _bsr_ensure_fits(bsr, nnz=0)
     bsr.offsets.zero_()
-    bsr.copy_nnz_async(known_nnz=0)
+    bsr.copy_nnz_async()
+
+
+def _as_3d_array(arr, block_shape):
+    return wp.array(
+        ptr=arr.ptr,
+        capacity=arr.capacity,
+        device=arr.device,
+        dtype=type_scalar_type(arr.dtype),
+        shape=(arr.shape[0], *block_shape),
+        grad=None if arr.grad is None else _as_3d_array(arr.grad, block_shape),
+    )
+
+
+def _optional_ctypes_pointer(array: Optional[wp.array], ctype):
+    return None if array is None else ctypes.cast(array.ptr, ctypes.POINTER(ctype))
+
+
+def _optional_ctypes_event(event: Optional[wp.Event]):
+    return None if event is None else event.cuda_event
+
+
+_zero_value_masks = {
+    wp.float16: 0x7FFF,
+    wp.float32: 0x7FFFFFFF,
+    wp.float64: 0x7FFFFFFFFFFFFFFF,
+    wp.int8: 0xFF,
+    wp.int16: 0xFFFF,
+    wp.int32: 0xFFFFFFFF,
+    wp.int64: 0xFFFFFFFFFFFFFFFF,
+}
+
+
+@wp.kernel
+def _bsr_accumulate_triplet_values(
+    row_count: int,
+    tpl_summed_offsets: wp.array(dtype=int),
+    tpl_summed_indices: wp.array(dtype=int),
+    tpl_values: wp.array3d(dtype=Any),
+    bsr_offsets: wp.array(dtype=int),
+    bsr_values: wp.array3d(dtype=Any),
+):
+    block, i, j = wp.tid()
+
+    if block >= bsr_offsets[row_count]:
+        return
+
+    if block == 0:
+        beg = 0
+    else:
+        beg = tpl_summed_offsets[block - 1]
+    end = tpl_summed_offsets[block]
+
+    val = tpl_values[tpl_summed_indices[beg], i, j]
+    for k in range(beg + 1, end):
+        val += tpl_values[tpl_summed_indices[k], i, j]
+
+    bsr_values[block, i, j] = val
 
 
 def bsr_set_from_triplets(
@@ -356,6 +397,7 @@ def bsr_set_from_triplets(
     rows: "Array[int]",
     columns: "Array[int]",
     values: Optional["Array[Union[Scalar, BlockType[Rows, Cols, Scalar]]]"] = None,
+    count: Optional["Array[int]"] = None,
     prune_numerical_zeros: bool = True,
     masked: bool = False,
 ):
@@ -370,27 +412,50 @@ def bsr_set_from_triplets(
         values: Block values for each non-zero. Must be either a one-dimensional array with data type identical
             to the ``dest`` matrix's block type, or a 3d array with data type equal to the ``dest`` matrix's scalar type.
             If ``None``, the values array of the resulting matrix will be allocated but uninitialized.
+        count: Single-element array indicating the number of triplets. If ``None``, the number of triplets is determined from the shape of
+            ``rows`` and ``columns`` arrays.
         prune_numerical_zeros: If ``True``, will ignore the zero-valued blocks.
         masked: If ``True``, ignore blocks that are not existing non-zeros of ``dest``.
     """
 
     if rows.device != columns.device or rows.device != dest.device:
-        raise ValueError("All arguments must reside on the same device")
+        raise ValueError(
+            f"Rows and columns must reside on the destination matrix device, got {rows.device}, {columns.device} and {dest.device}"
+        )
 
     if rows.shape[0] != columns.shape[0]:
-        raise ValueError("All triplet arrays must have the same length")
+        raise ValueError(
+            f"Rows and columns arrays must have the same length, got {rows.shape[0]} and {columns.shape[0]}"
+        )
+
+    if rows.dtype != wp.int32 or columns.dtype != wp.int32:
+        raise TypeError("Rows and columns arrays must be of type int32")
+
+    if count is not None:
+        if count.device != rows.device:
+            raise ValueError(f"Count and rows must reside on the same device, got {count.device} and {rows.device}")
+
+        if count.shape != (1,):
+            raise ValueError(f"Count array must be a single-element array, got {count.shape}")
+
+        if count.dtype != wp.int32:
+            raise TypeError("Count array must be of type int32")
 
     # Accept either array1d(dtype) or contiguous array3d(scalar_type) as values
     if values is not None:
         if values.device != rows.device:
-            raise ValueError("All arguments must reside on the same device")
+            raise ValueError(f"Values and rows must reside on the same device, got {values.device} and {rows.device}")
 
         if values.shape[0] != rows.shape[0]:
-            raise ValueError("All triplet arrays must have the same length")
+            raise ValueError(
+                f"Values and rows arrays must have the same length, got {values.shape[0]} and {rows.shape[0]}"
+            )
 
         if values.ndim == 1:
-            if values.dtype != dest.values.dtype:
-                raise ValueError("Values array type must correspond to that of dest matrix")
+            if not types_equal(values.dtype, dest.values.dtype):
+                raise ValueError(
+                    f"Values array type must correspond to that of the dest matrix, got {type_repr(values.dtype)} and {type_repr(dest.values.dtype)}"
+                )
         elif values.ndim == 3:
             if values.shape[1:] != dest.block_shape:
                 raise ValueError(
@@ -398,12 +463,14 @@ def bsr_set_from_triplets(
                 )
 
             if type_scalar_type(values.dtype) != dest.scalar_type:
-                raise ValueError("Scalar type of values array should correspond to that of matrix")
-
-            if not values.is_contiguous:
-                raise ValueError("Multi-dimensional values array should be contiguous")
+                raise ValueError(
+                    f"Scalar type of values array ({type_repr(values.dtype)}) should correspond to that of matrix ({type_repr(dest.scalar_type)})"
+                )
        else:
-            raise ValueError("Number of dimension for values array should be 1 or 3")
+            raise ValueError(f"Number of dimension for values array should be 1 or 3, got {values.ndim}")
+
+        if prune_numerical_zeros and not values.is_contiguous:
+            raise ValueError("Values array should be contiguous for numerical zero pruning")
 
     nnz = rows.shape[0]
     if nnz == 0:
@@ -416,40 +483,54 @@ def bsr_set_from_triplets(
 
     device = dest.values.device
     scalar_type = dest.scalar_type
+    zero_value_mask = _zero_value_masks.get(scalar_type, 0)
+
+    # compute the BSR topology
+
    from warp.context import runtime
     if device.is_cpu:
-        if scalar_type == wp.float32:
-            native_func = runtime.core.bsr_matrix_from_triplets_float_host
-        elif scalar_type == wp.float64:
-            native_func = runtime.core.bsr_matrix_from_triplets_double_host
+        native_func = runtime.core.bsr_matrix_from_triplets_host
     else:
-        if scalar_type == wp.float32:
-            native_func = runtime.core.bsr_matrix_from_triplets_float_device
-        elif scalar_type == wp.float64:
-            native_func = runtime.core.bsr_matrix_from_triplets_double_device
+        native_func = runtime.core.bsr_matrix_from_triplets_device
 
-    if not native_func:
-        raise NotImplementedError(f"bsr_from_triplets not implemented for scalar type {scalar_type}")
-
-    nnz_buf, nnz_event = dest._nnz_transfer_buf_and_event()
+    nnz_buf, nnz_event = dest._setup_nnz_transfer()
+    summed_triplet_offsets = wp.empty(shape=(nnz,), dtype=wp.int32, device=device)
+    summed_triplet_indices = wp.empty(shape=(nnz,), dtype=wp.int32, device=device)
 
     with wp.ScopedDevice(device):
         native_func(
-            dest.block_shape[0],
-            dest.block_shape[1],
+            dest.block_size,
+            type_size_in_bytes(scalar_type),
             dest.nrow,
+            dest.ncol,
             nnz,
+            _optional_ctypes_pointer(count, ctype=ctypes.c_int32),
             ctypes.cast(rows.ptr, ctypes.POINTER(ctypes.c_int32)),
             ctypes.cast(columns.ptr, ctypes.POINTER(ctypes.c_int32)),
-            None if values is None else ctypes.cast(values.ptr, ctypes.c_void_p),
-            prune_numerical_zeros,
+            _optional_ctypes_pointer(values, ctype=ctypes.c_int32),
+            zero_value_mask,
             masked,
+            ctypes.cast(summed_triplet_offsets.ptr, ctypes.POINTER(ctypes.c_int32)),
+            ctypes.cast(summed_triplet_indices.ptr, ctypes.POINTER(ctypes.c_int32)),
             ctypes.cast(dest.offsets.ptr, ctypes.POINTER(ctypes.c_int32)),
            ctypes.cast(dest.columns.ptr, ctypes.POINTER(ctypes.c_int32)),
-            None if values is None else ctypes.cast(dest.values.ptr, ctypes.c_void_p),
-            ctypes.cast(nnz_buf.ptr, ctypes.POINTER(ctypes.c_int32)),
-            nnz_event,
+            _optional_ctypes_pointer(nnz_buf, ctype=ctypes.c_int32),
+            _optional_ctypes_event(nnz_event),
+        )
+
+        # now accumulate repeated blocks
+        wp.launch(
+            _bsr_accumulate_triplet_values,
+            dim=(nnz, *dest.block_shape),
+            inputs=[
+                dest.nrow,
+                summed_triplet_offsets,
+                summed_triplet_indices,
+                _as_3d_array(values, dest.block_shape),
+                dest.offsets,
+            ],
+            outputs=[dest.scalar_values],
        )
@@ -483,6 +564,7 @@ def bsr_from_triplets(
     A = bsr_zeros(
         rows_of_blocks=rows_of_blocks, cols_of_blocks=cols_of_blocks, block_type=block_type, device=values.device
     )
+    A.values.requires_grad = values.requires_grad
     bsr_set_from_triplets(A, rows, columns, values, prune_numerical_zeros=prune_numerical_zeros)
     return A
 
@@ -539,6 +621,10 @@ class _BsrScalingExpression(_BsrExpression):
     def dtype(self) -> type:
         return self.mat.dtype
 
+    @property
+    def requires_grad(self) -> bool:
+        return self.mat.requires_grad
+
     @property
     def device(self) -> wp.context.Device:
         return self.mat.device
@@ -721,10 +807,10 @@ def bsr_assign(
         src: Matrix to be copied.
         dest: Destination matrix. May have a different block shape or scalar type
             than ``src``, in which case the required casting will be performed.
-        structure_only: If ``True``, only the non-zeros indices are copied, and uninitialized value storage is allocated
+        structure_only: If ``True``, only the non-zero indices are copied, and uninitialized value storage is allocated
            to accommodate at least ``src.nnz`` blocks. If ``structure_only`` is ``False``, values are also copied with implicit
            casting if the two matrices use distinct scalar types.
-        masked: If ``True``, prevent the assignment operation from adding new non-zeros blocks to ``dest``.
+        masked: If ``True``, prevent the assignment operation from adding new non-zero blocks to ``dest``.
     """
 
     src, src_scale = _extract_matrix_and_scale(src)
@@ -741,7 +827,7 @@ def bsr_assign(
 
     if src_subrows * dest.block_shape[0] != src.block_shape[0] * dest_subrows:
         raise ValueError(
-            f"Incompatible dest and src block shapes; block rows must evenly divide one another (Got {src.block_shape[0]}, {dest.block_shape[0]})"
+            f"Incompatible dest and src block shapes; block rows must evenly divide one another (Got {dest.block_shape[0]}, {src.block_shape[0]})"
         )
 
     if src.block_shape[1] >= dest.block_shape[1]:
@@ -753,14 +839,16 @@ def bsr_assign(
 
     if src_subcols * dest.block_shape[1] != src.block_shape[1] * dest_subcols:
         raise ValueError(
-            f"Incompatible dest and src block shapes; block columns must evenly divide one another (Got {src.block_shape[1]}, {dest.block_shape[1]})"
+            f"Incompatible dest and src block shapes; block columns must evenly divide one another (Got {dest.block_shape[1]}, {src.block_shape[1]})"
        )
 
     dest_nrow = (src.nrow * src_subrows) // dest_subrows
     dest_ncol = (src.ncol * src_subcols) // dest_subcols
 
     if src.nrow * src_subrows != dest_nrow * dest_subrows or src.ncol * src_subcols != dest_ncol * dest_subcols:
-        raise ValueError("The requested block shape does not evenly divide the source matrix")
+        raise ValueError(
+            f"The requested block shape {dest.block_shape} does not evenly divide the source matrix of total size {src.shape}"
+        )
 
     nnz_alloc = src.nnz * src_subrows * src_subcols
     if masked:
@@ -813,27 +901,30 @@ def bsr_assign(
     from warp.context import runtime
 
     if dest.device.is_cpu:
-        native_func = runtime.core.bsr_matrix_from_triplets_float_host
+        native_func = runtime.core.bsr_matrix_from_triplets_host
     else:
-        native_func = runtime.core.bsr_matrix_from_triplets_float_device
+        native_func = runtime.core.bsr_matrix_from_triplets_device
 
-    nnz_buf, nnz_event = dest._nnz_transfer_buf_and_event()
+    nnz_buf, nnz_event = dest._setup_nnz_transfer()
     with wp.ScopedDevice(dest.device):
         native_func(
-            dest.block_shape[0],
-            dest.block_shape[1],
+            dest.block_size,
+            0,  # scalar_size_in_bytes
             dest.nrow,
+            dest.ncol,
             nnz_alloc,
+            None,  # device nnz
             ctypes.cast(dest_rows.ptr, ctypes.POINTER(ctypes.c_int32)),
             ctypes.cast(dest_cols.ptr, ctypes.POINTER(ctypes.c_int32)),
-            0,
-            False,
+            None,  # triplet values
+            0,  # zero_value_mask
            masked,
+            None,  # summed block offsets
+            None,  # summed block indices
            ctypes.cast(dest.offsets.ptr, ctypes.POINTER(ctypes.c_int32)),
            ctypes.cast(dest.columns.ptr, ctypes.POINTER(ctypes.c_int32)),
-            0,
-            ctypes.cast(nnz_buf.ptr, ctypes.POINTER(ctypes.c_int32)),
-            nnz_event,
+            _optional_ctypes_pointer(nnz_buf, ctype=ctypes.c_int32),
+            _optional_ctypes_event(nnz_event),
        )
 
     # merge block values
@@ -893,10 +984,28 @@ def bsr_copy(
         block_type=block_type,
         device=A.device,
     )
+    copy.values.requires_grad = A.requires_grad
     bsr_assign(dest=copy, src=A, structure_only=structure_only)
     return copy
 
 
+@wp.kernel
+def _bsr_transpose_values(
+    col_count: int,
+    scale: Any,
+    bsr_values: wp.array3d(dtype=Any),
+    block_index_map: wp.array(dtype=int),
+    transposed_bsr_offsets: wp.array(dtype=int),
+    transposed_bsr_values: wp.array3d(dtype=Any),
+):
+    block, i, j = wp.tid()
+
+    if block >= transposed_bsr_offsets[col_count]:
+        return
+
+    transposed_bsr_values[block, i, j] = bsr_values[block_index_map[block], j, i] * scale
+
+
 def bsr_set_transpose(
     dest: BsrMatrix[BlockType[Cols, Rows, Scalar]],
     src: BsrMatrixOrExpression[BlockType[Rows, Cols, Scalar]],
@@ -906,15 +1015,17 @@ def bsr_set_transpose(
     src, src_scale = _extract_matrix_and_scale(src)
 
     if dest.values.device != src.values.device:
-        raise ValueError("All arguments must reside on the same device")
+        raise ValueError(
+            f"All arguments must reside on the same device, got {dest.values.device} and {src.values.device}"
+        )
 
     if dest.scalar_type != src.scalar_type:
-        raise ValueError("All arguments must have the same scalar type")
+        raise ValueError(f"All arguments must have the same scalar type, got {dest.scalar_type} and {src.scalar_type}")
 
     transpose_block_shape = src.block_shape[::-1]
 
     if dest.block_shape != transpose_block_shape:
-        raise ValueError(f"Destination block shape must be {transpose_block_shape}")
+        raise ValueError(f"Destination block shape must be {transpose_block_shape}, got {dest.block_shape}")
 
     nnz = src.nnz
     dest.nrow = src.ncol
@@ -930,36 +1041,33 @@ def bsr_set_transpose(
     from warp.context import runtime
 
     if dest.values.device.is_cpu:
-        if dest.scalar_type == wp.float32:
-            native_func = runtime.core.bsr_transpose_float_host
-        elif dest.scalar_type == wp.float64:
-            native_func = runtime.core.bsr_transpose_double_host
+        native_func = runtime.core.bsr_transpose_host
     else:
-        if dest.scalar_type == wp.float32:
-            native_func = runtime.core.bsr_transpose_float_device
-        elif dest.scalar_type == wp.float64:
-            native_func = runtime.core.bsr_transpose_double_device
+        native_func = runtime.core.bsr_transpose_device
 
-    if not native_func:
-        raise NotImplementedError(f"bsr_set_transpose not implemented for scalar type {dest.scalar_type}")
+    block_index_map = wp.empty(shape=2 * nnz, dtype=int, device=src.device)
 
     with wp.ScopedDevice(dest.device):
         native_func(
-            src.block_shape[0],
-            src.block_shape[1],
            src.nrow,
            src.ncol,
            nnz,
            ctypes.cast(src.offsets.ptr, ctypes.POINTER(ctypes.c_int32)),
            ctypes.cast(src.columns.ptr, ctypes.POINTER(ctypes.c_int32)),
-            ctypes.cast(src.values.ptr, ctypes.c_void_p),
            ctypes.cast(dest.offsets.ptr, ctypes.POINTER(ctypes.c_int32)),
            ctypes.cast(dest.columns.ptr, ctypes.POINTER(ctypes.c_int32)),
-            ctypes.cast(dest.values.ptr, ctypes.c_void_p),
+            ctypes.cast(block_index_map.ptr, ctypes.POINTER(ctypes.c_int32)),
        )
 
-        dest.copy_nnz_async()
-        bsr_scale(dest, src_scale)
+    dest.copy_nnz_async()
+
+    wp.launch(
+        _bsr_transpose_values,
+        dim=(nnz, *dest.block_shape),
+        device=dest.device,
+        inputs=[src.ncol, dest.scalar_type(src_scale), src.scalar_values, block_index_map, dest.offsets],
+        outputs=[dest.scalar_values],
+    )
 
 
 def bsr_transposed(A: BsrMatrixOrExpression) -> BsrMatrix:
@@ -976,6 +1084,7 @@ def bsr_transposed(A: BsrMatrixOrExpression) -> BsrMatrix:
         block_type=block_type,
         device=A.device,
     )
+    transposed.values.requires_grad = A.requires_grad
     bsr_set_transpose(dest=transposed, src=A)
     return transposed
 
@@ -1010,12 +1119,12 @@ def bsr_get_diag(A: BsrMatrixOrExpression[BlockType], out: "Optional[Array[Block
     if out is None:
         out = wp.zeros(shape=(dim,), dtype=A.values.dtype, device=A.values.device)
     else:
-        if out.dtype != A.values.dtype:
-            raise ValueError(f"Output array must have type {A.values.dtype}")
+        if not types_equal(out.dtype, A.values.dtype):
+            raise ValueError(f"Output array must have type {A.values.dtype}, got {out.dtype}")
         if out.device != A.values.device:
-            raise ValueError(f"Output array must reside on device {A.values.device}")
+            raise ValueError(f"Output array must reside on device {A.values.device}, got {out.device}")
         if out.shape[0] < dim:
-            raise ValueError(f"Output array must be of length at least {dim}")
+            raise ValueError(f"Output array must be of length at least {dim}, got {out.shape[0]}")
 
     wp.launch(
         kernel=_bsr_get_diag_kernel,
@@ -1095,7 +1204,7 @@ def bsr_set_diag(
     elif diag is not None:
         A.values.fill_(diag)
 
-    A.copy_nnz_async(known_nnz=nnz)
+    A.copy_nnz_async()
 
 
 def bsr_diag(
@@ -1151,6 +1260,8 @@ def bsr_diag(
         block_type = wp.mat(shape=diag.shape, dtype=diag.dtype)
 
     A = bsr_zeros(rows_of_blocks, cols_of_blocks, block_type=block_type, device=device)
+    if is_array(diag):
+        A.values.requires_grad = diag.requires_grad
     bsr_set_diag(A, diag)
     return A
 
@@ -1292,8 +1403,8 @@ def bsr_axpy(
     The ``x`` and ``y`` matrices are allowed to alias.
 
     Args:
-        x: Read-only right-hand-side.
-        y: Mutable left-hand-side. If ``y`` is not provided, it will be allocated and treated as zero.
+        x: Read-only first operand.
+        y: Mutable second operand and output matrix. If ``y`` is not provided, it will be allocated and treated as zero.
         alpha: Uniform scaling factor for ``x``.
         beta: Uniform scaling factor for ``y``.
         masked: If ``True``, discard all blocks from ``x`` which are not
@@ -1312,6 +1423,7 @@ def bsr_axpy(
 
         # If not output matrix is provided, allocate it for convenience
         y = bsr_zeros(x.nrow, x.ncol, block_type=x.values.dtype, device=x.values.device)
+        y.values.requires_grad = x.requires_grad
         beta = 0.0
 
     x_nnz = x.nnz
@@ -1337,13 +1449,17 @@ def bsr_axpy(
     # General case
 
     if x.values.device != y.values.device:
-        raise ValueError("All arguments must reside on the same device")
+        raise ValueError(f"All arguments must reside on the same device, got {x.values.device} and {y.values.device}")
 
     if x.scalar_type != y.scalar_type or x.block_shape != y.block_shape:
-        raise ValueError("Matrices must have the same block type")
+        raise ValueError(
+            f"Matrices must have the same block type, got ({x.block_shape}, {x.scalar_type}) and ({y.block_shape}, {y.scalar_type})"
+        )
 
     if x.nrow != y.nrow or x.ncol != y.ncol:
-        raise ValueError("Matrices must have the same number of rows and columns")
+        raise ValueError(
+            f"Matrices must have the same number of rows and columns, got ({x.nrow}, {x.ncol}) and ({y.nrow}, {y.ncol})"
+        )
 
     if work_arrays is None:
         work_arrays = bsr_axpy_work_arrays()
@@ -1368,29 +1484,32 @@ def bsr_axpy(
     from warp.context import runtime
 
     if device.is_cpu:
-        native_func = runtime.core.bsr_matrix_from_triplets_float_host
+        native_func = runtime.core.bsr_matrix_from_triplets_host
     else:
-        native_func = runtime.core.bsr_matrix_from_triplets_float_device
+        native_func = runtime.core.bsr_matrix_from_triplets_device
 
     old_y_nnz = y_nnz
-    nnz_buf, nnz_event = y._nnz_transfer_buf_and_event()
+    nnz_buf, nnz_event = y._setup_nnz_transfer()
 
     with wp.ScopedDevice(y.device):
         native_func(
-            y.block_shape[0],
-            y.block_shape[1],
+            y.block_size,
+            0,  # scalar_size_in_bytes
            y.nrow,
+            y.ncol,
            sum_nnz,
+            None,  # device nnz
            ctypes.cast(work_arrays._sum_rows.ptr, ctypes.POINTER(ctypes.c_int32)),
            ctypes.cast(work_arrays._sum_cols.ptr, ctypes.POINTER(ctypes.c_int32)),
-            0,
-            False,
+            None,  # triplet values
+            0,  # zero_value_mask
            masked,
+            None,  # summed block offsets
+            None,  # summed block indices
            ctypes.cast(y.offsets.ptr, ctypes.POINTER(ctypes.c_int32)),
            ctypes.cast(y.columns.ptr, ctypes.POINTER(ctypes.c_int32)),
-            0,
-            ctypes.cast(nnz_buf.ptr, ctypes.POINTER(ctypes.c_int32)),
-            nnz_event,
+            _optional_ctypes_pointer(nnz_buf, ctype=ctypes.c_int32),
+            _optional_ctypes_event(nnz_event),
        )
 
     y.values.zero_()
@@ -1617,9 +1736,9 @@ def bsr_mm(
     If the matrix ``z`` is not provided as input, it will be allocated and treated as zero.
 
     Args:
-        x: Read-only left factor of the matrix-matrix product.
-        y: Read-only right factor of the matrix-matrix product.
-        z: Mutable left-hand-side. If ``z`` is not provided, it will be allocated and treated as zero.
+        x: Read-only left operand of the matrix-matrix product.
+        y: Read-only right operand of the matrix-matrix product.
+        z: Mutable affine operand and result matrix. If ``z`` is not provided, it will be allocated and treated as zero.
        alpha: Uniform scaling factor for the ``x @ y`` product
        beta: Uniform scaling factor for ``z``
        masked: If ``True``, ignore all blocks from ``x @ y`` which are not existing non-zeros of ``y``
@@ -1649,23 +1768,32 @@ def bsr_mm(
     else:
         z_block_type = wp.mat(shape=z_block_shape, dtype=x.scalar_type)
         z = bsr_zeros(x.nrow, y.ncol, block_type=z_block_type, device=x.values.device)
+        z.values.requires_grad = x.requires_grad or y.requires_grad
        beta = 0.0
 
     if x.values.device != y.values.device or x.values.device != z.values.device:
-        raise ValueError("All arguments must reside on the same device")
+        raise ValueError(
+            f"All arguments must reside on the same device, got {x.values.device}, {y.values.device} and {z.values.device}"
+        )
 
     if x.scalar_type != y.scalar_type or x.scalar_type != z.scalar_type:
-        raise ValueError("Matrices must have the same scalar type")
+        raise ValueError(
+            f"Matrices must have the same scalar type, got {x.scalar_type}, {y.scalar_type} and {z.scalar_type}"
+        )
 
     if (
         x.block_shape[0] != z.block_shape[0]
         or y.block_shape[1] != z.block_shape[1]
        or x.block_shape[1] != y.block_shape[0]
     ):
-        raise ValueError("Incompatible block sizes for matrix multiplication")
+        raise ValueError(
+            f"Incompatible block sizes for matrix multiplication, got ({x.block_shape}, {y.block_shape}) and ({z.block_shape})"
+        )
 
     if x.nrow != z.nrow or z.ncol != y.ncol or x.ncol != y.nrow:
-        raise ValueError("Incompatible number of rows/columns for matrix multiplication")
+        raise ValueError(
+            f"Incompatible number of rows/columns for matrix multiplication, got ({x.nrow}, {x.ncol}) and ({y.nrow}, {y.ncol})"
+        )
 
     device = z.values.device
 
@@ -1696,7 +1824,9 @@ def bsr_mm(
         mm_nnz = work_arrays._mm_nnz
     else:
         if device.is_capturing:
-            raise RuntimeError("`bsr_mm` requires `reuse_topology=True` for use in graph capture")
+            raise RuntimeError(
+                "`bsr_mm` requires either `reuse_topology=True` or `masked=True` for use in graph capture"
+            )
 
         if work_arrays is None:
             work_arrays = bsr_mm_work_arrays()
@@ -1725,7 +1855,7 @@ def bsr_mm(
 
         # Get back total counts on host -- we need a synchronization here
         # Use pinned buffer from z, we are going to need it later anyway
-        nnz_buf, _ = z._nnz_transfer_buf_and_event()
+        nnz_buf, _ = z._setup_nnz_transfer()
        stream = wp.get_stream(device) if device.is_cuda else None
        wp.copy(dest=nnz_buf, src=work_arrays._mm_block_counts, src_offset=x.nnz, count=1, stream=stream)
        if device.is_cuda:
@@ -1782,28 +1912,31 @@ def bsr_mm(
     from warp.context import runtime
 
     if device.is_cpu:
-        native_func = runtime.core.bsr_matrix_from_triplets_float_host
+        native_func = runtime.core.bsr_matrix_from_triplets_host
     else:
-        native_func = runtime.core.bsr_matrix_from_triplets_float_device
+        native_func = runtime.core.bsr_matrix_from_triplets_device
 
-    nnz_buf, nnz_event = z._nnz_transfer_buf_and_event()
+    nnz_buf, nnz_event = z._setup_nnz_transfer()
 
     with wp.ScopedDevice(z.device):
         native_func(
-            z.block_shape[0],
-            z.block_shape[1],
+            z.block_size,
+            0,  # scalar_size_in_bytes
            z.nrow,
+            z.ncol,
            mm_nnz,
+            None,  # device nnz
            ctypes.cast(work_arrays._mm_rows.ptr, ctypes.POINTER(ctypes.c_int32)),
            ctypes.cast(work_arrays._mm_cols.ptr, ctypes.POINTER(ctypes.c_int32)),
-            0,
-            False,
-            masked,
+            None,  # triplet values
+            0,  # zero_value_mask
+            False,  # masked_topology
+            None,  # summed block offsets
+            None,  # summed block indices
            ctypes.cast(z.offsets.ptr, ctypes.POINTER(ctypes.c_int32)),
            ctypes.cast(z.columns.ptr, ctypes.POINTER(ctypes.c_int32)),
-            0,
-            ctypes.cast(nnz_buf.ptr, ctypes.POINTER(ctypes.c_int32)),
-            nnz_event,
+            _optional_ctypes_pointer(nnz_buf, ctype=ctypes.c_int32),
+            _optional_ctypes_event(nnz_event),
        )
 
     # Resize z to fit mm result if necessary
@@ -1912,7 +2045,7 @@ def _bsr_mv_transpose_kernel(
 def _vec_array_view(array: wp.array, dtype: type, expected_scalar_count: int) -> wp.array:
     # cast a 1d or 2d array to a 1d array with the target dtype, adjusting shape as required
 
-    scalar_count = array.size * type_length(array.dtype)
+    scalar_count = array.size * type_size(array.dtype)
     if scalar_count != expected_scalar_count:
         raise ValueError(f"Invalid array scalar size, expected {expected_scalar_count}, got {scalar_count}")
 
@@ -1920,15 +2053,15 @@ def _vec_array_view(array: wp.array, dtype: type, expected_scalar_count: int) ->
         return array
 
     if type_scalar_type(array.dtype) != type_scalar_type(dtype):
-        raise ValueError(f"Incompatible scalar types, {type_repr(array.dtype)} vs {type_repr(dtype)}")
+        raise ValueError(f"Incompatible scalar types, expected {type_repr(array.dtype)}, got {type_repr(dtype)}")
 
     if array.ndim > 2:
-        raise ValueError(f"Incompatible array number of dimensions {array.ndim}")
+        raise ValueError(f"Incompatible array number of dimensions, expected 1 or 2, got {array.ndim}")
 
     if not array.is_contiguous:
         raise ValueError("Array must be contiguous")
 
-    vec_length = type_length(dtype)
+    vec_length = type_size(dtype)
     vec_count = scalar_count // vec_length
     if vec_count * vec_length != scalar_count:
         raise ValueError(
@@ -1965,9 +2098,9 @@ def bsr_mv(
     The ``x`` and ``y`` vectors are allowed to alias.
 
     Args:
-        A: Read-only, left matrix factor of the matrix-vector product.
-        x: Read-only, right vector factor of the matrix-vector product.
-        y: Mutable left-hand-side. If ``y`` is not provided, it will be allocated and treated as zero.
+        A: Read-only, left matrix operand of the matrix-vector product.
+        x: Read-only, right vector operand of the matrix-vector product.
+        y: Mutable affine operand and result vector. If ``y`` is not provided, it will be allocated and treated as zero.
        alpha: Uniform scaling factor for ``x``. If zero, ``x`` will not be read and may be left uninitialized.
        beta: Uniform scaling factor for ``y``. If zero, ``y`` will not be read and may be left uninitialized.
        transpose: If ``True``, use the transpose of the matrix ``A``. In this case the result is **non-deterministic**.
@@ -1990,23 +2123,27 @@ def bsr_mv(
         # If no output array is provided, allocate one for convenience
         y_vec_len = block_shape[0]
         y_dtype = A.scalar_type if y_vec_len == 1 else wp.vec(length=y_vec_len, dtype=A.scalar_type)
-        y = wp.empty(shape=(nrow,), device=A.values.device, dtype=y_dtype)
+        y = wp.empty(shape=(nrow,), device=A.values.device, dtype=y_dtype, requires_grad=x.requires_grad)
        beta = 0.0
 
     alpha = A.scalar_type(alpha)
     beta = A.scalar_type(beta)
 
     if A.values.device != x.device or A.values.device != y.device:
-        raise ValueError("A, x, and y must reside on the same device")
+        raise ValueError(
+            f"A, x, and y must reside on the same device, got {A.values.device}, {x.device} and {y.device}"
+        )
 
     if x.ptr == y.ptr:
         # Aliasing case, need temporary storage
         if work_buffer is None:
             work_buffer = wp.empty_like(y)
         elif work_buffer.size < y.size:
-            raise ValueError(f"Work buffer size is insufficient, needs to be at least {y.size}")
+            raise ValueError(f"Work buffer size is insufficient, needs to be at least {y.size}, got {work_buffer.size}")
        elif not types_equal(work_buffer.dtype, y.dtype):
-            raise ValueError(f"Work buffer must have same data type as y, {type_repr(y.dtype)}")
+            raise ValueError(
+                f"Work buffer must have same data type as y, {type_repr(y.dtype)} vs {type_repr(work_buffer.dtype)}"
+            )
 
        # Save old y values before overwriting vector
        wp.copy(dest=work_buffer, src=y, count=y.size)
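
For completeness, a short sketch of the reworked nnz bookkeeping visible in the hunks above: `copy_nnz_async()` has lost its `known_nnz` argument, the internal `_nnz_buf`/`_nnz_event` attribute pair is folded into a single `_nnz_transfer` tuple, and `nnz_sync()` still waits on the recorded event before refreshing the host-side count. Assuming the default device and throwaway data:

    import warp as wp
    from warp.sparse import bsr_set_from_triplets, bsr_zeros

    wp.init()

    A = bsr_zeros(128, 128, block_type=wp.mat33)
    rows = wp.zeros(16, dtype=wp.int32)
    cols = wp.zeros(16, dtype=wp.int32)
    vals = wp.zeros(16, dtype=wp.mat33)

    # schedules the asynchronous device-to-host copy of the exact block count
    bsr_set_from_triplets(A, rows, cols, vals)

    # nnz_sync() synchronizes on that transfer before returning; zero-valued
    # blocks are pruned by default, so this example prints 0
    print(A.nnz_sync())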