warp-lang 1.8.1-py3-none-win_amd64.whl → 1.9.0-py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warp/__init__.py +282 -103
- warp/__init__.pyi +482 -110
- warp/bin/warp-clang.dll +0 -0
- warp/bin/warp.dll +0 -0
- warp/build.py +93 -30
- warp/build_dll.py +47 -67
- warp/builtins.py +955 -137
- warp/codegen.py +312 -206
- warp/config.py +1 -1
- warp/context.py +1249 -784
- warp/examples/core/example_marching_cubes.py +1 -0
- warp/examples/core/example_render_opengl.py +100 -3
- warp/examples/fem/example_apic_fluid.py +98 -52
- warp/examples/fem/example_convection_diffusion_dg.py +25 -4
- warp/examples/fem/example_diffusion_mgpu.py +8 -3
- warp/examples/fem/utils.py +68 -22
- warp/fabric.py +1 -1
- warp/fem/cache.py +27 -19
- warp/fem/domain.py +2 -2
- warp/fem/field/nodal_field.py +2 -2
- warp/fem/field/virtual.py +264 -166
- warp/fem/geometry/geometry.py +5 -5
- warp/fem/integrate.py +129 -51
- warp/fem/space/restriction.py +4 -0
- warp/fem/space/shape/tet_shape_function.py +3 -10
- warp/jax_experimental/custom_call.py +1 -1
- warp/jax_experimental/ffi.py +2 -1
- warp/marching_cubes.py +708 -0
- warp/native/array.h +99 -4
- warp/native/builtin.h +82 -5
- warp/native/bvh.cpp +64 -28
- warp/native/bvh.cu +58 -58
- warp/native/bvh.h +2 -2
- warp/native/clang/clang.cpp +7 -7
- warp/native/coloring.cpp +8 -2
- warp/native/crt.cpp +2 -2
- warp/native/crt.h +3 -5
- warp/native/cuda_util.cpp +41 -10
- warp/native/cuda_util.h +10 -4
- warp/native/exports.h +1842 -1908
- warp/native/fabric.h +2 -1
- warp/native/hashgrid.cpp +37 -37
- warp/native/hashgrid.cu +2 -2
- warp/native/initializer_array.h +1 -1
- warp/native/intersect.h +2 -2
- warp/native/mat.h +1910 -116
- warp/native/mathdx.cpp +43 -43
- warp/native/mesh.cpp +24 -24
- warp/native/mesh.cu +26 -26
- warp/native/mesh.h +4 -2
- warp/native/nanovdb/GridHandle.h +179 -12
- warp/native/nanovdb/HostBuffer.h +8 -7
- warp/native/nanovdb/NanoVDB.h +517 -895
- warp/native/nanovdb/NodeManager.h +323 -0
- warp/native/nanovdb/PNanoVDB.h +2 -2
- warp/native/quat.h +331 -14
- warp/native/range.h +7 -1
- warp/native/reduce.cpp +10 -10
- warp/native/reduce.cu +13 -14
- warp/native/runlength_encode.cpp +2 -2
- warp/native/runlength_encode.cu +5 -5
- warp/native/scan.cpp +3 -3
- warp/native/scan.cu +4 -4
- warp/native/sort.cpp +10 -10
- warp/native/sort.cu +22 -22
- warp/native/sparse.cpp +8 -8
- warp/native/sparse.cu +13 -13
- warp/native/spatial.h +366 -17
- warp/native/temp_buffer.h +2 -2
- warp/native/tile.h +283 -69
- warp/native/vec.h +381 -14
- warp/native/volume.cpp +54 -54
- warp/native/volume.cu +1 -1
- warp/native/volume.h +2 -1
- warp/native/volume_builder.cu +30 -37
- warp/native/warp.cpp +150 -149
- warp/native/warp.cu +323 -192
- warp/native/warp.h +227 -226
- warp/optim/linear.py +736 -271
- warp/render/imgui_manager.py +289 -0
- warp/render/render_opengl.py +85 -6
- warp/sim/graph_coloring.py +2 -2
- warp/sparse.py +558 -175
- warp/tests/aux_test_module_aot.py +7 -0
- warp/tests/cuda/test_async.py +3 -3
- warp/tests/cuda/test_conditional_captures.py +101 -0
- warp/tests/geometry/test_marching_cubes.py +233 -12
- warp/tests/sim/test_coloring.py +6 -6
- warp/tests/test_array.py +56 -5
- warp/tests/test_codegen.py +3 -2
- warp/tests/test_context.py +8 -15
- warp/tests/test_enum.py +136 -0
- warp/tests/test_examples.py +2 -2
- warp/tests/test_fem.py +45 -2
- warp/tests/test_fixedarray.py +229 -0
- warp/tests/test_func.py +18 -15
- warp/tests/test_future_annotations.py +7 -5
- warp/tests/test_linear_solvers.py +30 -0
- warp/tests/test_map.py +1 -1
- warp/tests/test_mat.py +1518 -378
- warp/tests/test_mat_assign_copy.py +178 -0
- warp/tests/test_mat_constructors.py +574 -0
- warp/tests/test_module_aot.py +287 -0
- warp/tests/test_print.py +69 -0
- warp/tests/test_quat.py +140 -34
- warp/tests/test_quat_assign_copy.py +145 -0
- warp/tests/test_reload.py +2 -1
- warp/tests/test_sparse.py +71 -0
- warp/tests/test_spatial.py +140 -34
- warp/tests/test_spatial_assign_copy.py +160 -0
- warp/tests/test_struct.py +43 -3
- warp/tests/test_types.py +0 -20
- warp/tests/test_vec.py +179 -34
- warp/tests/test_vec_assign_copy.py +143 -0
- warp/tests/tile/test_tile.py +184 -18
- warp/tests/tile/test_tile_cholesky.py +605 -0
- warp/tests/tile/test_tile_load.py +169 -0
- warp/tests/tile/test_tile_mathdx.py +2 -558
- warp/tests/tile/test_tile_matmul.py +1 -1
- warp/tests/tile/test_tile_mlp.py +1 -1
- warp/tests/tile/test_tile_shared_memory.py +5 -5
- warp/tests/unittest_suites.py +6 -0
- warp/tests/walkthrough_debug.py +1 -1
- warp/thirdparty/unittest_parallel.py +108 -9
- warp/types.py +554 -264
- warp/utils.py +68 -86
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/RECORD +131 -121
- warp/native/marching.cpp +0 -19
- warp/native/marching.cu +0 -514
- warp/native/marching.h +0 -19
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
warp/builtins.py
CHANGED
@@ -17,10 +17,12 @@ from __future__ import annotations
 
 import builtins
 import functools
+import math
 from typing import Any, Callable, Mapping, Sequence
 
 import warp.build
 import warp.context
+import warp.utils
 from warp.codegen import Reference, Var, get_arg_value, strip_reference
 from warp.types import *
 
@@ -2355,6 +2357,7 @@ def tile_load_tuple_value_func(arg_types: Mapping[str, type], arg_values: Mappin
 def tile_load_tuple_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
     a = args["a"]
     shape = extract_tuple(args["shape"], as_constant=True)
+    bounds_check = args["bounds_check"]
 
     if None in shape:
         raise ValueError("Tile functions require shape to be a compile time constant.")
@@ -2365,17 +2368,23 @@ def tile_load_tuple_dispatch_func(input_types: Mapping[str, type], return_type:
         offset = (0,) * a.type.ndim
 
     func_args = (a, *offset)
-    template_args = shape
+    template_args = (return_type.dtype, bounds_check.constant, *shape)
 
     return (func_args, template_args)
 
 
 add_builtin(
     "tile_load",
-    input_types={"a": array(dtype=Any), "shape": Tuple[int, ...], "offset": Tuple[int, ...], "storage": str},
+    input_types={
+        "a": array(dtype=Any),
+        "shape": Tuple[int, ...],
+        "offset": Tuple[int, ...],
+        "storage": str,
+        "bounds_check": builtins.bool,
+    },
     value_func=tile_load_tuple_value_func,
     dispatch_func=tile_load_tuple_dispatch_func,
-    defaults={"offset": None, "storage": "register"},
+    defaults={"offset": None, "storage": "register", "bounds_check": True},
     variadic=False,
     doc="""Loads a tile from a global memory array.
 
@@ -2386,6 +2395,7 @@ add_builtin(
     :param offset: Offset in the source array to begin reading from (optional)
     :param storage: The storage location for the tile: ``"register"`` for registers
         (default) or ``"shared"`` for shared memory.
+    :param bounds_check: Needed for unaligned tiles, but can disable for memory-aligned tiles for faster load times
     :returns: A tile with shape as specified and data type the same as the source array""",
     group="Tile Primitives",
     export=False,
@@ -2394,16 +2404,160 @@ add_builtin(
 # overload for scalar shape
 add_builtin(
     "tile_load",
-    input_types={"a": array(dtype=Any), "shape": int, "offset": int, "storage": str},
+    input_types={"a": array(dtype=Any), "shape": int, "offset": int, "storage": str, "bounds_check": builtins.bool},
     value_func=tile_load_tuple_value_func,
     dispatch_func=tile_load_tuple_dispatch_func,
-    defaults={"offset": None, "storage": "register"},
+    defaults={"offset": None, "storage": "register", "bounds_check": True},
     group="Tile Primitives",
     hidden=True,
     export=False,
 )
 
 
+def tile_load_indexed_tuple_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    if arg_types is None:
+        return tile(dtype=Any, shape=Tuple[int, ...])
+
+    a = arg_types["a"]
+
+    indices_tile = arg_types["indices"]
+    indices_tile.storage = "shared"  # force to shared
+
+    axis = arg_values["axis"]
+    if axis >= a.ndim:
+        raise ValueError(f"tile_load_indexed() axis argument must be valid axis of array {a}, got {axis}.")
+
+    indices_tile_dim = len(indices_tile.shape)
+    if indices_tile_dim != 1:
+        raise ValueError(
+            f"tile_load_indexed() indices argument must be a 1D tile, got {indices_tile_dim} dimensions instead."
+        )
+
+    shape = extract_tuple(arg_values["shape"], as_constant=True)
+
+    if None in shape:
+        raise ValueError("Tile functions require shape to be a compile time constant.")
+
+    num_indices = indices_tile.shape[0]
+    if num_indices != shape[axis]:
+        raise ValueError(
+            "The number of elements in the 1D indices tile must match the output tile shape along the specified axis."
+        )
+
+    if "offset" in arg_values:
+        offset = extract_tuple(arg_values["offset"])
+    else:
+        offset = (0,) * a.ndim
+
+    if a.ndim != len(shape):
+        raise ValueError(
+            f"tile_load_indexed() array argument must have same number of dimensions as the tile shape, trying to perform an {len(shape)} dimensional load from an array with {a.ndim} dimensions."
+        )
+
+    if a.ndim != len(offset):
+        raise ValueError(
+            f"tile_load_indexed() offset argument must have the same number of dimensions as the array to load from, got {len(offset)} indices for an array with {a.ndim} dimensions"
+        )
+
+    if arg_values["storage"] not in {"shared", "register"}:
+        raise ValueError(f"Invalid value for 'storage': {arg_values['storage']!r}. Expected 'shared' or 'register'.")
+
+    return tile(dtype=a.dtype, shape=shape, storage=arg_values["storage"])
+
+
+def tile_load_indexed_tuple_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    a = args["a"]
+    indices_tile = args["indices"]
+    axis = args["axis"]
+
+    shape = extract_tuple(args["shape"], as_constant=True)
+
+    if None in shape:
+        raise ValueError("Tile functions require shape to be a compile time constant.")
+
+    if "offset" in args:
+        offset = extract_tuple(args["offset"])
+    else:
+        offset = (0,) * a.type.ndim
+
+    func_args = (a, indices_tile, axis, *offset)
+    template_args = shape
+
+    return (func_args, template_args)
+
+
+add_builtin(
+    "tile_load_indexed",
+    input_types={
+        "a": array(dtype=Any),
+        "indices": tile(dtype=int, shape=Tuple[int]),
+        "shape": Tuple[int, ...],
+        "offset": Tuple[int, ...],
+        "axis": int,
+        "storage": str,
+    },
+    value_func=tile_load_indexed_tuple_value_func,
+    dispatch_func=tile_load_indexed_tuple_dispatch_func,
+    defaults={"offset": None, "axis": 0, "storage": "register"},
+    variadic=False,
+    doc="""Loads a tile from a global memory array, with loads along a specified axis mapped according to a 1D tile of indices.
+
+    :param a: The source array in global memory
+    :param indices: A 1D tile of integer indices mapping to elements in ``a``.
+    :param shape: Shape of the tile to load, must have the same number of dimensions as ``a``, and along ``axis``, it must have the same number of elements as the ``indices`` tile.
+    :param offset: Offset in the source array to begin reading from (optional)
+    :param axis: Axis of ``a`` that indices refer to
+    :param storage: The storage location for the tile: ``"register"`` for registers (default) or ``"shared"`` for shared memory.
+    :returns: A tile with shape as specified and data type the same as the source array
+
+    This example shows how to select and store the even indexed rows from a 2D array:
+
+    .. code-block:: python
+
+        TILE_M = wp.constant(2)
+        TILE_N = wp.constant(2)
+        HALF_M = wp.constant(TILE_M // 2)
+        HALF_N = wp.constant(TILE_N // 2)
+
+        @wp.kernel
+        def compute(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
+            i, j = wp.tid()
+
+            evens = wp.tile_arange(HALF_M, dtype=int, storage="shared") * 2
+
+            t0 = wp.tile_load_indexed(x, indices=evens, shape=(HALF_M, TILE_N), offset=(i*TILE_M, j*TILE_N), axis=0, storage="register")
+            wp.tile_store(y, t0, offset=(i*HALF_M, j*TILE_N))
+
+        M = TILE_M * 2
+        N = TILE_N * 2
+
+        arr = np.arange(M * N).reshape(M, N)
+
+        x = wp.array(arr, dtype=float)
+        y = wp.zeros((M // 2, N), dtype=float)
+
+        wp.launch_tiled(compute, dim=[2,2], inputs=[x], outputs=[y], block_dim=32, device=device)
+
+        print(x.numpy())
+        print(y.numpy())
+
+    Prints:
+
+    .. code-block:: text
+
+        [[ 0.  1.  2.  3.]
+         [ 4.  5.  6.  7.]
+         [ 8.  9. 10. 11.]
+         [12. 13. 14. 15.]]
+
+        [[ 0.  1.  2.  3.]
+         [ 8.  9. 10. 11.]]
+    """,
+    group="Tile Primitives",
+    export=False,
+)
+
+
 def tile_store_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
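The `tile_load_indexed` builtin added above is, in effect, a cooperative gather along one axis: tile slices along ``axis`` are read from the source positions selected by the indices tile. A minimal NumPy sketch of the read pattern, with illustrative names only (not part of the diff):

    import numpy as np

    x = np.arange(16, dtype=np.float32).reshape(4, 4)
    idx = np.array([0, 2])  # source rows to gather

    # tile_load_indexed(x, indices=idx, shape=(2, 4), axis=0) reads rows like:
    gathered = x[idx, :]  # shape (2, 4): rows 0 and 2 of x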
@@ -2440,6 +2594,7 @@ def tile_store_value_func(arg_types, arg_values):
 def tile_store_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
     a = args["a"]
     t = args["t"]
+    bounds_check = args["bounds_check"]
 
     if "offset" in args:
         offset = extract_tuple(args["offset"])
@@ -2447,17 +2602,22 @@ def tile_store_dispatch_func(input_types: Mapping[str, type], return_type: Any,
         offset = (0,) * a.type.ndim
 
     func_args = (a, *offset, t)
-    template_args = []
+    template_args = (a.type.dtype, bounds_check.constant)
 
     return (func_args, template_args)
 
 
 add_builtin(
     "tile_store",
-    input_types={"a": array(dtype=Any), "t": tile(dtype=Any, shape=Tuple[int, ...]), "offset": Tuple[int, ...]},
+    input_types={
+        "a": array(dtype=Any),
+        "t": tile(dtype=Any, shape=Tuple[int, ...]),
+        "offset": Tuple[int, ...],
+        "bounds_check": builtins.bool,
+    },
     value_func=tile_store_value_func,
     dispatch_func=tile_store_dispatch_func,
-    defaults={"offset": None},
+    defaults={"offset": None, "bounds_check": True},
     variadic=False,
     skip_replay=True,
     doc="""Store a tile to a global memory array.
@@ -2466,7 +2626,9 @@ add_builtin(
 
     :param a: The destination array in global memory
     :param t: The source tile to store data from, must have the same data type and number of dimensions as the destination array
-    :param offset: Offset in the destination array (optional)""",
+    :param offset: Offset in the destination array (optional)
+    :param bounds_check: Needed for unaligned tiles, but can disable for memory-aligned tiles for faster write times
+    """,
     group="Tile Primitives",
     export=False,
 )
@@ -2474,10 +2636,15 @@ add_builtin(
 # overload for scalar offset
 add_builtin(
     "tile_store",
-    input_types={"a": array(dtype=Any), "t": tile(dtype=Any, shape=Tuple[int, ...]), "offset": int},
+    input_types={
+        "a": array(dtype=Any),
+        "t": tile(dtype=Any, shape=Tuple[int, ...]),
+        "offset": int,
+        "bounds_check": builtins.bool,
+    },
     value_func=tile_store_value_func,
     dispatch_func=tile_store_dispatch_func,
-    defaults={"offset": None},
+    defaults={"offset": None, "bounds_check": True},
     variadic=False,
     skip_replay=True,
     group="Tile Primitives",
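The new ``bounds_check`` flag on ``tile_load`` and ``tile_store`` above (and on ``tile_atomic_add`` below) defaults to ``True``; per the docstrings it exists for unaligned tiles and can be disabled when tiles are known to line up exactly with the array extents. A minimal sketch, assuming array sizes that are exact multiples of the tile shape (the kernel and sizes are illustrative, not part of the diff):

    import warp as wp

    TILE = 32  # the arrays below are exact multiples of this length

    @wp.kernel
    def copy_tiles(x: wp.array(dtype=float), y: wp.array(dtype=float)):
        i = wp.tid()
        # Tiles are aligned with the array bounds, so the per-element
        # bounds check can be skipped on both the load and the store.
        t = wp.tile_load(x, shape=TILE, offset=i * TILE, bounds_check=False)
        wp.tile_store(y, t, offset=i * TILE, bounds_check=False)

    x = wp.full(4 * TILE, 1.0, dtype=float)
    y = wp.zeros(4 * TILE, dtype=float)
    wp.launch_tiled(copy_tiles, dim=[4], inputs=[x], outputs=[y], block_dim=32)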
@@ -2486,6 +2653,151 @@ add_builtin(
 )
 
 
+def tile_store_indexed_value_func(arg_types, arg_values):
+    # return generic type (for doc builds)
+    if arg_types is None:
+        return None
+
+    a = arg_types["a"]
+    t = arg_types["t"]
+    indices_tile = arg_types["indices"]
+    indices_tile.storage = "shared"  # force to shared
+
+    axis = arg_values["axis"]
+    if axis >= a.ndim:
+        raise ValueError(f"tile_store_indexed() axis argument must be valid axis of array {a}, got {axis}.")
+
+    indices_tile_dim = len(indices_tile.shape)
+    if indices_tile_dim != 1:
+        raise ValueError(
+            f"tile_store_indexed() indices argument must be a 1D tile, got {indices_tile_dim} dimensions instead."
+        )
+
+    num_indices = indices_tile.shape[0]
+    if num_indices != t.shape[axis]:
+        raise ValueError(
+            "The number of elements in the 1D indices tile must match the input tile shape along the specified axis."
+        )
+
+    if "offset" in arg_types:
+        c = extract_tuple(arg_values["offset"])
+    else:
+        c = (0,) * a.ndim
+
+    if len(c) != a.ndim:
+        raise ValueError(
+            f"tile_store_indexed() 'a' argument must have {len(c)} dimensions, "
+            f"calculated based on the provided offset arguments, but got {a.ndim} dimensions."
+        )
+
+    if len(t.shape) != a.ndim:
+        raise ValueError(
+            f"tile_store_indexed() 'a' argument must have the same number of dimensions as the 't' argument, "
+            f"but got {a.ndim} dimensions for 'a' and {len(t.shape)} dimensions for 't'"
+        )
+
+    if not types_equal(arg_types["a"].dtype, arg_types["t"].dtype):
+        raise TypeError(
+            f"tile_store_indexed() 'a' and 't' arguments must have the same dtype, got {arg_types['a'].dtype} and {arg_types['t'].dtype}"
+        )
+
+    return None
+
+
+def tile_store_indexed_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    a = args["a"]
+    indices_tile = args["indices"]
+    axis = args["axis"]
+    t = args["t"]
+
+    if "offset" in args:
+        offset = extract_tuple(args["offset"])
+    else:
+        offset = (0,) * a.type.ndim
+
+    func_args = (a, indices_tile, axis, *offset, t)
+    template_args = []
+
+    return (func_args, template_args)
+
+
+add_builtin(
+    "tile_store_indexed",
+    input_types={
+        "a": array(dtype=Any),
+        "indices": tile(dtype=int, shape=Tuple[int]),
+        "t": tile(dtype=Any, shape=Tuple[int, ...]),
+        "offset": Tuple[int, ...],
+        "axis": int,
+    },
+    value_func=tile_store_indexed_value_func,
+    dispatch_func=tile_store_indexed_dispatch_func,
+    defaults={"offset": None, "axis": 0},
+    variadic=False,
+    skip_replay=True,
+    doc="""Store a tile to a global memory array, with storage along a specified axis mapped according to a 1D tile of indices.
+
+    :param a: The destination array in global memory
+    :param indices: A 1D tile of integer indices mapping to elements in ``a``.
+    :param t: The source tile to store data from, must have the same data type and number of dimensions as the destination array, and along ``axis``, it must have the same number of elements as the ``indices`` tile.
+    :param offset: Offset in the destination array (optional)
+    :param axis: Axis of ``a`` that indices refer to
+
+    This example shows how to map tile rows to the even rows of a 2D array:
+
+    .. code-block:: python
+
+        TILE_M = wp.constant(2)
+        TILE_N = wp.constant(2)
+        TWO_M = wp.constant(TILE_M * 2)
+        TWO_N = wp.constant(TILE_N * 2)
+
+        @wp.kernel
+        def compute(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
+            i, j = wp.tid()
+
+            t = wp.tile_load(x, shape=(TILE_M, TILE_N), offset=(i*TILE_M, j*TILE_N), storage="register")
+
+            evens_M = wp.tile_arange(TILE_M, dtype=int, storage="shared") * 2
+
+            wp.tile_store_indexed(y, indices=evens_M, t=t, offset=(i*TWO_M, j*TILE_N), axis=0)
+
+        M = TILE_M * 2
+        N = TILE_N * 2
+
+        arr = np.arange(M * N, dtype=float).reshape(M, N)
+
+        x = wp.array(arr, dtype=float, requires_grad=True, device=device)
+        y = wp.zeros((M * 2, N), dtype=float, requires_grad=True, device=device)
+
+        wp.launch_tiled(compute, dim=[2,2], inputs=[x], outputs=[y], block_dim=32, device=device)
+
+        print(x.numpy())
+        print(y.numpy())
+
+    Prints:
+
+    .. code-block:: text
+
+        [[ 0.  1.  2.  3.]
+         [ 4.  5.  6.  7.]
+         [ 8.  9. 10. 11.]
+         [12. 13. 14. 15.]]
+
+        [[ 0.  1.  2.  3.]
+         [ 0.  0.  0.  0.]
+         [ 4.  5.  6.  7.]
+         [ 0.  0.  0.  0.]
+         [ 8.  9. 10. 11.]
+         [ 0.  0.  0.  0.]
+         [12. 13. 14. 15.]
+         [ 0.  0.  0.  0.]]
+    """,
+    group="Tile Primitives",
+    export=False,
+)
+
+
 def tile_atomic_add_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
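``tile_store_indexed`` is the scatter counterpart of the gather above: tile slices along ``axis`` are written to the destination positions selected by the indices tile. A minimal NumPy sketch of the write pattern, with illustrative names only:

    import numpy as np

    t = np.arange(8, dtype=np.float32).reshape(2, 4)
    y = np.zeros((4, 4), dtype=np.float32)
    idx = np.array([0, 2])  # destination rows

    # tile_store_indexed(y, indices=idx, t=t, axis=0) writes rows like:
    y[idx, :] = t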
@@ -2526,6 +2838,7 @@ def tile_atomic_add_value_func(arg_types, arg_values):
 def tile_atomic_add_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
     a = args["a"]
     t = args["t"]
+    bounds_check = args["bounds_check"]
 
     if "offset" in args:
         offset = extract_tuple(args["offset"])
@@ -2533,17 +2846,22 @@ def tile_atomic_add_dispatch_func(input_types: Mapping[str, type], return_type:
         offset = (0,) * a.type.ndim
 
     func_args = (a, *offset, t)
-    template_args = []
+    template_args = (a.type.dtype, bounds_check.constant)
 
     return (func_args, template_args)
 
 
 add_builtin(
     "tile_atomic_add",
-    input_types={"a": array(dtype=Any), "t": tile(dtype=Any, shape=Tuple[int, ...]), "offset": Tuple[int, ...]},
+    input_types={
+        "a": array(dtype=Any),
+        "t": tile(dtype=Any, shape=Tuple[int, ...]),
+        "offset": Tuple[int, ...],
+        "bounds_check": builtins.bool,
+    },
     value_func=tile_atomic_add_value_func,
     dispatch_func=tile_atomic_add_dispatch_func,
-    defaults={"offset": None},
+    defaults={"offset": None, "bounds_check": True},
     variadic=False,
     skip_replay=True,
     doc="""Atomically add a tile onto the array `a`, each element will be updated atomically.
@@ -2551,6 +2869,7 @@ add_builtin(
     :param a: Array in global memory, should have the same ``dtype`` as the input tile
     :param t: Source tile to add to the destination array
     :param offset: Offset in the destination array (optional)
+    :param bounds_check: Needed for unaligned tiles, but can disable for memory-aligned tiles for faster write times
     :returns: A tile with the same dimensions and data type as the source tile, holding the original value of the destination elements""",
     group="Tile Primitives",
     export=False,
@@ -2559,10 +2878,15 @@ add_builtin(
 # overload for scalar offset
 add_builtin(
     "tile_atomic_add",
-    input_types={"a": array(dtype=Any), "t": tile(dtype=Any, shape=Tuple[int, ...]), "offset": int},
+    input_types={
+        "a": array(dtype=Any),
+        "t": tile(dtype=Any, shape=Tuple[int, ...]),
+        "offset": int,
+        "bounds_check": builtins.bool,
+    },
     value_func=tile_atomic_add_value_func,
     dispatch_func=tile_atomic_add_dispatch_func,
-    defaults={"offset": None},
+    defaults={"offset": None, "bounds_check": True},
     variadic=False,
     skip_replay=True,
     group="Tile Primitives",
@@ -2571,6 +2895,143 @@ add_builtin(
 )
 
 
+def tile_atomic_add_indexed_value_func(arg_types, arg_values):
+    # return generic type (for doc builds)
+    if arg_types is None:
+        return tile(dtype=Any, shape=Tuple[int, ...])
+
+    a = arg_types["a"]
+    t = arg_types["t"]
+    indices_tile = arg_types["indices"]
+    indices_tile.storage = "shared"  # force to shared
+
+    axis = arg_values["axis"]
+    if axis >= a.ndim:
+        raise ValueError(f"tile_atomic_add_indexed() axis argument must be valid axis of array {a}, got {axis}.")
+
+    indices_tile_dim = len(indices_tile.shape)
+    if indices_tile_dim != 1:
+        raise ValueError(
+            f"tile_atomic_add_indexed() indices argument must be a 1D tile, got {indices_tile_dim} dimensions instead."
+        )
+
+    num_indices = indices_tile.shape[0]
+    if num_indices != t.shape[axis]:
+        raise ValueError(
+            "The number of elements in the 1D indices tile must match the input tile shape along the specified axis."
+        )
+
+    if "offset" in arg_types:
+        c = extract_tuple(arg_values["offset"])
+    else:
+        c = (0,) * a.ndim
+
+    if len(c) != a.ndim:
+        raise ValueError(
+            f"tile_atomic_add_indexed() 'a' argument must have {len(c)} dimensions, "
+            f"calculated based on the provided offset arguments, but got {a.ndim} dimensions."
+        )
+
+    if len(t.shape) != a.ndim:
+        raise ValueError(
+            f"tile_atomic_add_indexed() 'a' argument must have the same number of dimensions as the 't' argument, "
+            f"but got {a.ndim} dimensions for 'a' and {len(t.shape)} dimensions for 't'"
+        )
+
+    if not types_equal(arg_types["a"].dtype, arg_types["t"].dtype):
+        raise TypeError(
+            f"tile_atomic_add_indexed() 'a' and 't' arguments must have the same dtype, got {arg_types['a'].dtype} and {arg_types['t'].dtype}"
+        )
+
+    return tile(dtype=t.dtype, shape=t.shape, storage=t.storage)
+
+
+def tile_atomic_add_indexed_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    a = args["a"]
+    indices_tile = args["indices"]
+    axis = args["axis"]
+    t = args["t"]
+
+    if "offset" in args:
+        offset = extract_tuple(args["offset"])
+    else:
+        offset = (0,) * a.type.ndim
+
+    func_args = (a, indices_tile, axis, *offset, t)
+    template_args = []
+
+    return (func_args, template_args)
+
+
+add_builtin(
+    "tile_atomic_add_indexed",
+    input_types={
+        "a": array(dtype=Any),
+        "indices": tile(dtype=int, shape=Tuple[int]),
+        "t": tile(dtype=Any, shape=Tuple[int, ...]),
+        "offset": Tuple[int, ...],
+        "axis": int,
+    },
+    value_func=tile_atomic_add_indexed_value_func,
+    dispatch_func=tile_atomic_add_indexed_dispatch_func,
+    defaults={"offset": None, "axis": 0},
+    variadic=False,
+    skip_replay=True,
+    doc="""Atomically add a tile to a global memory array, with storage along a specified axis mapped according to a 1D tile of indices.
+
+    :param a: The destination array in global memory
+    :param indices: A 1D tile of integer indices mapping to elements in ``a``.
+    :param t: The source tile to extract data from, must have the same data type and number of dimensions as the destination array, and along ``axis``, it must have the same number of elements as the ``indices`` tile.
+    :param offset: Offset in the destination array (optional)
+    :param axis: Axis of ``a`` that indices refer to
+
+    This example shows how to compute a blocked, row-wise reduction:
+
+    .. code-block:: python
+
+        TILE_M = wp.constant(2)
+        TILE_N = wp.constant(2)
+
+        @wp.kernel
+        def tile_atomic_add_indexed(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
+            i, j = wp.tid()
+
+            t = wp.tile_load(x, shape=(TILE_M, TILE_N), offset=(i*TILE_M, j*TILE_N), storage="register")
+
+            zeros = wp.tile_zeros(TILE_M, dtype=int, storage="shared")
+
+            wp.tile_atomic_add_indexed(y, indices=zeros, t=t, offset=(i, j*TILE_N), axis=0)
+
+        M = TILE_M * 2
+        N = TILE_N * 2
+
+        arr = np.arange(M * N, dtype=float).reshape(M, N)
+
+        x = wp.array(arr, dtype=float, requires_grad=True, device=device)
+        y = wp.zeros((2, N), dtype=float, requires_grad=True, device=device)
+
+        wp.launch_tiled(tile_atomic_add_indexed, dim=[2,2], inputs=[x], outputs=[y], block_dim=32, device=device)
+
+        print(x.numpy())
+        print(y.numpy())
+
+    Prints:
+
+    .. code-block:: text
+
+        [[ 0.  1.  2.  3.]
+         [ 4.  5.  6.  7.]
+         [ 8.  9. 10. 11.]
+         [12. 13. 14. 15.]]
+
+        [[ 4.  6.  8. 10.]
+         [20. 22. 24. 26.]]
+    """,
+    group="Tile Primitives",
+    export=False,
+)
+
+
 def tile_view_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
@@ -3934,14 +4395,45 @@ def tile_unary_map_value_func(arg_types, arg_values):
     if not is_tile(a):
         raise TypeError(f"tile_map() 'a' argument must be a tile, got {a!r}")
 
-    return tile(dtype=a.dtype, shape=a.shape)
+    if "op" in arg_values:
+        op = arg_values["op"]
+        try:
+            overload = op.get_overload([a.dtype], {})
+        except KeyError as exc:
+            raise RuntimeError(f"No overload of {op} found for tile element type {type_repr(a.dtype)}") from exc
+
+        # build the right overload on demand
+        if overload.value_func is None:
+            overload.build(None)
+
+        value_type = overload.value_func(None, None)
+
+        if not type_is_scalar(value_type) and not type_is_vector(value_type) and not type_is_matrix(value_type):
+            raise TypeError(f"Operator {op} returns unsupported type {type_repr(value_type)} for a tile element")
+
+        return tile(dtype=value_type, shape=a.shape)
+
+    else:
+        return tile(dtype=a.dtype, shape=a.shape)
+
+
+def tile_unary_map_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
+    op = arg_values["op"]
+    tile_a = arg_values["a"]
+
+    overload = op.get_overload([tile_a.type.dtype], {})
+
+    # necessary, in case return type is different from input tile types
+    tile_r = Var(label=None, type=return_type)
+
+    return ((overload, tile_a, tile_r), ())
 
 
 add_builtin(
     "tile_map",
     input_types={"op": Callable, "a": tile(dtype=Scalar, shape=Tuple[int, ...])},
     value_func=tile_unary_map_value_func,
-
+    dispatch_func=tile_unary_map_dispatch_func,
     # variadic=True,
     native_func="tile_unary_map",
     doc="""Apply a unary function onto the tile.
@@ -3950,7 +4442,7 @@ add_builtin(
 
     :param op: A callable function that accepts one argument and returns one argument, may be a user function or builtin
     :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's data type
-    :returns: A tile with the same dimensions and data type as the input tile
+    :returns: A tile with the same dimensions as the input tile. Its datatype is specified by the return type of op
 
     Example:
 
@@ -3991,10 +4483,6 @@ def tile_binary_map_value_func(arg_types, arg_values):
     if not is_tile(b):
         raise TypeError(f"tile_map() 'b' argument must be a tile, got {b!r}")
 
-    # ensure types equal
-    if not types_equal(a.dtype, b.dtype):
-        raise TypeError(f"tile_map() arguments must have the same dtype, got {a.dtype} and {b.dtype}")
-
     if len(a.shape) != len(b.shape):
         raise ValueError(
             f"tile_map() shapes must have the same number of dimensions, got {len(a.shape)} and {len(b.shape)}"
@@ -4004,7 +4492,47 @@ def tile_binary_map_value_func(arg_types, arg_values):
         if a.shape[i] != b.shape[i]:
             raise ValueError(f"tile_map() shapes do not match on dimension {i}, got {a.shape} and {b.shape}")
 
-    return tile(dtype=a.dtype, shape=a.shape)
+    if "op" in arg_values:
+        op = arg_values["op"]
+        try:
+            overload = op.get_overload([a.dtype, b.dtype], {})
+        except KeyError as exc:
+            raise RuntimeError(
+                f"No overload of {op} found for tile element types {type_repr(a.dtype)}, {type_repr(b.dtype)}"
+            ) from exc
+
+        # build the right overload on demand
+        if overload.value_func is None:
+            overload.build(None)
+
+        value_type = overload.value_func(None, None)
+
+        if not type_is_scalar(value_type) and not type_is_vector(value_type) and not type_is_matrix(value_type):
+            raise TypeError(f"Operator {op} returns unsupported type {type_repr(value_type)} for a tile element")
+
+        return tile(dtype=value_type, shape=a.shape)
+
+    else:
+        # ensure types equal
+        if not types_equal(a.dtype, b.dtype):
+            raise TypeError(
+                f"tile_map() arguments must have the same dtype for this operation, got {a.dtype} and {b.dtype}"
+            )
+
+        return tile(dtype=a.dtype, shape=a.shape)
+
+
+def tile_binary_map_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
+    op = arg_values["op"]
+    tile_a = arg_values["a"]
+    tile_b = arg_values["b"]
+
+    overload = op.get_overload([tile_a.type.dtype, tile_b.type.dtype], {})
+
+    # necessary, in case return type is different from input tile types
+    tile_r = Var(label=None, type=return_type)
+
+    return ((overload, tile_a, tile_b, tile_r), ())
 
 
 add_builtin(
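With the reworked value/dispatch functions above, ``tile_map`` resolves the operator's overload for the input dtypes and takes the output tile's dtype from the operator's return type, so the result no longer has to match the input tiles. A minimal sketch, assuming a user function that maps an integer tile to a float tile (the function and sizes are illustrative):

    import warp as wp

    @wp.func
    def to_float(x: int):
        return float(x) * 0.5

    @wp.kernel
    def scale(src: wp.array(dtype=int), dst: wp.array(dtype=float)):
        t = wp.tile_load(src, shape=8)
        # The result tile's dtype follows the op's return type (float),
        # not the input tile's dtype (int).
        r = wp.tile_map(to_float, t)
        wp.tile_store(dst, r)

    src = wp.array(list(range(8)), dtype=int)
    dst = wp.zeros(8, dtype=float)
    wp.launch_tiled(scale, dim=[1], inputs=[src], outputs=[dst], block_dim=8)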
@@ -4015,18 +4543,18 @@ add_builtin(
         "b": tile(dtype=Scalar, shape=Tuple[int, ...]),
     },
     value_func=tile_binary_map_value_func,
-
+    dispatch_func=tile_binary_map_dispatch_func,
     # variadic=True,
     native_func="tile_binary_map",
     doc="""Apply a binary function onto the tile.
 
     This function cooperatively applies a binary function to each element of the tiles using all threads in the block.
-    Both input tiles must have the same dimensions and datatypes.
+    Both input tiles must have the same dimensions, and if using a builtin op, the same datatypes.
 
     :param op: A callable function that accepts two arguments and returns one argument, all of the same type, may be a user function or builtin
     :param a: The first input tile, the operator (or one of its overloads) must be able to accept the tile's dtype
     :param b: The second input tile, the operator (or one of its overloads) must be able to accept the tile's dtype
-    :returns: A tile with the same dimensions and data type as the input tiles
+    :returns: A tile with the same dimensions as the input tiles. Its datatype is specified by the return type of op
 
     Example:
 
@@ -5544,7 +6072,7 @@ def array_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any
         return array(dtype=Scalar)
 
     dtype = arg_values["dtype"]
-    shape = extract_tuple(arg_values["shape"], as_constant=True)
+    shape = extract_tuple(arg_values["shape"], as_constant=False)
     return array(dtype=dtype, ndim=len(shape))
 
 
@@ -5554,7 +6082,7 @@ def array_dispatch_func(input_types: Mapping[str, type], return_type: Any, args:
     # to the underlying C++ function's runtime and template params.
 
     dtype = return_type.dtype
-    shape = extract_tuple(args["shape"], as_constant=True)
+    shape = extract_tuple(args["shape"], as_constant=False)
 
     func_args = (args["ptr"], *shape)
     template_args = (dtype,)
@@ -5563,7 +6091,7 @@ def array_dispatch_func(input_types: Mapping[str, type], return_type: Any, args:
 
 add_builtin(
     "array",
-    input_types={"ptr": warp.uint64, "shape": Tuple[int, ...], "dtype": Scalar},
+    input_types={"ptr": warp.uint64, "shape": Tuple[int, ...], "dtype": Any},
     value_func=array_value_func,
     export_func=lambda input_types: {k: v for k, v in input_types.items() if k != "dtype"},
    dispatch_func=array_dispatch_func,
@@ -5575,6 +6103,48 @@ add_builtin(
 )
 
 
+def zeros_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    if arg_types is None:
+        return fixedarray(dtype=Scalar)
+
+    dtype = arg_values["dtype"]
+    shape = extract_tuple(arg_values["shape"], as_constant=True)
+
+    if None in shape:
+        raise RuntimeError("the `shape` argument must be specified as a constant when zero-initializing an array")
+
+    return fixedarray(dtype=dtype, shape=shape)
+
+
+def zeros_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    # We're in the codegen stage where we emit the code calling the built-in.
+    # Further validate the given argument values if needed and map them
+    # to the underlying C++ function's runtime and template params.
+
+    dtype = return_type.dtype
+    shape = extract_tuple(args["shape"], as_constant=True)
+
+    size = math.prod(shape)
+
+    func_args = shape
+    template_args = (size, dtype)
+    return (func_args, template_args)
+
+
+add_builtin(
+    "zeros",
+    input_types={"shape": Tuple[int, ...], "dtype": Any},
+    value_func=zeros_value_func,
+    export_func=lambda input_types: {},
+    dispatch_func=zeros_dispatch_func,
+    native_func="fixedarray_t",
+    group="Utility",
+    export=False,
+    missing_grad=True,
+    hidden=True,  # Unhide once we can document both a built-in and a Python scope function sharing the same name.
+)
+
+
 # does argument checking and type propagation for address()
 def address_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
     arr_type = arg_types["arr"]
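The hidden ``zeros`` builtin above backs fixed-shape, kernel-local arrays (``fixedarray``), mirroring the Python-scope ``wp.zeros()``; the shape must be a compile-time constant, as enforced in ``zeros_value_func``. A minimal sketch of in-kernel usage, assuming the builtin is reachable as ``wp.zeros`` in kernel scope (names and sizes are illustrative):

    import warp as wp

    @wp.kernel
    def bucket(values: wp.array(dtype=int), out: wp.array(dtype=int)):
        i = wp.tid()
        # Fixed-size scratch storage local to the thread; the shape is a
        # compile-time constant, so it lowers to a fixedarray_t.
        counts = wp.zeros(shape=(4,), dtype=int)
        counts[values[i] % 4] = 1
        for j in range(4):
            wp.atomic_add(out, j, counts[j])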
@@ -5864,8 +6434,8 @@ def atomic_op_dispatch_func(input_types: Mapping[str, type], return_type: Any, a
 
 
 for array_type in array_types:
-    # don't list indexed array operations explicitly in docs
-    hidden = array_type == indexedarray
+    # don't list fixed or indexed array operations explicitly in docs
+    hidden = array_type in (indexedarray, fixedarray)
 
     add_builtin(
         "atomic_add",
@@ -6187,46 +6757,110 @@ for array_type in array_types:
 
 
 # used to index into builtin types, i.e.: y = vec3[1]
-def
-
+def vector_extract_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    vec_type = arg_types["a"]
+    idx_type = arg_types["i"]
+
+    if isinstance(idx_type, slice_t):
+        length = idx_type.get_length(vec_type._length_)
+        return vector(length=length, dtype=vec_type._wp_scalar_type_)
+
+    return vec_type._wp_scalar_type_
+
+
+def vector_extract_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    func_args = tuple(args.values())
+    template_args = getattr(return_type, "_shape_", ())
+    return (func_args, template_args)
 
 
 add_builtin(
     "extract",
-    input_types={"a": vector(length=Any, dtype=Scalar), "i": int},
-    value_func=
+    input_types={"a": vector(length=Any, dtype=Scalar), "i": Any},
+    value_func=vector_extract_value_func,
+    dispatch_func=vector_extract_dispatch_func,
+    export=False,
     hidden=True,
     group="Utility",
 )
 add_builtin(
     "extract",
-    input_types={"a": quaternion(dtype=Scalar), "i": int},
-    value_func=
+    input_types={"a": quaternion(dtype=Scalar), "i": Any},
+    value_func=vector_extract_value_func,
+    dispatch_func=vector_extract_dispatch_func,
+    export=False,
     hidden=True,
     group="Utility",
 )
-
 add_builtin(
     "extract",
-    input_types={"a":
-    value_func=
-
-
+    input_types={"a": transformation(dtype=Scalar), "i": Any},
+    value_func=vector_extract_value_func,
+    dispatch_func=vector_extract_dispatch_func,
+    export=False,
     hidden=True,
     group="Utility",
 )
+
+
+def matrix_extract_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    mat_type = arg_types["a"]
+    idx_types = tuple(arg_types[x] for x in "ij" if arg_types.get(x, None) is not None)
+
+    # Compute the resulting shape from the slicing, with -1 being simple indexing.
+    shape = tuple(
+        idx.get_length(mat_type._shape_[i]) if isinstance(idx, slice_t) else -1 for i, idx in enumerate(idx_types)
+    )
+
+    # Append any non indexed slice.
+    for i in range(len(idx_types), len(mat_type._shape_)):
+        shape += (mat_type._shape_[i],)
+
+    # Count how many dimensions the output value will have.
+    ndim = sum(1 for x in shape if x >= 0)
+
+    if ndim == 0:
+        return mat_type._wp_scalar_type_
+
+    assert shape[0] != -1 or shape[1] != -1
+
+    if ndim == 1:
+        length = shape[0] if shape[0] != -1 else shape[1]
+        return vector(length=length, dtype=mat_type._wp_scalar_type_)
+
+    assert ndim == 2
+
+    # When a matrix dimension is 0, all other dimensions are also expected to be 0.
+    if any(x == 0 for x in shape):
+        shape = (0,) * len(shape)
+
+    return matrix(shape=shape, dtype=mat_type._wp_scalar_type_)
+
+
+def matrix_extract_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    idx_types = tuple(args[x].type for x in "ij" if args.get(x, None) is not None)
+    has_slice = any(isinstance(x, slice_t) for x in idx_types)
+
+    func_args = tuple(args.values())
+    template_args = getattr(return_type, "_shape_", ()) if has_slice else ()
+    return (func_args, template_args)
+
+
 add_builtin(
     "extract",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int},
-    value_func=
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any},
+    value_func=matrix_extract_value_func,
+    dispatch_func=matrix_extract_dispatch_func,
+    export=False,
     hidden=True,
     group="Utility",
 )
-
 add_builtin(
     "extract",
-    input_types={"a":
-    value_func=
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "j": Any},
+    value_func=matrix_extract_value_func,
+    dispatch_func=matrix_extract_dispatch_func,
+    export=False,
     hidden=True,
     group="Utility",
 )
|
|
|
6247
6881
|
return (func_args, template_args)
|
|
6248
6882
|
|
|
6249
6883
|
|
|
6884
|
+
def matrix_ij_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
|
|
6885
|
+
mat_type = arg_types["a"]
|
|
6886
|
+
value_type = mat_type._wp_scalar_type_
|
|
6887
|
+
|
|
6888
|
+
return Reference(value_type)
|
|
6889
|
+
|
|
6890
|
+
|
|
6891
|
+
def matrix_ij_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
|
|
6892
|
+
func_args = (Reference(args["a"]), args["i"], args["j"])
|
|
6893
|
+
template_args = ()
|
|
6894
|
+
return (func_args, template_args)
|
|
6895
|
+
|
|
6896
|
+
|
|
6250
6897
|
# implements &vector[index]
|
|
6251
6898
|
add_builtin(
|
|
6252
6899
|
"index",
|
|
@@ -6287,6 +6934,16 @@ add_builtin(
|
|
|
6287
6934
|
group="Utility",
|
|
6288
6935
|
skip_replay=True,
|
|
6289
6936
|
)
|
|
6937
|
+
# implements &(*matrix)[i, j]
|
|
6938
|
+
add_builtin(
|
|
6939
|
+
"indexref",
|
|
6940
|
+
input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "j": int},
|
|
6941
|
+
value_func=matrix_ij_value_func,
|
|
6942
|
+
dispatch_func=matrix_ij_dispatch_func,
|
|
6943
|
+
hidden=True,
|
|
6944
|
+
group="Utility",
|
|
6945
|
+
skip_replay=True,
|
|
6946
|
+
)
|
|
6290
6947
|
# implements &(*quaternion)[index]
|
|
6291
6948
|
add_builtin(
|
|
6292
6949
|
"indexref",
|
|
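The new matrix ``indexref`` overload above returns a reference to a single component, which is what two-index in-place updates lower to. A minimal sketch, assuming ``m[i, j] op= value`` resolves to this builtin:

    import warp as wp

    @wp.kernel
    def accumulate(out: wp.array(dtype=wp.mat22)):
        m = wp.mat22(0.0)
        # &(*matrix)[i, j]: read-modify-write one component in place
        m[0, 1] += 2.0
        m[1, 0] -= 1.0
        out[0] = m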
@@ -6309,11 +6966,46 @@ add_builtin(
 )
 
 
+def vector_assign_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    vec = args["a"].type
+    idx = args["i"].type
+    value_type = strip_reference(args["value"].type)
+
+    if isinstance(idx, slice_t):
+        length = idx.get_length(vec._length_)
+
+        if type_is_vector(value_type):
+            if not types_equal(value_type._wp_scalar_type_, vec._wp_scalar_type_):
+                raise ValueError(
+                    f"The provided vector is expected to be of length {length} with dtype {type_repr(vec._wp_scalar_type_)}."
+                )
+            if value_type._length_ != length:
+                raise ValueError(
+                    f"The length of the provided vector ({args['value'].type._length_}) isn't compatible with the given slice (expected {length})."
+                )
+            template_args = (length,)
+        else:
+            # Disallow broadcasting.
+            raise ValueError(
+                f"The provided value is expected to be a vector of length {length}, with dtype {type_repr(vec._wp_scalar_type_)}."
+            )
+    else:
+        if not types_equal(value_type, vec._wp_scalar_type_):
+            raise ValueError(
+                f"The provided value is expected to be a scalar of type {type_repr(vec._wp_scalar_type_)}."
+            )
+        template_args = ()
+
+    func_args = tuple(args.values())
+    return (func_args, template_args)
+
+
 # implements vector[index] = value
 add_builtin(
     "assign_inplace",
-    input_types={"a": vector(length=Any, dtype=Scalar), "i": int, "value": Scalar},
+    input_types={"a": vector(length=Any, dtype=Scalar), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
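``vector_assign_dispatch_func`` above validates both accepted store forms: a scalar store into one component, and a slice store from a vector of matching length and dtype (broadcasting a scalar into a slice is explicitly rejected). A minimal sketch of the two forms:

    import warp as wp

    @wp.kernel
    def assign(out: wp.array(dtype=wp.vec4)):
        v = wp.vec4(0.0)
        v[0] = 1.0                  # scalar store: dtype must match
        v[1:3] = wp.vec2(2.0, 3.0)  # slice store: length and dtype must match
        out[0] = v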
@@ -6322,8 +7014,9 @@ add_builtin(
|
|
|
6322
7014
|
# implements quaternion[index] = value
|
|
6323
7015
|
add_builtin(
|
|
6324
7016
|
"assign_inplace",
|
|
6325
|
-
input_types={"a": quaternion(dtype=Scalar), "i":
|
|
7017
|
+
input_types={"a": quaternion(dtype=Scalar), "i": Any, "value": Any},
|
|
6326
7018
|
value_type=None,
|
|
7019
|
+
dispatch_func=vector_assign_dispatch_func,
|
|
6327
7020
|
hidden=True,
|
|
6328
7021
|
export=False,
|
|
6329
7022
|
group="Utility",
|
|
@@ -6331,15 +7024,16 @@ add_builtin(
|
|
|
6331
7024
|
# implements transformation[index] = value
|
|
6332
7025
|
add_builtin(
|
|
6333
7026
|
"assign_inplace",
|
|
6334
|
-
input_types={"a": transformation(dtype=Scalar), "i":
|
|
7027
|
+
input_types={"a": transformation(dtype=Scalar), "i": Any, "value": Any},
|
|
6335
7028
|
value_type=None,
|
|
7029
|
+
dispatch_func=vector_assign_dispatch_func,
|
|
6336
7030
|
hidden=True,
|
|
6337
7031
|
export=False,
|
|
6338
7032
|
group="Utility",
|
|
6339
7033
|
)
|
|
6340
7034
|
|
|
6341
7035
|
|
|
6342
|
-
def
|
|
7036
|
+
def vector_assign_copy_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
|
|
6343
7037
|
vec_type = arg_types["a"]
|
|
6344
7038
|
return vec_type
|
|
6345
7039
|
|
|
@@ -6347,8 +7041,9 @@ def vector_assign_value_func(arg_types: Mapping[str, type], arg_values: Mapping[
|
|
|
6347
7041
|
# implements vector[index] = value, performs a copy internally if wp.config.enable_vector_component_overwrites is True
|
|
6348
7042
|
add_builtin(
|
|
6349
7043
|
"assign_copy",
|
|
6350
|
-
input_types={"a": vector(length=Any, dtype=Scalar), "i":
|
|
6351
|
-
value_func=
|
|
7044
|
+
input_types={"a": vector(length=Any, dtype=Scalar), "i": Any, "value": Any},
|
|
7045
|
+
value_func=vector_assign_copy_value_func,
|
|
7046
|
+
dispatch_func=vector_assign_dispatch_func,
|
|
6352
7047
|
hidden=True,
|
|
6353
7048
|
export=False,
|
|
6354
7049
|
group="Utility",
|
|
@@ -6357,8 +7052,9 @@ add_builtin(
|
|
|
6357
7052
|
# implements quaternion[index] = value, performs a copy internally if wp.config.enable_vector_component_overwrites is True
|
|
6358
7053
|
add_builtin(
|
|
6359
7054
|
"assign_copy",
|
|
6360
|
-
input_types={"a": quaternion(dtype=Scalar), "i":
|
|
6361
|
-
value_func=
|
|
7055
|
+
input_types={"a": quaternion(dtype=Scalar), "i": Any, "value": Any},
|
|
7056
|
+
value_func=vector_assign_copy_value_func,
|
|
7057
|
+
dispatch_func=vector_assign_dispatch_func,
|
|
6362
7058
|
hidden=True,
|
|
6363
7059
|
export=False,
|
|
6364
7060
|
group="Utility",
|
|
@@ -6367,8 +7063,9 @@ add_builtin(
|
|
|
6367
7063
|
# implements transformation[index] = value, performs a copy internally if wp.config.enable_vector_component_overwrites is True
|
|
6368
7064
|
add_builtin(
|
|
6369
7065
|
"assign_copy",
|
|
6370
|
-
input_types={"a": transformation(dtype=Scalar), "i":
|
|
6371
|
-
value_func=
|
|
7066
|
+
input_types={"a": transformation(dtype=Scalar), "i": Any, "value": Any},
|
|
7067
|
+
value_func=vector_assign_copy_value_func,
|
|
7068
|
+
dispatch_func=vector_assign_dispatch_func,
|
|
6372
7069
|
hidden=True,
|
|
6373
7070
|
export=False,
|
|
6374
7071
|
group="Utility",
|
|
@@ -6377,8 +7074,9 @@ add_builtin(
 # implements vector[idx] += scalar
 add_builtin(
     "add_inplace",
-    input_types={"a": vector(length=Any, dtype=Scalar), "i":
+    input_types={"a": vector(length=Any, dtype=Scalar), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
@@ -6387,8 +7085,9 @@ add_builtin(
 # implements quaternion[idx] += scalar
 add_builtin(
     "add_inplace",
-    input_types={"a": quaternion(dtype=Scalar), "i":
+    input_types={"a": quaternion(dtype=Scalar), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
@@ -6397,8 +7096,9 @@ add_builtin(
 # implements transformation[idx] += scalar
 add_builtin(
     "add_inplace",
-    input_types={"a": transformation(dtype=Float), "i":
+    input_types={"a": transformation(dtype=Float), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
@@ -6417,8 +7117,9 @@ add_builtin(
 # implements vector[idx] -= scalar
 add_builtin(
     "sub_inplace",
-    input_types={"a": vector(length=Any, dtype=Scalar), "i":
+    input_types={"a": vector(length=Any, dtype=Scalar), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
@@ -6427,8 +7128,9 @@ add_builtin(
 # implements quaternion[idx] -= scalar
 add_builtin(
     "sub_inplace",
-    input_types={"a": quaternion(dtype=Scalar), "i":
+    input_types={"a": quaternion(dtype=Scalar), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
@@ -6437,8 +7139,9 @@ add_builtin(
 # implements transformation[idx] -= scalar
 add_builtin(
     "sub_inplace",
-    input_types={"a": transformation(dtype=
+    input_types={"a": transformation(dtype=Float), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
@@ -6499,61 +7202,154 @@ def matrix_vector_sametype(arg_types: Mapping[str, Any]):
     return mat_size == vec_size and mat_type == vec_type


-
+def matrix_assign_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    mat = args["a"].type
+    value_type = strip_reference(args["value"].type)
+
+    idxs = tuple(args[x].type for x in "ij" if args.get(x, None) is not None)
+    has_slice = any(isinstance(x, slice_t) for x in idxs)
+
+    if has_slice:
+        # Compute the resulting shape from the slicing, with -1 being simple indexing.
+        shape = tuple(idx.get_length(mat._shape_[i]) if isinstance(idx, slice_t) else -1 for i, idx in enumerate(idxs))
+
+        # Append any non indexed slice.
+        for i in range(len(idxs), len(mat._shape_)):
+            shape += (mat._shape_[i],)
+
+        # Count how many dimensions the output value will have.
+        ndim = sum(1 for x in shape if x >= 0)
+        assert ndim > 0
+
+        if ndim == 1:
+            length = shape[0] if shape[0] != -1 else shape[1]
+
+            if type_is_vector(value_type):
+                if not types_equal(value_type._wp_scalar_type_, mat._wp_scalar_type_):
+                    raise ValueError(
+                        f"The provided vector is expected to be of length {length} with dtype {type_repr(mat._wp_scalar_type_)}."
+                    )
+
+                if value_type._length_ != length:
+                    raise ValueError(
+                        f"The length of the provided vector ({value_type._length_}) isn't compatible with the given slice (expected {length})."
+                    )
+
+                template_args = (length,)
+            else:
+                # Disallow broadcasting.
+                raise ValueError(
+                    f"The provided value is expected to be a vector of length {length}, with dtype {type_repr(mat._wp_scalar_type_)}."
+                )
+        else:
+            assert ndim == 2
+
+            # When a matrix dimension is 0, all other dimensions are also expected to be 0.
+            if any(x == 0 for x in shape):
+                shape = (0,) * len(shape)
+
+            if type_is_matrix(value_type):
+                if not types_equal(value_type._wp_scalar_type_, mat._wp_scalar_type_):
+                    raise ValueError(
+                        f"The provided matrix is expected to be of shape {shape} with dtype {type_repr(mat._wp_scalar_type_)}."
+                    )
+
+                if value_type._shape_ != shape:
+                    raise ValueError(
+                        f"The shape of the provided matrix ({value_type._shape_}) isn't compatible with the given slice (expected {shape})."
+                    )
+
+                template_args = shape
+            else:
+                # Disallow broadcasting.
+                raise ValueError(
+                    f"The provided value is expected to be a matrix of shape {shape}, with dtype {type_repr(mat._wp_scalar_type_)}."
+                )
+    elif len(idxs) == 1:
+        if not type_is_vector(value_type) or not types_equal(value_type._wp_scalar_type_, mat._wp_scalar_type_):
+            raise ValueError(
+                f"The provided value is expected to be a vector of length {mat._shape_[1]}, with dtype {type_repr(mat._wp_scalar_type_)}."
+            )
+
+        if value_type._length_ != mat._shape_[1]:
+            raise ValueError(
+                f"The length of the provided vector ({value_type._length_}) isn't compatible with the given slice (expected {mat._shape_[1]})."
+            )
+
+        template_args = ()
+    elif len(idxs) == 2:
+        if not types_equal(value_type, mat._wp_scalar_type_):
+            raise ValueError(
+                f"The provided value is expected to be a scalar of type {type_repr(mat._wp_scalar_type_)}."
+            )
+
+        template_args = ()
+    else:
+        raise AssertionError
+
+    func_args = tuple(args.values())
+    return (func_args, template_args)
+
+
+# implements matrix[i] = value
 add_builtin(
     "assign_inplace",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "value": Any},
+    constraint=matrix_vector_sametype,
     value_type=None,
+    dispatch_func=matrix_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
 )


-# implements matrix[i] =
+# implements matrix[i,j] = value
 add_builtin(
     "assign_inplace",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
-    constraint=matrix_vector_sametype,
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "j": Any, "value": Any},
     value_type=None,
+    dispatch_func=matrix_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
 )


-def
+def matrix_assign_copy_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
     mat_type = arg_types["a"]
     return mat_type


-# implements matrix[i
+# implements matrix[i] = value
 add_builtin(
     "assign_copy",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
-    value_func=
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "value": Any},
+    value_func=matrix_assign_copy_value_func,
+    dispatch_func=matrix_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
 )


-# implements matrix[i] =
+# implements matrix[i,j] = value
 add_builtin(
     "assign_copy",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
-
-
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "j": Any, "value": Any},
+    value_func=matrix_assign_copy_value_func,
+    dispatch_func=matrix_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
 )


-# implements matrix[i
+# implements matrix[i] += value
 add_builtin(
     "add_inplace",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "value": Any},
+    constraint=matrix_vector_sametype,
     value_type=None,
     hidden=True,
     export=False,
@@ -6561,11 +7357,10 @@ add_builtin(
 )


-# implements matrix[i] +=
+# implements matrix[i,j] += value
 add_builtin(
     "add_inplace",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
-    constraint=matrix_vector_sametype,
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "j": Any, "value": Any},
     value_type=None,
     hidden=True,
     export=False,
@@ -6573,10 +7368,10 @@ add_builtin(
 )


-# implements matrix[i
+# implements matrix[i] -= value
 add_builtin(
     "sub_inplace",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "value": Any},
     value_type=None,
     hidden=True,
     export=False,
@@ -6584,10 +7379,10 @@ add_builtin(
 )


-# implements matrix[i] -=
+# implements matrix[i,j] -= value
 add_builtin(
     "sub_inplace",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "j": Any, "value": Any},
     value_type=None,
     hidden=True,
     export=False,
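
Together, `matrix_assign_dispatch_func` and the builtins above let kernels assign a whole row, a single element, or (via slices) a sub-block of a matrix, with shape and dtype validated at compile time. A hedged sketch of the kernel-level syntax they back (the kernel and names are ours, not part of the diff):

    import warp as wp

    @wp.kernel
    def matrix_writes(out: wp.array(dtype=wp.mat33)):
        tid = wp.tid()
        m = out[tid]
        m[0] = wp.vec3(1.0, 2.0, 3.0)   # matrix[i] = vector (row assignment)
        m[1, 2] = 4.0                   # matrix[i,j] = scalar
        m[2] += wp.vec3(0.1, 0.1, 0.1)  # matrix[i] += vector (add_inplace)
        out[tid] = m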
@@ -6807,6 +7602,7 @@ add_builtin(
 # ---------------------------------
 # Operators

+
 add_builtin(
     "add", input_types={"a": Scalar, "b": Scalar}, value_func=sametypes_create_value_func(Scalar), group="Operators"
 )
@@ -7079,7 +7875,7 @@ add_builtin(
     "mod",
     input_types={"a": vector(length=Any, dtype=Scalar), "b": vector(length=Any, dtype=Scalar)},
     constraint=sametypes,
-    value_func=sametypes_create_value_func(Scalar),
+    value_func=sametypes_create_value_func(vector(length=Any, dtype=Scalar)),
     doc="Modulo operation using truncated division.",
     group="Operators",
 )
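
The `mod` change above fixes the declared result type of element-wise vector modulo: it previously resolved to a scalar and now resolves to a vector of the same length and dtype as the operands. A small sketch of the behavior this corrects (kernel and names are ours):

    import warp as wp

    @wp.kernel
    def mod_demo(out: wp.array(dtype=wp.vec3)):
        tid = wp.tid()
        a = wp.vec3(5.0, 7.0, 9.0)
        b = wp.vec3(2.0, 4.0, 5.0)
        # Element-wise truncated-division modulo; with this fix the
        # expression is typed vec3 rather than float.
        out[tid] = wp.mod(a, b)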
@@ -7481,7 +8277,7 @@ def tile_matmul_lto_dispatch_func(
     num_threads = options["block_dim"]
     arch = options["output_arch"]

-    if arch is None or not warp.context.runtime.core.
+    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ((0, 0, 0, a, b, out), template_args, [], 0)
     else:
@@ -7671,7 +8467,7 @@ def tile_fft_generic_lto_dispatch_func(
     arch = options["output_arch"]
     ept = size // num_threads

-    if arch is None or not warp.context.runtime.core.
+    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ([], [], [], 0)
     else:
@@ -7792,28 +8588,27 @@ def tile_cholesky_generic_lto_dispatch_func(
         raise TypeError("tile_cholesky() returns one output")
     out = return_values[0]

-    dtype, precision_enum = cusolver_type_map[a.type.dtype]
-
     # We already ensured a is square in tile_cholesky_generic_value_func()
     M, N = a.type.shape
     if out.type.shape[0] != M or out.type.shape[1] != M:
         raise ValueError("tile_cholesky() output tile must be square")

-    solver = "potrf"
-    solver_enum = cusolver_function_map[solver]
-
-    side_enum = cusolver_side_map["-"]
-    diag_enum = cusolver_diag_map["-"]
-    fill_mode = cusolver_fill_mode_map["lower"]
-
     arch = options["output_arch"]
-    num_threads = options["block_dim"]
-    parameter_list = f"({dtype}*, int*)"

-    if arch is None or not warp.context.runtime.core.
+    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ((0, a, out), [], [], 0)
     else:
+        solver = "potrf"
+        solver_enum = cusolver_function_map[solver]
+        side_enum = cusolver_side_map["-"]
+        diag_enum = cusolver_diag_map["-"]
+        fill_mode = cusolver_fill_mode_map["lower"]
+        dtype, precision_enum = cusolver_type_map[a.type.dtype]
+        num_threads = options["block_dim"]
+        parameter_list = f"({dtype}*, int*)"
+        req_smem_bytes = a.type.size * type_size_in_bytes(a.type.dtype)
+
         # generate the LTO
         lto_symbol, lto_code_data = warp.build.build_lto_solver(
             M,
@@ -7831,6 +8626,7 @@ def tile_cholesky_generic_lto_dispatch_func(
             num_threads,
             parameter_list,
             builder,
+            smem_estimate_bytes=req_smem_bytes,
         )

         return ((Var(lto_symbol, str, False, True, False), a, out), [], [lto_code_data], 0)
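
The new `req_smem_bytes` value gives `build_lto_solver` an estimate of the shared memory the factorization tile will occupy. The arithmetic is simply element count times element size; for example (the tile shape here is our assumption, not from the diff):

    # A 64x64 float64 Cholesky tile:
    # a.type.size == 64 * 64 elements, type_size_in_bytes(float64) == 8
    req_smem_bytes = 64 * 64 * 8  # 32768 bytes, i.e. 32 KiB of shared memory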
@@ -7918,9 +8714,7 @@ def tile_cholesky_solve_generic_lto_dispatch_func(
     if any(T not in cusolver_type_map.keys() for T in [y.type.dtype, L.type.dtype]):
         raise TypeError("tile_cholesky_solve() arguments be tiles of float64 or float32")

-    dtype, precision_enum = cusolver_type_map[L.type.dtype]
     M, N = L.type.shape
-    NRHS = x.type.shape[1] if len(x.type.shape) > 1 else 1

     if len(x.type.shape) > 2 or len(x.type.shape) < 1:
         raise TypeError(f"tile_cholesky_solve() output vector must be 1D or 2D, got {len(x.type.shape)}-D")
@@ -7931,21 +8725,23 @@ def tile_cholesky_solve_generic_lto_dispatch_func(
             f"got {x.type.shape[0]} elements in output and {M} rows in 'L'"
         )

-    solver = "potrs"
-    solver_enum = cusolver_function_map[solver]
-
-    side_enum = cusolver_side_map["-"]
-    diag_enum = cusolver_diag_map["-"]
-    fill_mode = cusolver_fill_mode_map["lower"]
-
     arch = options["output_arch"]
-    num_threads = options["block_dim"]
-    parameter_list = f"({dtype}*, {dtype}*)"

-    if arch is None or not warp.context.runtime.core.
+    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ((0, L, y, x), [], [], 0)
     else:
+        NRHS = x.type.shape[1] if len(x.type.shape) > 1 else 1
+        solver = "potrs"
+        solver_enum = cusolver_function_map[solver]
+        side_enum = cusolver_side_map["-"]
+        diag_enum = cusolver_diag_map["-"]
+        fill_mode = cusolver_fill_mode_map["lower"]
+        dtype, precision_enum = cusolver_type_map[L.type.dtype]
+        num_threads = options["block_dim"]
+        parameter_list = f"({dtype}*, {dtype}*)"
+        req_smem_bytes = (x.type.size + y.type.size + L.type.size) * type_size_in_bytes(L.type.dtype)
+
         # generate the LTO
         lto_symbol, lto_code_data = warp.build.build_lto_solver(
             M,
@@ -7963,6 +8759,7 @@ def tile_cholesky_solve_generic_lto_dispatch_func(
             num_threads,
             parameter_list,
             builder,
+            smem_estimate_bytes=req_smem_bytes,
         )

         return ((Var(lto_symbol, str, False, True, False), L, y, x), [], [lto_code_data], 0)
@@ -8013,9 +8810,7 @@ def tile_lower_solve_generic_lto_dispatch_func(

     z = return_values[0]

-    dtype, precision_enum = cusolver_type_map[L.type.dtype]
     M, N = L.type.shape
-    NRHS = z.type.shape[1] if len(z.type.shape) > 1 else 1

     if len(z.type.shape) > 2 or len(z.type.shape) < 1:
         raise TypeError(f"tile_lower_solve() output vector must be 1D or 2D, got {len(z.type.shape)}-D")
@@ -8026,21 +8821,23 @@ def tile_lower_solve_generic_lto_dispatch_func(
             f"got {z.type.shape[0]} elements in output and {M} rows in 'L'"
         )

-    solver = "trsm"
-    solver_enum = cusolver_function_map[solver]
-
-    side_enum = cusolver_side_map["left"]
-    diag_enum = cusolver_diag_map["nounit"]
-    fill_mode = cusolver_fill_mode_map["lower"]
-
     arch = options["output_arch"]
-    num_threads = options["block_dim"]
-    parameter_list = f"({dtype}*, {dtype}*)"

-    if arch is None or not warp.context.runtime.core.
+    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ((0, L, y, z), [], [], 0)
     else:
+        NRHS = z.type.shape[1] if len(z.type.shape) > 1 else 1
+        solver = "trsm"
+        solver_enum = cusolver_function_map[solver]
+        side_enum = cusolver_side_map["left"]
+        diag_enum = cusolver_diag_map["nounit"]
+        fill_mode = cusolver_fill_mode_map["lower"]
+        dtype, precision_enum = cusolver_type_map[L.type.dtype]
+        num_threads = options["block_dim"]
+        parameter_list = f"({dtype}*, {dtype}*)"
+        req_smem_bytes = (z.type.size + y.type.size + L.type.size) * type_size_in_bytes(L.type.dtype)
+
         # generate the LTO
         lto_symbol, lto_code_data = warp.build.build_lto_solver(
             M,
@@ -8058,6 +8855,7 @@ def tile_lower_solve_generic_lto_dispatch_func(
             num_threads,
             parameter_list,
             builder,
+            smem_estimate_bytes=req_smem_bytes,
         )

         return ((Var(lto_symbol, str, False, True, False), L, y, z), [], [lto_code_data], 0)
@@ -8144,9 +8942,7 @@ def tile_upper_solve_generic_lto_dispatch_func(

     x = return_values[0]

-    dtype, precision_enum = cusolver_type_map[U.type.dtype]
     M, N = U.type.shape
-    NRHS = x.type.shape[1] if len(x.type.shape) > 1 else 1

     if len(z.type.shape) > 2 or len(z.type.shape) < 1:
         raise TypeError(f"tile_upper_solve() output tile must be 1D or 2D, got {len(z.type.shape)}-D")
@@ -8157,21 +8953,23 @@ def tile_upper_solve_generic_lto_dispatch_func(
             f"got {z.type.shape[0]} elements in output and {M} rows in 'U'"
         )

-    solver = "trsm"
-    solver_enum = cusolver_function_map[solver]
-
-    side_enum = cusolver_side_map["left"]
-    diag_enum = cusolver_diag_map["nounit"]
-    fill_mode = cusolver_fill_mode_map["upper"]
-
     arch = options["output_arch"]
-    num_threads = options["block_dim"]
-    parameter_list = f"({dtype}*, {dtype}*)"

-    if arch is None or not warp.context.runtime.core.
+    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ((0, U, z, x), [], [], 0)
     else:
+        NRHS = x.type.shape[1] if len(x.type.shape) > 1 else 1
+        solver = "trsm"
+        solver_enum = cusolver_function_map[solver]
+        side_enum = cusolver_side_map["left"]
+        diag_enum = cusolver_diag_map["nounit"]
+        fill_mode = cusolver_fill_mode_map["upper"]
+        dtype, precision_enum = cusolver_type_map[U.type.dtype]
+        num_threads = options["block_dim"]
+        parameter_list = f"({dtype}*, {dtype}*)"
+        req_smem_bytes = (x.type.size + z.type.size + U.type.size) * type_size_in_bytes(U.type.dtype)
+
         # generate the LTO
         lto_symbol, lto_code_data = warp.build.build_lto_solver(
             M,
@@ -8189,6 +8987,7 @@ def tile_upper_solve_generic_lto_dispatch_func(
             num_threads,
             parameter_list,
             builder,
+            smem_estimate_bytes=req_smem_bytes,
         )

         return ((Var(lto_symbol, str, False, True, False), U, z, x), [], [lto_code_data], 0)
@@ -8413,3 +9212,22 @@ add_builtin(
     group="Utility",
     export=False,
 )
+
+# ---------------------------------
+# Slicing
+
+
+def slice_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    return slice_t(**arg_values)
+
+
+add_builtin(
+    "slice",
+    input_types={"start": int, "stop": int, "step": int},
+    value_func=slice_value_func,
+    native_func="slice_t",
+    export=False,
+    group="Utility",
+    hidden=True,
+    missing_grad=True,
+)
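
The new hidden `slice` builtin maps Python slice expressions inside kernels onto the native `slice_t` type, which backs the slice-aware assignment paths added earlier in this diff. A hedged usage sketch, assuming 1.9's slicing support on vector values (the kernel and names are ours):

    import warp as wp

    @wp.kernel
    def slice_demo(out: wp.array(dtype=wp.vec2)):
        tid = wp.tid()
        v = wp.vec4(1.0, 2.0, 3.0, 4.0)
        # v[1:3] constructs a slice_t via the hidden "slice" builtin
        # and reads components 1 and 2 as a length-2 vector.
        out[tid] = v[1:3]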