warp-lang 1.8.1-py3-none-macosx_10_13_universal2.whl → 1.9.1-py3-none-macosx_10_13_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warp/__init__.py +282 -103
- warp/__init__.pyi +1904 -114
- warp/bin/libwarp-clang.dylib +0 -0
- warp/bin/libwarp.dylib +0 -0
- warp/build.py +93 -30
- warp/build_dll.py +331 -101
- warp/builtins.py +1244 -160
- warp/codegen.py +317 -206
- warp/config.py +1 -1
- warp/context.py +1465 -789
- warp/examples/core/example_marching_cubes.py +1 -0
- warp/examples/core/example_render_opengl.py +100 -3
- warp/examples/fem/example_apic_fluid.py +98 -52
- warp/examples/fem/example_convection_diffusion_dg.py +25 -4
- warp/examples/fem/example_diffusion_mgpu.py +8 -3
- warp/examples/fem/utils.py +68 -22
- warp/examples/interop/example_jax_kernel.py +2 -1
- warp/fabric.py +1 -1
- warp/fem/cache.py +27 -19
- warp/fem/domain.py +2 -2
- warp/fem/field/nodal_field.py +2 -2
- warp/fem/field/virtual.py +264 -166
- warp/fem/geometry/geometry.py +5 -5
- warp/fem/integrate.py +129 -51
- warp/fem/space/restriction.py +4 -0
- warp/fem/space/shape/tet_shape_function.py +3 -10
- warp/jax_experimental/custom_call.py +25 -2
- warp/jax_experimental/ffi.py +22 -1
- warp/jax_experimental/xla_ffi.py +16 -7
- warp/marching_cubes.py +708 -0
- warp/native/array.h +99 -4
- warp/native/builtin.h +86 -9
- warp/native/bvh.cpp +64 -28
- warp/native/bvh.cu +58 -58
- warp/native/bvh.h +2 -2
- warp/native/clang/clang.cpp +7 -7
- warp/native/coloring.cpp +8 -2
- warp/native/crt.cpp +2 -2
- warp/native/crt.h +3 -5
- warp/native/cuda_util.cpp +41 -10
- warp/native/cuda_util.h +10 -4
- warp/native/exports.h +1842 -1908
- warp/native/fabric.h +2 -1
- warp/native/hashgrid.cpp +37 -37
- warp/native/hashgrid.cu +2 -2
- warp/native/initializer_array.h +1 -1
- warp/native/intersect.h +2 -2
- warp/native/mat.h +1910 -116
- warp/native/mathdx.cpp +43 -43
- warp/native/mesh.cpp +24 -24
- warp/native/mesh.cu +26 -26
- warp/native/mesh.h +4 -2
- warp/native/nanovdb/GridHandle.h +179 -12
- warp/native/nanovdb/HostBuffer.h +8 -7
- warp/native/nanovdb/NanoVDB.h +517 -895
- warp/native/nanovdb/NodeManager.h +323 -0
- warp/native/nanovdb/PNanoVDB.h +2 -2
- warp/native/quat.h +331 -14
- warp/native/range.h +7 -1
- warp/native/reduce.cpp +10 -10
- warp/native/reduce.cu +13 -14
- warp/native/runlength_encode.cpp +2 -2
- warp/native/runlength_encode.cu +5 -5
- warp/native/scan.cpp +3 -3
- warp/native/scan.cu +4 -4
- warp/native/sort.cpp +10 -10
- warp/native/sort.cu +40 -31
- warp/native/sort.h +2 -0
- warp/native/sparse.cpp +8 -8
- warp/native/sparse.cu +13 -13
- warp/native/spatial.h +366 -17
- warp/native/temp_buffer.h +2 -2
- warp/native/tile.h +471 -82
- warp/native/vec.h +328 -14
- warp/native/volume.cpp +54 -54
- warp/native/volume.cu +1 -1
- warp/native/volume.h +2 -1
- warp/native/volume_builder.cu +30 -37
- warp/native/warp.cpp +150 -149
- warp/native/warp.cu +377 -216
- warp/native/warp.h +227 -226
- warp/optim/linear.py +736 -271
- warp/render/imgui_manager.py +289 -0
- warp/render/render_opengl.py +99 -18
- warp/render/render_usd.py +1 -0
- warp/sim/graph_coloring.py +2 -2
- warp/sparse.py +558 -175
- warp/tests/aux_test_module_aot.py +7 -0
- warp/tests/cuda/test_async.py +3 -3
- warp/tests/cuda/test_conditional_captures.py +101 -0
- warp/tests/geometry/test_hash_grid.py +38 -0
- warp/tests/geometry/test_marching_cubes.py +233 -12
- warp/tests/interop/test_jax.py +608 -28
- warp/tests/sim/test_coloring.py +6 -6
- warp/tests/test_array.py +58 -5
- warp/tests/test_codegen.py +4 -3
- warp/tests/test_context.py +8 -15
- warp/tests/test_enum.py +136 -0
- warp/tests/test_examples.py +2 -2
- warp/tests/test_fem.py +49 -6
- warp/tests/test_fixedarray.py +229 -0
- warp/tests/test_func.py +18 -15
- warp/tests/test_future_annotations.py +7 -5
- warp/tests/test_linear_solvers.py +30 -0
- warp/tests/test_map.py +15 -1
- warp/tests/test_mat.py +1518 -378
- warp/tests/test_mat_assign_copy.py +178 -0
- warp/tests/test_mat_constructors.py +574 -0
- warp/tests/test_module_aot.py +287 -0
- warp/tests/test_print.py +69 -0
- warp/tests/test_quat.py +140 -34
- warp/tests/test_quat_assign_copy.py +145 -0
- warp/tests/test_reload.py +2 -1
- warp/tests/test_sparse.py +71 -0
- warp/tests/test_spatial.py +140 -34
- warp/tests/test_spatial_assign_copy.py +160 -0
- warp/tests/test_struct.py +43 -3
- warp/tests/test_tuple.py +96 -0
- warp/tests/test_types.py +61 -20
- warp/tests/test_vec.py +179 -34
- warp/tests/test_vec_assign_copy.py +143 -0
- warp/tests/tile/test_tile.py +245 -18
- warp/tests/tile/test_tile_cholesky.py +605 -0
- warp/tests/tile/test_tile_load.py +169 -0
- warp/tests/tile/test_tile_mathdx.py +2 -558
- warp/tests/tile/test_tile_matmul.py +1 -1
- warp/tests/tile/test_tile_mlp.py +1 -1
- warp/tests/tile/test_tile_shared_memory.py +5 -5
- warp/tests/unittest_suites.py +6 -0
- warp/tests/walkthrough_debug.py +1 -1
- warp/thirdparty/unittest_parallel.py +108 -9
- warp/types.py +571 -267
- warp/utils.py +68 -86
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/METADATA +29 -69
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/RECORD +138 -128
- warp/native/marching.cpp +0 -19
- warp/native/marching.cu +0 -514
- warp/native/marching.h +0 -19
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/WHEEL +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/top_level.txt +0 -0
warp/builtins.py
CHANGED
@@ -17,10 +17,12 @@ from __future__ import annotations

 import builtins
 import functools
+import math
 from typing import Any, Callable, Mapping, Sequence

 import warp.build
 import warp.context
+import warp.utils
 from warp.codegen import Reference, Var, get_arg_value, strip_reference
 from warp.types import *

@@ -124,6 +126,7 @@ add_builtin(
     value_func=sametypes_create_value_func(Scalar),
     doc="Return -1 if ``x`` < 0, return 1 otherwise.",
     group="Scalar Math",
+    missing_grad=True,
 )

 add_builtin(
@@ -132,6 +135,7 @@ add_builtin(
     value_func=sametypes_create_value_func(Scalar),
     doc="Return 1.0 if ``x`` < 0.0, return 0.0 otherwise.",
     group="Scalar Math",
+    missing_grad=True,
 )
 add_builtin(
     "nonzero",
@@ -139,6 +143,7 @@ add_builtin(
     value_func=sametypes_create_value_func(Scalar),
     doc="Return 1.0 if ``x`` is not equal to zero, return 0.0 otherwise.",
     group="Scalar Math",
+    missing_grad=True,
 )

 add_builtin(
@@ -290,6 +295,7 @@ add_builtin(

     This is the most intuitive form of rounding in the colloquial sense, but can be slower than other options like :func:`warp.rint()`.
     Differs from :func:`numpy.round()`, which behaves the same way as :func:`numpy.rint()`.""",
+    missing_grad=True,
 )

 add_builtin(
@@ -300,6 +306,7 @@ add_builtin(
     doc="""Return the nearest integer value to ``x``, rounding halfway cases to nearest even integer.

     It is generally faster than :func:`warp.round()`. Equivalent to :func:`numpy.rint()`.""",
+    missing_grad=True,
 )

 add_builtin(
@@ -312,6 +319,7 @@ add_builtin(
     In other words, it discards the fractional part of ``x``.
     It is similar to casting ``float(int(a))``, but preserves the negative sign when ``x`` is in the range [-0.0, -1.0).
     Equivalent to :func:`numpy.trunc()` and :func:`numpy.fix()`.""",
+    missing_grad=True,
 )

 add_builtin(
@@ -320,6 +328,7 @@ add_builtin(
     value_func=sametypes_create_value_func(Float),
     group="Scalar Math",
     doc="""Return the largest integer that is less than or equal to ``x``.""",
+    missing_grad=True,
 )

 add_builtin(
@@ -328,6 +337,7 @@ add_builtin(
     value_func=sametypes_create_value_func(Float),
     group="Scalar Math",
     doc="""Return the smallest integer that is greater than or equal to ``x``.""",
+    missing_grad=True,
 )

 add_builtin(
@@ -338,6 +348,7 @@ add_builtin(
     doc="""Retrieve the fractional part of ``x``.

     In other words, it discards the integer part of ``x`` and is equivalent to ``x - trunc(x)``.""",
+    missing_grad=True,
 )

 add_builtin(
@@ -346,6 +357,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Scalar Math",
     doc="""Return ``True`` if ``a`` is a finite number, otherwise return ``False``.""",
+    missing_grad=True,
 )
 add_builtin(
     "isfinite",
@@ -353,6 +365,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if all elements of the vector ``a`` are finite, otherwise return ``False``.",
+    missing_grad=True,
 )
 add_builtin(
     "isfinite",
@@ -360,6 +373,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if all elements of the quaternion ``a`` are finite, otherwise return ``False``.",
+    missing_grad=True,
 )
 add_builtin(
     "isfinite",
@@ -367,6 +381,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if all elements of the matrix ``a`` are finite, otherwise return ``False``.",
+    missing_grad=True,
 )

 add_builtin(
@@ -375,6 +390,7 @@ add_builtin(
     value_type=builtins.bool,
     doc="Return ``True`` if ``a`` is NaN, otherwise return ``False``.",
     group="Scalar Math",
+    missing_grad=True,
 )
 add_builtin(
     "isnan",
@@ -382,6 +398,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if any element of the vector ``a`` is NaN, otherwise return ``False``.",
+    missing_grad=True,
 )
 add_builtin(
     "isnan",
@@ -389,6 +406,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if any element of the quaternion ``a`` is NaN, otherwise return ``False``.",
+    missing_grad=True,
 )
 add_builtin(
     "isnan",
@@ -396,6 +414,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if any element of the matrix ``a`` is NaN, otherwise return ``False``.",
+    missing_grad=True,
 )

 add_builtin(
@@ -404,6 +423,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Scalar Math",
     doc="""Return ``True`` if ``a`` is positive or negative infinity, otherwise return ``False``.""",
+    missing_grad=True,
 )
 add_builtin(
     "isinf",
@@ -411,6 +431,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if any element of the vector ``a`` is positive or negative infinity, otherwise return ``False``.",
+    missing_grad=True,
 )
 add_builtin(
     "isinf",
@@ -418,6 +439,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if any element of the quaternion ``a`` is positive or negative infinity, otherwise return ``False``.",
+    missing_grad=True,
 )
 add_builtin(
     "isinf",
@@ -425,6 +447,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if any element of the matrix ``a`` is positive or negative infinity, otherwise return ``False``.",
+    missing_grad=True,
 )


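The hunks above only tag existing scalar and vector builtins with ``missing_grad=True``, marking that no adjoint is defined for them. As a rough illustration of what that flag means for users (a minimal sketch; the kernel, array contents, and the zero-gradient expectation are assumptions for illustration, not part of the diff), gradients simply do not propagate through such calls:

    import warp as wp

    @wp.kernel
    def round_kernel(x: wp.array(dtype=float), y: wp.array(dtype=float)):
        tid = wp.tid()
        # round() is one of the builtins flagged missing_grad, so it contributes no adjoint
        y[tid] = wp.round(x[tid])

    x = wp.array([0.2, 1.7], dtype=float, requires_grad=True)
    y = wp.zeros(2, dtype=float, requires_grad=True)

    tape = wp.Tape()
    with tape:
        wp.launch(round_kernel, dim=2, inputs=[x], outputs=[y])

    tape.backward(grads={y: wp.ones(2, dtype=float)})
    print(x.grad.numpy())  # expected to stay [0. 0.]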
@@ -1180,6 +1203,7 @@ add_builtin(
     doc="Create an identity matrix with shape=(n,n) with the type given by ``dtype``.",
     group="Vector Math",
     export=False,
+    missing_grad=True,
 )


@@ -1544,6 +1568,7 @@ add_builtin(
     group="Quaternion Math",
     doc="Construct an identity quaternion with zero imaginary part and real part of 1.0",
     export=True,
+    missing_grad=True,
 )

 add_builtin(
@@ -1759,6 +1784,7 @@ add_builtin(
     doc="Construct a spatial transform vector of given dtype.",
     group="Spatial Math",
     export=False,
+    missing_grad=True,
 )


@@ -1793,6 +1819,7 @@ add_builtin(
     group="Transformations",
     doc="Construct an identity transform with zero translation and identity rotation.",
     export=True,
+    missing_grad=True,
 )

 add_builtin(
@@ -2355,6 +2382,7 @@ def tile_load_tuple_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
 def tile_load_tuple_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
     a = args["a"]
     shape = extract_tuple(args["shape"], as_constant=True)
+    bounds_check = args["bounds_check"]

     if None in shape:
         raise ValueError("Tile functions require shape to be a compile time constant.")
@@ -2365,17 +2393,23 @@ def tile_load_tuple_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
         offset = (0,) * a.type.ndim

     func_args = (a, *offset)
-    template_args = shape
+    template_args = (return_type.dtype, bounds_check.constant, *shape)

     return (func_args, template_args)


 add_builtin(
     "tile_load",
-    input_types={"a": array(dtype=Any), "shape": Tuple[int, ...], "offset": Tuple[int, ...], "storage": str},
+    input_types={
+        "a": array(dtype=Any),
+        "shape": Tuple[int, ...],
+        "offset": Tuple[int, ...],
+        "storage": str,
+        "bounds_check": builtins.bool,
+    },
     value_func=tile_load_tuple_value_func,
     dispatch_func=tile_load_tuple_dispatch_func,
-    defaults={"offset": None, "storage": "register"},
+    defaults={"offset": None, "storage": "register", "bounds_check": True},
     variadic=False,
     doc="""Loads a tile from a global memory array.

@@ -2386,6 +2420,7 @@ add_builtin(
     :param offset: Offset in the source array to begin reading from (optional)
     :param storage: The storage location for the tile: ``"register"`` for registers
         (default) or ``"shared"`` for shared memory.
+    :param bounds_check: Needed for unaligned tiles, but can disable for memory-aligned tiles for faster load times
     :returns: A tile with shape as specified and data type the same as the source array""",
     group="Tile Primitives",
     export=False,
@@ -2394,16 +2429,160 @@ add_builtin(
 # overload for scalar shape
 add_builtin(
     "tile_load",
-    input_types={"a": array(dtype=Any), "shape": int, "offset": int, "storage": str},
+    input_types={"a": array(dtype=Any), "shape": int, "offset": int, "storage": str, "bounds_check": builtins.bool},
     value_func=tile_load_tuple_value_func,
     dispatch_func=tile_load_tuple_dispatch_func,
-    defaults={"offset": None, "storage": "register"},
+    defaults={"offset": None, "storage": "register", "bounds_check": True},
     group="Tile Primitives",
     hidden=True,
     export=False,
 )


+def tile_load_indexed_tuple_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    if arg_types is None:
+        return tile(dtype=Any, shape=Tuple[int, ...])
+
+    a = arg_types["a"]
+
+    indices_tile = arg_types["indices"]
+    indices_tile.storage = "shared"  # force to shared
+
+    axis = arg_values["axis"]
+    if axis >= a.ndim:
+        raise ValueError(f"tile_load_indexed() axis argument must be valid axis of array {a}, got {axis}.")
+
+    indices_tile_dim = len(indices_tile.shape)
+    if indices_tile_dim != 1:
+        raise ValueError(
+            f"tile_load_indexed() indices argument must be a 1D tile, got {indices_tile_dim} dimensions instead."
+        )
+
+    shape = extract_tuple(arg_values["shape"], as_constant=True)
+
+    if None in shape:
+        raise ValueError("Tile functions require shape to be a compile time constant.")
+
+    num_indices = indices_tile.shape[0]
+    if num_indices != shape[axis]:
+        raise ValueError(
+            "The number of elements in the 1D indices tile must match the output tile shape along the specified axis."
+        )
+
+    if "offset" in arg_values:
+        offset = extract_tuple(arg_values["offset"])
+    else:
+        offset = (0,) * a.ndim
+
+    if a.ndim != len(shape):
+        raise ValueError(
+            f"tile_load_indexed() array argument must have same number of dimensions as the tile shape, trying to perform an {len(shape)} dimensional load from an array with {a.ndim} dimensions."
+        )
+
+    if a.ndim != len(offset):
+        raise ValueError(
+            f"tile_load_indexed() offset argument must have the same number of dimensions as the array to load from, got {len(offset)} indices for an array with {a.ndim} dimensions"
+        )
+
+    if arg_values["storage"] not in {"shared", "register"}:
+        raise ValueError(f"Invalid value for 'storage': {arg_values['storage']!r}. Expected 'shared' or 'register'.")
+
+    return tile(dtype=a.dtype, shape=shape, storage=arg_values["storage"])
+
+
+def tile_load_indexed_tuple_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    a = args["a"]
+    indices_tile = args["indices"]
+    axis = args["axis"]
+
+    shape = extract_tuple(args["shape"], as_constant=True)
+
+    if None in shape:
+        raise ValueError("Tile functions require shape to be a compile time constant.")
+
+    if "offset" in args:
+        offset = extract_tuple(args["offset"])
+    else:
+        offset = (0,) * a.type.ndim
+
+    func_args = (a, indices_tile, axis, *offset)
+    template_args = shape
+
+    return (func_args, template_args)
+
+
+add_builtin(
+    "tile_load_indexed",
+    input_types={
+        "a": array(dtype=Any),
+        "indices": tile(dtype=int, shape=Tuple[int]),
+        "shape": Tuple[int, ...],
+        "offset": Tuple[int, ...],
+        "axis": int,
+        "storage": str,
+    },
+    value_func=tile_load_indexed_tuple_value_func,
+    dispatch_func=tile_load_indexed_tuple_dispatch_func,
+    defaults={"offset": None, "axis": 0, "storage": "register"},
+    variadic=False,
+    doc="""Loads a tile from a global memory array, with loads along a specified axis mapped according to a 1D tile of indices.
+
+    :param a: The source array in global memory
+    :param indices: A 1D tile of integer indices mapping to elements in ``a``.
+    :param shape: Shape of the tile to load, must have the same number of dimensions as ``a``, and along ``axis``, it must have the same number of elements as the ``indices`` tile.
+    :param offset: Offset in the source array to begin reading from (optional)
+    :param axis: Axis of ``a`` that indices refer to
+    :param storage: The storage location for the tile: ``"register"`` for registers (default) or ``"shared"`` for shared memory.
+    :returns: A tile with shape as specified and data type the same as the source array
+
+    This example shows how to select and store the even indexed rows from a 2D array:
+
+    .. code-block:: python
+
+        TILE_M = wp.constant(2)
+        TILE_N = wp.constant(2)
+        HALF_M = wp.constant(TILE_M // 2)
+        HALF_N = wp.constant(TILE_N // 2)
+
+        @wp.kernel
+        def compute(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
+            i, j = wp.tid()
+
+            evens = wp.tile_arange(HALF_M, dtype=int, storage="shared") * 2
+
+            t0 = wp.tile_load_indexed(x, indices=evens, shape=(HALF_M, TILE_N), offset=(i*TILE_M, j*TILE_N), axis=0, storage="register")
+            wp.tile_store(y, t0, offset=(i*HALF_M, j*TILE_N))
+
+        M = TILE_M * 2
+        N = TILE_N * 2
+
+        arr = np.arange(M * N).reshape(M, N)
+
+        x = wp.array(arr, dtype=float)
+        y = wp.zeros((M // 2, N), dtype=float)
+
+        wp.launch_tiled(compute, dim=[2,2], inputs=[x], outputs=[y], block_dim=32, device=device)
+
+        print(x.numpy())
+        print(y.numpy())
+
+    Prints:
+
+    .. code-block:: text
+
+        [[ 0. 1. 2. 3.]
+         [ 4. 5. 6. 7.]
+         [ 8. 9. 10. 11.]
+         [12. 13. 14. 15.]]
+
+        [[ 0. 1. 2. 3.]
+         [ 8. 9. 10. 11.]]
+    """,
+    group="Tile Primitives",
+    export=False,
+)
+
+
 def tile_store_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
@@ -2440,6 +2619,7 @@ def tile_store_value_func(arg_types, arg_values):
 def tile_store_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
     a = args["a"]
     t = args["t"]
+    bounds_check = args["bounds_check"]

     if "offset" in args:
         offset = extract_tuple(args["offset"])
@@ -2447,17 +2627,22 @@ def tile_store_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
         offset = (0,) * a.type.ndim

     func_args = (a, *offset, t)
-    template_args = []
+    template_args = (a.type.dtype, bounds_check.constant)

     return (func_args, template_args)


 add_builtin(
     "tile_store",
-    input_types={"a": array(dtype=Any), "t": tile(dtype=Any, shape=Tuple[int, ...]), "offset": Tuple[int, ...]},
+    input_types={
+        "a": array(dtype=Any),
+        "t": tile(dtype=Any, shape=Tuple[int, ...]),
+        "offset": Tuple[int, ...],
+        "bounds_check": builtins.bool,
+    },
     value_func=tile_store_value_func,
     dispatch_func=tile_store_dispatch_func,
-    defaults={"offset": None},
+    defaults={"offset": None, "bounds_check": True},
     variadic=False,
     skip_replay=True,
     doc="""Store a tile to a global memory array.
@@ -2466,7 +2651,9 @@ add_builtin(

     :param a: The destination array in global memory
     :param t: The source tile to store data from, must have the same data type and number of dimensions as the destination array
-    :param offset: Offset in the destination array (optional)""",
+    :param offset: Offset in the destination array (optional)
+    :param bounds_check: Needed for unaligned tiles, but can disable for memory-aligned tiles for faster write times
+    """,
     group="Tile Primitives",
     export=False,
 )
@@ -2474,10 +2661,15 @@ add_builtin(
 # overload for scalar offset
 add_builtin(
     "tile_store",
-    input_types={"a": array(dtype=Any), "t": tile(dtype=Any, shape=Tuple[int, ...]), "offset": int},
+    input_types={
+        "a": array(dtype=Any),
+        "t": tile(dtype=Any, shape=Tuple[int, ...]),
+        "offset": int,
+        "bounds_check": builtins.bool,
+    },
     value_func=tile_store_value_func,
     dispatch_func=tile_store_dispatch_func,
-    defaults={"offset": None},
+    defaults={"offset": None, "bounds_check": True},
     variadic=False,
     skip_replay=True,
     group="Tile Primitives",
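The new ``bounds_check`` argument shown above defaults to ``True`` for ``tile_load``, ``tile_store``, and ``tile_atomic_add``. A minimal sketch of opting out when the array extents are exact multiples of the tile shape (the sizes, names, and alignment assumption here are illustrative only, not taken from the diff):

    import warp as wp

    TILE_M = wp.constant(32)
    TILE_N = wp.constant(32)

    @wp.kernel
    def copy_tiles(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
        i, j = wp.tid()
        # the 64x64 arrays are exact multiples of the 32x32 tile,
        # so the per-element bounds checks can be skipped
        t = wp.tile_load(x, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N), bounds_check=False)
        wp.tile_store(y, t, offset=(i * TILE_M, j * TILE_N), bounds_check=False)

    x = wp.full((64, 64), 1.0, dtype=float)
    y = wp.zeros((64, 64), dtype=float)
    wp.launch_tiled(copy_tiles, dim=(2, 2), inputs=[x], outputs=[y], block_dim=64)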
@@ -2486,6 +2678,151 @@ add_builtin(
 )


+def tile_store_indexed_value_func(arg_types, arg_values):
+    # return generic type (for doc builds)
+    if arg_types is None:
+        return None
+
+    a = arg_types["a"]
+    t = arg_types["t"]
+    indices_tile = arg_types["indices"]
+    indices_tile.storage = "shared"  # force to shared
+
+    axis = arg_values["axis"]
+    if axis >= a.ndim:
+        raise ValueError(f"tile_store_indexed() axis argument must be valid axis of array {a}, got {axis}.")
+
+    indices_tile_dim = len(indices_tile.shape)
+    if indices_tile_dim != 1:
+        raise ValueError(
+            f"tile_store_indexed() indices argument must be a 1D tile, got {indices_tile_dim} dimensions instead."
+        )
+
+    num_indices = indices_tile.shape[0]
+    if num_indices != t.shape[axis]:
+        raise ValueError(
+            "The number of elements in the 1D indices tile must match the input tile shape along the specified axis."
+        )
+
+    if "offset" in arg_types:
+        c = extract_tuple(arg_values["offset"])
+    else:
+        c = (0,) * a.ndim
+
+    if len(c) != a.ndim:
+        raise ValueError(
+            f"tile_store_indexed() 'a' argument must have {len(c)} dimensions, "
+            f"calculated based on the provided offset arguments, but got {a.ndim} dimensions."
+        )
+
+    if len(t.shape) != a.ndim:
+        raise ValueError(
+            f"tile_store_indexed() 'a' argument must have the same number of dimensions as the 't' argument, "
+            f"but got {a.ndim} dimensions for 'a' and {len(t.shape)} dimensions for 't'"
+        )
+
+    if not types_equal(arg_types["a"].dtype, arg_types["t"].dtype):
+        raise TypeError(
+            f"tile_store_indexed() 'a' and 't' arguments must have the same dtype, got {arg_types['a'].dtype} and {arg_types['t'].dtype}"
+        )
+
+    return None
+
+
+def tile_store_indexed_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    a = args["a"]
+    indices_tile = args["indices"]
+    axis = args["axis"]
+    t = args["t"]
+
+    if "offset" in args:
+        offset = extract_tuple(args["offset"])
+    else:
+        offset = (0,) * a.type.ndim
+
+    func_args = (a, indices_tile, axis, *offset, t)
+    template_args = []
+
+    return (func_args, template_args)
+
+
+add_builtin(
+    "tile_store_indexed",
+    input_types={
+        "a": array(dtype=Any),
+        "indices": tile(dtype=int, shape=Tuple[int]),
+        "t": tile(dtype=Any, shape=Tuple[int, ...]),
+        "offset": Tuple[int, ...],
+        "axis": int,
+    },
+    value_func=tile_store_indexed_value_func,
+    dispatch_func=tile_store_indexed_dispatch_func,
+    defaults={"offset": None, "axis": 0},
+    variadic=False,
+    skip_replay=True,
+    doc="""Store a tile to a global memory array, with storage along a specified axis mapped according to a 1D tile of indices.
+
+    :param a: The destination array in global memory
+    :param indices: A 1D tile of integer indices mapping to elements in ``a``.
+    :param t: The source tile to store data from, must have the same data type and number of dimensions as the destination array, and along ``axis``, it must have the same number of elements as the ``indices`` tile.
+    :param offset: Offset in the destination array (optional)
+    :param axis: Axis of ``a`` that indices refer to
+
+    This example shows how to map tile rows to the even rows of a 2D array:
+
+    .. code-block:: python
+
+        TILE_M = wp.constant(2)
+        TILE_N = wp.constant(2)
+        TWO_M = wp.constant(TILE_M * 2)
+        TWO_N = wp.constant(TILE_N * 2)
+
+        @wp.kernel
+        def compute(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
+            i, j = wp.tid()
+
+            t = wp.tile_load(x, shape=(TILE_M, TILE_N), offset=(i*TILE_M, j*TILE_N), storage="register")
+
+            evens_M = wp.tile_arange(TILE_M, dtype=int, storage="shared") * 2
+
+            wp.tile_store_indexed(y, indices=evens_M, t=t, offset=(i*TWO_M, j*TILE_N), axis=0)
+
+        M = TILE_M * 2
+        N = TILE_N * 2
+
+        arr = np.arange(M * N, dtype=float).reshape(M, N)
+
+        x = wp.array(arr, dtype=float, requires_grad=True, device=device)
+        y = wp.zeros((M * 2, N), dtype=float, requires_grad=True, device=device)
+
+        wp.launch_tiled(compute, dim=[2,2], inputs=[x], outputs=[y], block_dim=32, device=device)
+
+        print(x.numpy())
+        print(y.numpy())
+
+    Prints:
+
+    .. code-block:: text
+
+        [[ 0. 1. 2. 3.]
+         [ 4. 5. 6. 7.]
+         [ 8. 9. 10. 11.]
+         [12. 13. 14. 15.]]
+
+        [[ 0. 1. 2. 3.]
+         [ 0. 0. 0. 0.]
+         [ 4. 5. 6. 7.]
+         [ 0. 0. 0. 0.]
+         [ 8. 9. 10. 11.]
+         [ 0. 0. 0. 0.]
+         [12. 13. 14. 15.]
+         [ 0. 0. 0. 0.]]
+    """,
+    group="Tile Primitives",
+    export=False,
+)
+
+
 def tile_atomic_add_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
@@ -2526,6 +2863,7 @@ def tile_atomic_add_value_func(arg_types, arg_values):
 def tile_atomic_add_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
     a = args["a"]
     t = args["t"]
+    bounds_check = args["bounds_check"]

     if "offset" in args:
         offset = extract_tuple(args["offset"])
@@ -2533,17 +2871,22 @@ def tile_atomic_add_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
         offset = (0,) * a.type.ndim

     func_args = (a, *offset, t)
-    template_args = []
+    template_args = (a.type.dtype, bounds_check.constant)

     return (func_args, template_args)


 add_builtin(
     "tile_atomic_add",
-    input_types={"a": array(dtype=Any), "t": tile(dtype=Any, shape=Tuple[int, ...]), "offset": Tuple[int, ...]},
+    input_types={
+        "a": array(dtype=Any),
+        "t": tile(dtype=Any, shape=Tuple[int, ...]),
+        "offset": Tuple[int, ...],
+        "bounds_check": builtins.bool,
+    },
     value_func=tile_atomic_add_value_func,
     dispatch_func=tile_atomic_add_dispatch_func,
-    defaults={"offset": None},
+    defaults={"offset": None, "bounds_check": True},
     variadic=False,
     skip_replay=True,
     doc="""Atomically add a tile onto the array `a`, each element will be updated atomically.
@@ -2551,6 +2894,7 @@ add_builtin(
     :param a: Array in global memory, should have the same ``dtype`` as the input tile
     :param t: Source tile to add to the destination array
     :param offset: Offset in the destination array (optional)
+    :param bounds_check: Needed for unaligned tiles, but can disable for memory-aligned tiles for faster write times
     :returns: A tile with the same dimensions and data type as the source tile, holding the original value of the destination elements""",
     group="Tile Primitives",
     export=False,
@@ -2559,10 +2903,15 @@ add_builtin(
 # overload for scalar offset
 add_builtin(
     "tile_atomic_add",
-    input_types={"a": array(dtype=Any), "t": tile(dtype=Any, shape=Tuple[int, ...]), "offset": int},
+    input_types={
+        "a": array(dtype=Any),
+        "t": tile(dtype=Any, shape=Tuple[int, ...]),
+        "offset": int,
+        "bounds_check": builtins.bool,
+    },
     value_func=tile_atomic_add_value_func,
     dispatch_func=tile_atomic_add_dispatch_func,
-    defaults={"offset": None},
+    defaults={"offset": None, "bounds_check": True},
     variadic=False,
     skip_replay=True,
     group="Tile Primitives",
@@ -2571,6 +2920,143 @@ add_builtin(
 )


+def tile_atomic_add_indexed_value_func(arg_types, arg_values):
+    # return generic type (for doc builds)
+    if arg_types is None:
+        return tile(dtype=Any, shape=Tuple[int, ...])
+
+    a = arg_types["a"]
+    t = arg_types["t"]
+    indices_tile = arg_types["indices"]
+    indices_tile.storage = "shared"  # force to shared
+
+    axis = arg_values["axis"]
+    if axis >= a.ndim:
+        raise ValueError(f"tile_atomic_add_indexed() axis argument must be valid axis of array {a}, got {axis}.")
+
+    indices_tile_dim = len(indices_tile.shape)
+    if indices_tile_dim != 1:
+        raise ValueError(
+            f"tile_atomic_add_indexed() indices argument must be a 1D tile, got {indices_tile_dim} dimensions instead."
+        )
+
+    num_indices = indices_tile.shape[0]
+    if num_indices != t.shape[axis]:
+        raise ValueError(
+            "The number of elements in the 1D indices tile must match the input tile shape along the specified axis."
+        )
+
+    if "offset" in arg_types:
+        c = extract_tuple(arg_values["offset"])
+    else:
+        c = (0,) * a.ndim
+
+    if len(c) != a.ndim:
+        raise ValueError(
+            f"tile_atomic_add_indexed() 'a' argument must have {len(c)} dimensions, "
+            f"calculated based on the provided offset arguments, but got {a.ndim} dimensions."
+        )
+
+    if len(t.shape) != a.ndim:
+        raise ValueError(
+            f"tile_atomic_add_indexed() 'a' argument must have the same number of dimensions as the 't' argument, "
+            f"but got {a.ndim} dimensions for 'a' and {len(t.shape)} dimensions for 't'"
+        )
+
+    if not types_equal(arg_types["a"].dtype, arg_types["t"].dtype):
+        raise TypeError(
+            f"tile_atomic_add_indexed() 'a' and 't' arguments must have the same dtype, got {arg_types['a'].dtype} and {arg_types['t'].dtype}"
+        )
+
+    return tile(dtype=t.dtype, shape=t.shape, storage=t.storage)
+
+
+def tile_atomic_add_indexed_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    a = args["a"]
+    indices_tile = args["indices"]
+    axis = args["axis"]
+    t = args["t"]
+
+    if "offset" in args:
+        offset = extract_tuple(args["offset"])
+    else:
+        offset = (0,) * a.type.ndim
+
+    func_args = (a, indices_tile, axis, *offset, t)
+    template_args = []
+
+    return (func_args, template_args)
+
+
+add_builtin(
+    "tile_atomic_add_indexed",
+    input_types={
+        "a": array(dtype=Any),
+        "indices": tile(dtype=int, shape=Tuple[int]),
+        "t": tile(dtype=Any, shape=Tuple[int, ...]),
+        "offset": Tuple[int, ...],
+        "axis": int,
+    },
+    value_func=tile_atomic_add_indexed_value_func,
+    dispatch_func=tile_atomic_add_indexed_dispatch_func,
+    defaults={"offset": None, "axis": 0},
+    variadic=False,
+    skip_replay=True,
+    doc="""Atomically add a tile to a global memory array, with storage along a specified axis mapped according to a 1D tile of indices.
+
+    :param a: The destination array in global memory
+    :param indices: A 1D tile of integer indices mapping to elements in ``a``.
+    :param t: The source tile to extract data from, must have the same data type and number of dimensions as the destination array, and along ``axis``, it must have the same number of elements as the ``indices`` tile.
+    :param offset: Offset in the destination array (optional)
+    :param axis: Axis of ``a`` that indices refer to
+
+    This example shows how to compute a blocked, row-wise reduction:
+
+    .. code-block:: python
+
+        TILE_M = wp.constant(2)
+        TILE_N = wp.constant(2)
+
+        @wp.kernel
+        def tile_atomic_add_indexed(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
+            i, j = wp.tid()
+
+            t = wp.tile_load(x, shape=(TILE_M, TILE_N), offset=(i*TILE_M, j*TILE_N), storage="register")
+
+            zeros = wp.tile_zeros(TILE_M, dtype=int, storage="shared")
+
+            wp.tile_atomic_add_indexed(y, indices=zeros, t=t, offset=(i, j*TILE_N), axis=0)
+
+        M = TILE_M * 2
+        N = TILE_N * 2
+
+        arr = np.arange(M * N, dtype=float).reshape(M, N)
+
+        x = wp.array(arr, dtype=float, requires_grad=True, device=device)
+        y = wp.zeros((2, N), dtype=float, requires_grad=True, device=device)
+
+        wp.launch_tiled(tile_atomic_add_indexed, dim=[2,2], inputs=[x], outputs=[y], block_dim=32, device=device)
+
+        print(x.numpy())
+        print(y.numpy())
+
+    Prints:
+
+    .. code-block:: text
+
+        [[ 0. 1. 2. 3.]
+         [ 4. 5. 6. 7.]
+         [ 8. 9. 10. 11.]
+         [12. 13. 14. 15.]]
+
+        [[ 4. 6. 8. 10.]
+         [20. 22. 24. 26.]]
+    """,
+    group="Tile Primitives",
+    export=False,
+)
+
+
 def tile_view_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
@@ -3525,6 +4011,7 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    missing_grad=True,
 )


@@ -3578,6 +4065,7 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    missing_grad=True,
 )


@@ -3631,6 +4119,7 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    missing_grad=True,
 )


@@ -3683,6 +4172,7 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    missing_grad=True,
 )


@@ -3735,6 +4225,7 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    missing_grad=True,
 )


@@ -3792,6 +4283,7 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    missing_grad=True,
 )


@@ -3855,6 +4347,7 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    missing_grad=True,
 )


@@ -3918,6 +4411,7 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    missing_grad=True,
 )


@@ -3934,14 +4428,45 @@ def tile_unary_map_value_func(arg_types, arg_values):
     if not is_tile(a):
         raise TypeError(f"tile_map() 'a' argument must be a tile, got {a!r}")

-    return tile(dtype=a.dtype, shape=a.shape)
+    if "op" in arg_values:
+        op = arg_values["op"]
+        try:
+            overload = op.get_overload([a.dtype], {})
+        except KeyError as exc:
+            raise RuntimeError(f"No overload of {op} found for tile element type {type_repr(a.dtype)}") from exc
+
+        # build the right overload on demand
+        if overload.value_func is None:
+            overload.build(None)
+
+        value_type = overload.value_func(None, None)
+
+        if not type_is_scalar(value_type) and not type_is_vector(value_type) and not type_is_matrix(value_type):
+            raise TypeError(f"Operator {op} returns unsupported type {type_repr(value_type)} for a tile element")
+
+        return tile(dtype=value_type, shape=a.shape)
+
+    else:
+        return tile(dtype=a.dtype, shape=a.shape)
+
+
+def tile_unary_map_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
+    op = arg_values["op"]
+    tile_a = arg_values["a"]
+
+    overload = op.get_overload([tile_a.type.dtype], {})
+
+    # necessary, in case return type is different from input tile types
+    tile_r = Var(label=None, type=return_type)
+
+    return ((overload, tile_a, tile_r), ())


 add_builtin(
     "tile_map",
     input_types={"op": Callable, "a": tile(dtype=Scalar, shape=Tuple[int, ...])},
     value_func=tile_unary_map_value_func,
-
+    dispatch_func=tile_unary_map_dispatch_func,
     # variadic=True,
     native_func="tile_unary_map",
     doc="""Apply a unary function onto the tile.
@@ -3950,7 +4475,7 @@ add_builtin(

     :param op: A callable function that accepts one argument and returns one argument, may be a user function or builtin
     :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's data type
-    :returns: A tile with the same dimensions
+    :returns: A tile with the same dimensions as the input tile. Its datatype is specified by the return type of op

     Example:

@@ -3991,10 +4516,6 @@ def tile_binary_map_value_func(arg_types, arg_values):
     if not is_tile(b):
         raise TypeError(f"tile_map() 'b' argument must be a tile, got {b!r}")

-    # ensure types equal
-    if not types_equal(a.dtype, b.dtype):
-        raise TypeError(f"tile_map() arguments must have the same dtype, got {a.dtype} and {b.dtype}")
-
     if len(a.shape) != len(b.shape):
         raise ValueError(
             f"tile_map() shapes must have the same number of dimensions, got {len(a.shape)} and {len(b.shape)}"
@@ -4004,7 +4525,47 @@ def tile_binary_map_value_func(arg_types, arg_values):
         if a.shape[i] != b.shape[i]:
             raise ValueError(f"tile_map() shapes do not match on dimension {i}, got {a.shape} and {b.shape}")

-    return tile(dtype=a.dtype, shape=a.shape)
+    if "op" in arg_values:
+        op = arg_values["op"]
+        try:
+            overload = op.get_overload([a.dtype, b.dtype], {})
+        except KeyError as exc:
+            raise RuntimeError(
+                f"No overload of {op} found for tile element types {type_repr(a.dtype)}, {type_repr(b.dtype)}"
+            ) from exc
+
+        # build the right overload on demand
+        if overload.value_func is None:
+            overload.build(None)
+
+        value_type = overload.value_func(None, None)
+
+        if not type_is_scalar(value_type) and not type_is_vector(value_type) and not type_is_matrix(value_type):
+            raise TypeError(f"Operator {op} returns unsupported type {type_repr(value_type)} for a tile element")
+
+        return tile(dtype=value_type, shape=a.shape)
+
+    else:
+        # ensure types equal
+        if not types_equal(a.dtype, b.dtype):
+            raise TypeError(
+                f"tile_map() arguments must have the same dtype for this operation, got {a.dtype} and {b.dtype}"
+            )
+
+        return tile(dtype=a.dtype, shape=a.shape)
+
+
+def tile_binary_map_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
+    op = arg_values["op"]
+    tile_a = arg_values["a"]
+    tile_b = arg_values["b"]
+
+    overload = op.get_overload([tile_a.type.dtype, tile_b.type.dtype], {})
+
+    # necessary, in case return type is different from input tile types
+    tile_r = Var(label=None, type=return_type)
+
+    return ((overload, tile_a, tile_b, tile_r), ())


 add_builtin(
@@ -4015,18 +4576,18 @@ add_builtin(
         "b": tile(dtype=Scalar, shape=Tuple[int, ...]),
     },
     value_func=tile_binary_map_value_func,
-
+    dispatch_func=tile_binary_map_dispatch_func,
     # variadic=True,
     native_func="tile_binary_map",
     doc="""Apply a binary function onto the tile.

     This function cooperatively applies a binary function to each element of the tiles using all threads in the block.
-    Both input tiles must have the same dimensions and datatypes.
+    Both input tiles must have the same dimensions, and if using a builtin op, the same datatypes.

     :param op: A callable function that accepts two arguments and returns one argument, all of the same type, may be a user function or builtin
     :param a: The first input tile, the operator (or one of its overloads) must be able to accept the tile's dtype
     :param b: The second input tile, the operator (or one of its overloads) must be able to accept the tile's dtype
-    :returns: A tile with the same dimensions
+    :returns: A tile with the same dimensions as the input tiles. Its datatype is specified by the return type of op

     Example:

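With the dispatch functions added above, ``tile_map`` now takes the output tile's dtype from the operator's return type instead of requiring it to match the inputs. A hedged sketch of what that enables (the user function, shapes, and launch parameters are illustrative, not taken from the diff):

    import warp as wp

    @wp.func
    def signed_gap(a: float, b: float):
        # returns a vec2 built from two float inputs; the result tile adopts this dtype
        return wp.vec2(a - b, wp.abs(a - b))

    @wp.kernel
    def map_kernel(x: wp.array(dtype=float), y: wp.array(dtype=float), out: wp.array(dtype=wp.vec2)):
        a = wp.tile_load(x, shape=16)
        b = wp.tile_load(y, shape=16)
        r = wp.tile_map(signed_gap, a, b)  # tile of vec2
        wp.tile_store(out, r)

    x = wp.ones(16, dtype=float)
    y = wp.zeros(16, dtype=float)
    out = wp.zeros(16, dtype=wp.vec2)
    wp.launch_tiled(map_kernel, dim=1, inputs=[x, y], outputs=[out], block_dim=32)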
@@ -4104,6 +4665,7 @@ add_builtin(
     doc="WIP",
     group="Utility",
     hidden=True,
+    missing_grad=True,
 )

 add_builtin(
@@ -4119,6 +4681,7 @@ add_builtin(
     doc="WIP",
     group="Utility",
     hidden=True,
+    missing_grad=True,
 )

 add_builtin(
@@ -4128,6 +4691,7 @@ add_builtin(
     doc="WIP",
     group="Utility",
     hidden=True,
+    missing_grad=True,
 )

 add_builtin(
@@ -4179,6 +4743,7 @@ add_builtin(
     :param low: The lower bound of the bounding box in BVH space
     :param high: The upper bound of the bounding box in BVH space""",
     export=False,
+    missing_grad=True,
 )

 add_builtin(
@@ -4194,6 +4759,7 @@ add_builtin(
     :param start: The start of the ray in BVH space
     :param dir: The direction of the ray in BVH space""",
     export=False,
+    missing_grad=True,
 )

 add_builtin(
@@ -4204,6 +4770,7 @@ add_builtin(
     doc="""Move to the next bound returned by the query.
     The index of the current bound is stored in ``index``, returns ``False`` if there are no more overlapping bound.""",
     export=False,
+    missing_grad=True,
 )

 add_builtin(
@@ -4538,12 +5105,13 @@ add_builtin(
     group="Geometry",
     doc="""Construct an axis-aligned bounding box query against a :class:`Mesh`.

-    This query can be used to iterate over all triangles inside a volume.
+    This query can be used to iterate over all bounding boxes of the triangles inside a volume.

     :param id: The mesh identifier
     :param low: The lower bound of the bounding box in mesh space
     :param high: The upper bound of the bounding box in mesh space""",
     export=False,
+    missing_grad=True,
 )

 add_builtin(
@@ -4551,10 +5119,11 @@ add_builtin(
     input_types={"query": MeshQueryAABB, "index": int},
     value_type=builtins.bool,
     group="Geometry",
-    doc="""Move to the next triangle
+    doc="""Move to the next triangle whose bounding box overlaps the query bounding box.

     The index of the current face is stored in ``index``, returns ``False`` if there are no more overlapping triangles.""",
     export=False,
+    missing_grad=True,
 )

 add_builtin(
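The reworded documentation above makes explicit that the AABB query is a broad phase over triangle bounding boxes. A minimal sketch of how such a query is typically iterated (the mesh id, arrays, and counting logic are illustrative assumptions):

    import warp as wp

    @wp.kernel
    def count_aabb_candidates(
        mesh: wp.uint64,
        lowers: wp.array(dtype=wp.vec3),
        uppers: wp.array(dtype=wp.vec3),
        counts: wp.array(dtype=int),
    ):
        tid = wp.tid()
        # visits triangles whose bounding boxes overlap the query box;
        # an exact triangle/box test would still be needed to confirm overlap
        query = wp.mesh_query_aabb(mesh, lowers[tid], uppers[tid])
        face = int(0)
        n = int(0)
        while wp.mesh_query_aabb_next(query, face):
            n += 1
        counts[tid] = n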
@@ -4584,6 +5153,7 @@ add_builtin(

     This query can be used to iterate over all neighboring point within a fixed radius from the query point.""",
     export=False,
+    missing_grad=True,
 )

 add_builtin(
@@ -4595,6 +5165,7 @@ add_builtin(

     The index of the current neighbor is stored in ``index``, returns ``False`` if there are no more neighbors.""",
     export=False,
+    missing_grad=True,
 )

 add_builtin(
@@ -4608,6 +5179,7 @@ add_builtin(

     Returns -1 if the :class:`HashGrid` has not been reserved.""",
     export=False,
+    missing_grad=True,
 )

 add_builtin(
@@ -4619,6 +5191,7 @@ add_builtin(

     Returns > 0 if triangles intersect.""",
     export=False,
+    missing_grad=True,
 )

 add_builtin(
@@ -4638,6 +5211,7 @@ add_builtin(
     group="Geometry",
     doc="""Evaluates the face normal the mesh given a face index.""",
     export=False,
+    missing_grad=True,
 )

 add_builtin(
@@ -4647,6 +5221,7 @@ add_builtin(
     group="Geometry",
     doc="""Returns the point of the mesh given a index.""",
     export=False,
+    missing_grad=True,
 )

 add_builtin(
@@ -4656,6 +5231,7 @@ add_builtin(
     group="Geometry",
     doc="""Returns the velocity of the mesh given a index.""",
     export=False,
+    missing_grad=True,
 )

 add_builtin(
@@ -4665,6 +5241,7 @@ add_builtin(
     group="Geometry",
     doc="""Returns the point-index of the mesh given a face-vertex index.""",
     export=False,
+    missing_grad=True,
 )


@@ -4705,12 +5282,32 @@ add_builtin(
 # ---------------------------------
 # Iterators

-add_builtin("iter_next", input_types={"range": range_t}, value_type=int, group="Utility", export=False, hidden=True)
 add_builtin(
-    "iter_next",
+    "iter_next",
+    input_types={"range": range_t},
+    value_type=int,
+    group="Utility",
+    export=False,
+    hidden=True,
+    missing_grad=True,
+)
+add_builtin(
+    "iter_next",
+    input_types={"query": HashGridQuery},
+    value_type=int,
+    group="Utility",
+    export=False,
+    hidden=True,
+    missing_grad=True,
 )
 add_builtin(
-    "iter_next",
+    "iter_next",
+    input_types={"query": MeshQueryAABB},
+    value_type=int,
+    group="Utility",
+    export=False,
+    hidden=True,
+    missing_grad=True,
 )

 add_builtin(
@@ -4721,6 +5318,7 @@ add_builtin(
     group="Utility",
     doc="""Returns the range in reversed order.""",
     export=False,
+    missing_grad=True,
 )

 # ---------------------------------
@@ -4869,6 +5467,7 @@ add_builtin(
     doc="""Returns the value of voxel with coordinates ``i``, ``j``, ``k`` for a volume of type type `dtype`.

     If the voxel at this index does not exist, this function returns the background value.""",
+    missing_grad=True,
 )


@@ -4889,6 +5488,7 @@ add_builtin(
     export=False,
     group="Volumes",
     doc="""Store ``value`` at the voxel with coordinates ``i``, ``j``, ``k``.""",
+    missing_grad=True,
 )

 add_builtin(
@@ -4919,6 +5519,7 @@ add_builtin(
     doc="""Returns the value of voxel with coordinates ``i``, ``j``, ``k``.

     If the voxel at this index does not exist, this function returns the background value""",
+    missing_grad=True,
 )

 add_builtin(
@@ -4927,6 +5528,7 @@ add_builtin(
     group="Volumes",
     doc="""Store ``value`` at the voxel with coordinates ``i``, ``j``, ``k``.""",
     export=False,
+    missing_grad=True,
 )

 add_builtin(
@@ -4947,6 +5549,7 @@ add_builtin(
     doc="""Returns the vector value of voxel with coordinates ``i``, ``j``, ``k``.

     If the voxel at this index does not exist, this function returns the background value.""",
+    missing_grad=True,
 )

 add_builtin(
@@ -4955,6 +5558,7 @@ add_builtin(
     group="Volumes",
     doc="""Store ``value`` at the voxel with coordinates ``i``, ``j``, ``k``.""",
     export=False,
+    missing_grad=True,
 )

 add_builtin(
@@ -4973,6 +5577,7 @@ add_builtin(
     doc="""Returns the :class:`int32` value of voxel with coordinates ``i``, ``j``, ``k``.

     If the voxel at this index does not exist, this function returns the background value.""",
+    missing_grad=True,
 )

 add_builtin(
@@ -4981,6 +5586,7 @@ add_builtin(
     group="Volumes",
     doc="""Store ``value`` at the voxel with coordinates ``i``, ``j``, ``k``.""",
     export=False,
+    missing_grad=True,
 )


@@ -5062,6 +5668,7 @@ add_builtin(
|
|
|
5062
5668
|
If the voxel at this index does not exist, this function returns -1.
|
|
5063
5669
|
This function is available for both index grids and classical volumes.
|
|
5064
5670
|
""",
|
|
5671
|
+
missing_grad=True,
|
|
5065
5672
|
)
|
|
5066
5673
|
|
|
5067
5674
|
add_builtin(
|
|
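The volume hunks above only add `missing_grad=True` to the voxel lookup/store built-ins; their behavior is unchanged. A small sketch of the lookup/store pair described by those doc strings (float variants shown; the vector and int32 variants follow the same pattern):

import warp as wp

@wp.kernel
def scale_voxels(volume: wp.uint64, coords: wp.array(dtype=wp.vec3i), s: float):
    tid = wp.tid()
    ijk = coords[tid]
    # A voxel that does not exist returns the volume's background value.
    v = wp.volume_lookup_f(volume, ijk[0], ijk[1], ijk[2])
    wp.volume_store_f(volume, ijk[0], ijk[1], ijk[2], v * s)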
@@ -5103,6 +5710,7 @@ add_builtin(
     group="Random",
     doc="Initialize a new random number generator given a user-defined seed. Returns a 32-bit integer representing the RNG state.",
+    missing_grad=True,
 )
@@ -5114,6 +5722,7 @@ add_builtin(
     This alternative constructor can be useful in parallel programs, where a kernel as a whole should share a seed,
     but each thread should generate uncorrelated values. In this case usage should be ``r = rand_init(seed, tid)``""",
+    missing_grad=True,
 )
@@ -5122,6 +5731,7 @@ add_builtin(
     doc="Return a random integer in the range [-2^31, 2^31).",
+    missing_grad=True,
 )
@@ -5129,6 +5739,7 @@ add_builtin(
     doc="Return a random integer between [low, high).",
+    missing_grad=True,
 )
@@ -5136,6 +5747,7 @@ add_builtin(
     doc="Return a random unsigned integer in the range [0, 2^32).",
+    missing_grad=True,
 )
@@ -5143,6 +5755,7 @@ add_builtin(
     doc="Return a random unsigned integer between [low, high).",
+    missing_grad=True,
 )
@@ -5150,6 +5763,7 @@ add_builtin(
     doc="Return a random float between [0.0, 1.0).",
+    missing_grad=True,
 )
@@ -5157,6 +5771,7 @@ add_builtin(
     doc="Return a random float between [low, high).",
+    missing_grad=True,
 )
@@ -5164,6 +5779,7 @@ add_builtin(
     doc="Sample a normal (Gaussian) distribution of mean 0 and variance 1. ",
+    missing_grad=True,
 )
@@ -5172,6 +5788,7 @@ add_builtin(
     doc="Inverse-transform sample a cumulative distribution function.",
+    missing_grad=True,
 )
@@ -5179,6 +5796,7 @@ add_builtin(
     doc="Uniformly sample a triangle. Returns sample barycentric coordinates.",
+    missing_grad=True,
 )
@@ -5186,6 +5804,7 @@ add_builtin(
     doc="Uniformly sample a ring in the xy plane.",
+    missing_grad=True,
 )
@@ -5193,6 +5812,7 @@ add_builtin(
     doc="Uniformly sample a disk in the xy plane.",
+    missing_grad=True,
 )
@@ -5200,6 +5820,7 @@ add_builtin(
     doc="Uniformly sample a unit sphere surface.",
+    missing_grad=True,
 )
@@ -5207,6 +5828,7 @@ add_builtin(
     doc="Uniformly sample a unit sphere.",
+    missing_grad=True,
 )
@@ -5214,6 +5836,7 @@ add_builtin(
     doc="Uniformly sample a unit hemisphere surface.",
+    missing_grad=True,
 )
@@ -5221,6 +5844,7 @@ add_builtin(
     doc="Uniformly sample a unit hemisphere.",
+    missing_grad=True,
 )
@@ -5228,6 +5852,7 @@ add_builtin(
     doc="Uniformly sample a unit square.",
+    missing_grad=True,
 )
@@ -5235,6 +5860,7 @@ add_builtin(
     doc="Uniformly sample a unit cube.",
+    missing_grad=True,
 )
@@ -5246,6 +5872,7 @@ add_builtin(
     :param state: RNG state
     :param lam: The expected value of the distribution""",
+    missing_grad=True,
 )
 
 add_builtin(
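The doc string above already suggests the per-thread pattern ``r = rand_init(seed, tid)``; spelled out as a kernel:

import warp as wp

@wp.kernel
def jitter(points: wp.array(dtype=wp.vec3), seed: int):
    tid = wp.tid()
    state = wp.rand_init(seed, tid)  # one shared seed, one uncorrelated stream per thread
    points[tid] += wp.sample_unit_sphere(state) * 0.01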
@@ -5363,9 +5990,16 @@ add_builtin(
     dispatch_func=printf_dispatch_func,
     group="Utility",
     doc="Allows printing formatted strings using C-style format specifiers.",
+    missing_grad=True,
 )
 
-add_builtin(
+add_builtin(
+    "print",
+    input_types={"value": Any},
+    doc="Print variable to stdout",
+    export=False,
+    group="Utility",
+)
 
 add_builtin(
     "breakpoint",
@@ -5375,6 +6009,7 @@ add_builtin(
     group="Utility",
     namespace="",
     native_func="__debugbreak",
+    missing_grad=True,
 )
 
 # helpers
@@ -5392,6 +6027,7 @@ add_builtin(
     This function may not be called from user-defined Warp functions.""",
     namespace="",
     native_func="builtin_tid1d",
+    missing_grad=True,
 )
@@ -5402,6 +6038,7 @@ add_builtin(
     doc="Returns the number of threads in the current block.",
     namespace="",
     native_func="builtin_block_dim",
+    missing_grad=True,
 )
@@ -5416,6 +6053,7 @@ add_builtin(
     This function may not be called from user-defined Warp functions.""",
     namespace="",
     native_func="builtin_tid2d",
+    missing_grad=True,
 )
@@ -5430,6 +6068,7 @@ add_builtin(
     This function may not be called from user-defined Warp functions.""",
     namespace="",
     native_func="builtin_tid3d",
+    missing_grad=True,
 )
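The hunk above re-registers the generic `print` built-in alongside `printf`. A short sketch of both from kernel code:

import warp as wp

@wp.kernel
def debug_first(values: wp.array(dtype=float)):
    tid = wp.tid()
    if tid == 0:
        wp.printf("first value: %f\n", values[0])  # C-style format specifiers, as documented above
        print(values[0])                           # generic print of any supported value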
@@ -5444,17 +6083,37 @@ add_builtin(
     This function may not be called from user-defined Warp functions.""",
     namespace="",
     native_func="builtin_tid4d",
+    missing_grad=True,
 )
 
 
+def copy_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    a = arg_types["a"]
+
+    # if the input is a shared tile, we force a copy
+    if is_tile(a) and a.storage == "shared":
+        return tile(
+            dtype=a.dtype,
+            shape=a.shape,
+            storage=a.storage,
+            strides=a.strides,
+            layout=a.layout,
+            owner=True,
+        )
+
+    return a
+
+
 add_builtin(
     "copy",
     input_types={"a": Any},
-    value_func=
+    value_func=copy_value_func,
     hidden=True,
     export=False,
     group="Utility",
 )
+
+
 add_builtin(
     "assign",
     input_types={"dest": Any, "src": Any},
@@ -5464,6 +6123,37 @@ add_builtin(
 )
 
 
+def select_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    if arg_types is None:
+        return Any
+
+    v_true = arg_types["value_if_true"]
+    v_false = arg_types["value_if_false"]
+
+    if not types_equal(v_true, v_false):
+        raise RuntimeError(
+            f"select() true value type ({v_true}) must be of the same type as the false type ({v_false})"
+        )
+
+    if is_tile(v_false):
+        if v_true.storage == "register":
+            return v_true
+        if v_false.storage == "register":
+            return v_false
+
+        # both v_true and v_false are shared
+        return tile(
+            dtype=v_true.dtype,
+            shape=v_true.shape,
+            storage=v_true.storage,
+            strides=v_true.strides,
+            layout=v_true.layout,
+            owner=True,
+        )
+
+    return v_true
+
+
 def select_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
     warp.utils.warn(
         "wp.select() is deprecated and will be removed in a future\n"
@@ -5480,7 +6170,7 @@ def select_dispatch_func(input_types: Mapping[str, type], return_type: Any, args
 add_builtin(
     "select",
     input_types={"cond": builtins.bool, "value_if_false": Any, "value_if_true": Any},
-    value_func=
+    value_func=select_value_func,
     dispatch_func=select_dispatch_func,
     doc="""Select between two arguments, if ``cond`` is ``False`` then return ``value_if_false``, otherwise return ``value_if_true``.
 
@@ -5493,7 +6183,7 @@ for t in int_types:
     add_builtin(
         "select",
         input_types={"cond": t, "value_if_false": Any, "value_if_true": Any},
-        value_func=
+        value_func=select_value_func,
         dispatch_func=select_dispatch_func,
         doc="""Select between two arguments, if ``cond`` is ``False`` then return ``value_if_false``, otherwise return ``value_if_true``.
 
@@ -5505,7 +6195,7 @@ for t in int_types:
 add_builtin(
     "select",
     input_types={"arr": array(dtype=Any), "value_if_false": Any, "value_if_true": Any},
-    value_func=
+    value_func=select_value_func,
     dispatch_func=select_dispatch_func,
     doc="""Select between two arguments, if ``arr`` is null then return ``value_if_false``, otherwise return ``value_if_true``.
 
@@ -5515,10 +6205,40 @@ add_builtin(
     group="Utility",
 )
 
+
+def where_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    if arg_types is None:
+        return Any
+
+    v_true = arg_types["value_if_true"]
+    v_false = arg_types["value_if_false"]
+
+    if not types_equal(v_true, v_false):
+        raise RuntimeError(f"where() true value type ({v_true}) must be of the same type as the false type ({v_false})")
+
+    if is_tile(v_false):
+        if v_true.storage == "register":
+            return v_true
+        if v_false.storage == "register":
+            return v_false
+
+        # both v_true and v_false are shared
+        return tile(
+            dtype=v_true.dtype,
+            shape=v_true.shape,
+            storage=v_true.storage,
+            strides=v_true.strides,
+            layout=v_true.layout,
+            owner=True,
+        )
+
+    return v_true
+
+
 add_builtin(
     "where",
     input_types={"cond": builtins.bool, "value_if_true": Any, "value_if_false": Any},
-    value_func=
+    value_func=where_value_func,
     doc="Select between two arguments, if ``cond`` is ``True`` then return ``value_if_true``, otherwise return ``value_if_false``.",
     group="Utility",
 )
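As the deprecation warning above notes, `wp.select()` is superseded by `wp.where()`, and the two take their branch values in opposite order. A minimal sketch:

import warp as wp

@wp.kernel
def relu(x: wp.array(dtype=float), out: wp.array(dtype=float)):
    tid = wp.tid()
    v = x[tid]
    # wp.where(cond, value_if_true, value_if_false); the deprecated
    # wp.select(cond, value_if_false, value_if_true) reverses the last two arguments.
    out[tid] = wp.where(v > 0.0, v, 0.0)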
@@ -5526,14 +6246,14 @@ for t in int_types:
     add_builtin(
         "where",
         input_types={"cond": t, "value_if_true": Any, "value_if_false": Any},
-        value_func=
+        value_func=where_value_func,
         doc="Select between two arguments, if ``cond`` is ``True`` then return ``value_if_true``, otherwise return ``value_if_false``.",
         group="Utility",
     )
 add_builtin(
     "where",
     input_types={"arr": array(dtype=Any), "value_if_true": Any, "value_if_false": Any},
-    value_func=
+    value_func=where_value_func,
     doc="Select between two arguments, if ``arr`` is not null then return ``value_if_true``, otherwise return ``value_if_false``.",
     group="Utility",
 )
@@ -5544,7 +6264,7 @@ def array_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any
         return array(dtype=Scalar)
 
     dtype = arg_values["dtype"]
-    shape = extract_tuple(arg_values["shape"], as_constant=
+    shape = extract_tuple(arg_values["shape"], as_constant=False)
     return array(dtype=dtype, ndim=len(shape))
 
 
@@ -5554,7 +6274,7 @@ def array_dispatch_func(input_types: Mapping[str, type], return_type: Any, args
     # to the underlying C++ function's runtime and template params.
 
     dtype = return_type.dtype
-    shape = extract_tuple(args["shape"], as_constant=
+    shape = extract_tuple(args["shape"], as_constant=False)
 
     func_args = (args["ptr"], *shape)
     template_args = (dtype,)
@@ -5563,7 +6283,7 @@ def array_dispatch_func(input_types: Mapping[str, type], return_type: Any, args
 
 add_builtin(
     "array",
-    input_types={"ptr": warp.uint64, "shape": Tuple[int, ...], "dtype":
+    input_types={"ptr": warp.uint64, "shape": Tuple[int, ...], "dtype": Any},
     value_func=array_value_func,
     export_func=lambda input_types: {k: v for k, v in input_types.items() if k != "dtype"},
     dispatch_func=array_dispatch_func,
@@ -5575,6 +6295,48 @@ add_builtin(
 )
 
 
+def zeros_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    if arg_types is None:
+        return fixedarray(dtype=Scalar)
+
+    dtype = arg_values["dtype"]
+    shape = extract_tuple(arg_values["shape"], as_constant=True)
+
+    if None in shape:
+        raise RuntimeError("the `shape` argument must be specified as a constant when zero-initializing an array")
+
+    return fixedarray(dtype=dtype, shape=shape)
+
+
+def zeros_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    # We're in the codegen stage where we emit the code calling the built-in.
+    # Further validate the given argument values if needed and map them
+    # to the underlying C++ function's runtime and template params.
+
+    dtype = return_type.dtype
+    shape = extract_tuple(args["shape"], as_constant=True)
+
+    size = math.prod(shape)
+
+    func_args = shape
+    template_args = (size, dtype)
+    return (func_args, template_args)
+
+
+add_builtin(
+    "zeros",
+    input_types={"shape": Tuple[int, ...], "dtype": Any},
+    value_func=zeros_value_func,
+    export_func=lambda input_types: {},
+    dispatch_func=zeros_dispatch_func,
+    native_func="fixedarray_t",
+    group="Utility",
+    export=False,
+    missing_grad=True,
+    hidden=True,  # Unhide once we can document both a built-in and a Python scope function sharing the same name.
+)
+
+
 # does argument checking and type propagation for address()
 def address_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
     arr_type = arg_types["arr"]
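The new `zeros` built-in above creates a fixed-size array in kernel scope; its shape must be a compile-time constant, per the check in `zeros_value_func()`. A sketch under the assumption that the (currently hidden) built-in is reachable as `wp.zeros()` inside kernels:

import warp as wp

@wp.kernel
def local_scratch(out: wp.array(dtype=float)):
    tid = wp.tid()
    tmp = wp.zeros(shape=(4,), dtype=float)  # shape must be a constant; a runtime-sized shape is rejected
    for j in range(4):
        tmp[j] = float(j + tid)
    out[tid] = tmp[0] + tmp[3]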
@@ -5751,6 +6513,7 @@ add_builtin(
     hidden=True,
     skip_replay=True,
     group="Utility",
+    missing_grad=True,
 )
 
 
@@ -5767,6 +6530,7 @@ add_builtin(
     dispatch_func=load_dispatch_func,
     hidden=True,
     group="Utility",
+    missing_grad=True,
 )
 
 
@@ -5864,8 +6628,8 @@ def atomic_op_dispatch_func(input_types: Mapping[str, type], return_type: Any, a
 
 
 for array_type in array_types:
-    # don't list indexed array operations explicitly in docs
-    hidden = array_type
+    # don't list fixed or indexed array operations explicitly in docs
+    hidden = array_type in (indexedarray, fixedarray)
 
     add_builtin(
         "atomic_add",
@@ -6083,6 +6847,7 @@ for array_type in array_types:
         The operation is only atomic on a per-component basis for vectors and matrices.""",
         group="Utility",
         skip_replay=True,
+        missing_grad=True,
     )
     add_builtin(
         "atomic_cas",
@@ -6096,6 +6861,7 @@ for array_type in array_types:
         The operation is only atomic on a per-component basis for vectors and matrices.""",
         group="Utility",
         skip_replay=True,
+        missing_grad=True,
     )
     add_builtin(
         "atomic_cas",
@@ -6109,6 +6875,7 @@ for array_type in array_types:
         The operation is only atomic on a per-component basis for vectors and matrices.""",
         group="Utility",
         skip_replay=True,
+        missing_grad=True,
     )
     add_builtin(
         "atomic_cas",
@@ -6130,6 +6897,7 @@ for array_type in array_types:
         The operation is only atomic on a per-component basis for vectors and matrices.""",
         group="Utility",
         skip_replay=True,
+        missing_grad=True,
     )
 
     add_builtin(
@@ -6144,6 +6912,7 @@ for array_type in array_types:
         The operation is only atomic on a per-component basis for vectors and matrices.""",
         group="Utility",
         skip_replay=True,
+        missing_grad=True,
     )
     add_builtin(
         "atomic_exch",
@@ -6157,6 +6926,7 @@ for array_type in array_types:
         The operation is only atomic on a per-component basis for vectors and matrices.""",
         group="Utility",
         skip_replay=True,
+        missing_grad=True,
     )
     add_builtin(
         "atomic_exch",
@@ -6170,6 +6940,7 @@ for array_type in array_types:
         The operation is only atomic on a per-component basis for vectors and matrices.""",
         group="Utility",
         skip_replay=True,
+        missing_grad=True,
     )
     add_builtin(
         "atomic_exch",
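The atomic hunks above add `missing_grad=True` to the `atomic_cas` / `atomic_exch` overloads for every array type. For reference, the established `wp.atomic_add()` pattern, plus an exchange whose exact signature is assumed from the registrations above rather than stated in the diff:

import warp as wp

@wp.kernel
def histogram(samples: wp.array(dtype=wp.int32), bins: wp.array(dtype=wp.int32)):
    tid = wp.tid()
    wp.atomic_add(bins, samples[tid], 1)

@wp.kernel
def clear_flags(flags: wp.array(dtype=wp.int32), old: wp.array(dtype=wp.int32)):
    tid = wp.tid()
    old[tid] = wp.atomic_exch(flags, tid, 0)  # assumed (array, index, value) -> previous value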
@@ -6187,46 +6958,110 @@ for array_type in array_types:
 
 
 # used to index into builtin types, i.e.: y = vec3[1]
-def
-
+def vector_extract_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    vec_type = arg_types["a"]
+    idx_type = arg_types["i"]
+
+    if isinstance(idx_type, slice_t):
+        length = idx_type.get_length(vec_type._length_)
+        return vector(length=length, dtype=vec_type._wp_scalar_type_)
+
+    return vec_type._wp_scalar_type_
+
+
+def vector_extract_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    func_args = tuple(args.values())
+    template_args = getattr(return_type, "_shape_", ())
+    return (func_args, template_args)
 
 
 add_builtin(
     "extract",
-    input_types={"a": vector(length=Any, dtype=Scalar), "i":
-    value_func=
+    input_types={"a": vector(length=Any, dtype=Scalar), "i": Any},
+    value_func=vector_extract_value_func,
+    dispatch_func=vector_extract_dispatch_func,
+    export=False,
     hidden=True,
     group="Utility",
 )
 add_builtin(
     "extract",
-    input_types={"a": quaternion(dtype=Scalar), "i":
-    value_func=
+    input_types={"a": quaternion(dtype=Scalar), "i": Any},
+    value_func=vector_extract_value_func,
+    dispatch_func=vector_extract_dispatch_func,
+    export=False,
     hidden=True,
     group="Utility",
 )
-
 add_builtin(
     "extract",
-    input_types={"a":
-    value_func=
-
-
+    input_types={"a": transformation(dtype=Scalar), "i": Any},
+    value_func=vector_extract_value_func,
+    dispatch_func=vector_extract_dispatch_func,
+    export=False,
     hidden=True,
     group="Utility",
 )
+
+
+def matrix_extract_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    mat_type = arg_types["a"]
+    idx_types = tuple(arg_types[x] for x in "ij" if arg_types.get(x, None) is not None)
+
+    # Compute the resulting shape from the slicing, with -1 being simple indexing.
+    shape = tuple(
+        idx.get_length(mat_type._shape_[i]) if isinstance(idx, slice_t) else -1 for i, idx in enumerate(idx_types)
+    )
+
+    # Append any non indexed slice.
+    for i in range(len(idx_types), len(mat_type._shape_)):
+        shape += (mat_type._shape_[i],)
+
+    # Count how many dimensions the output value will have.
+    ndim = sum(1 for x in shape if x >= 0)
+
+    if ndim == 0:
+        return mat_type._wp_scalar_type_
+
+    assert shape[0] != -1 or shape[1] != -1
+
+    if ndim == 1:
+        length = shape[0] if shape[0] != -1 else shape[1]
+        return vector(length=length, dtype=mat_type._wp_scalar_type_)
+
+    assert ndim == 2
+
+    # When a matrix dimension is 0, all other dimensions are also expected to be 0.
+    if any(x == 0 for x in shape):
+        shape = (0,) * len(shape)
+
+    return matrix(shape=shape, dtype=mat_type._wp_scalar_type_)
+
+
+def matrix_extract_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    idx_types = tuple(args[x].type for x in "ij" if args.get(x, None) is not None)
+    has_slice = any(isinstance(x, slice_t) for x in idx_types)
+
+    func_args = tuple(args.values())
+    template_args = getattr(return_type, "_shape_", ()) if has_slice else ()
+    return (func_args, template_args)
+
+
 add_builtin(
     "extract",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
-    value_func=
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any},
+    value_func=matrix_extract_value_func,
+    dispatch_func=matrix_extract_dispatch_func,
+    export=False,
     hidden=True,
     group="Utility",
 )
-
 add_builtin(
     "extract",
-    input_types={"a":
-    value_func=
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "j": Any},
+    value_func=matrix_extract_value_func,
+    dispatch_func=matrix_extract_dispatch_func,
+    export=False,
     hidden=True,
     group="Utility",
 )
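The rewritten `extract` value and dispatch funcs above let a single index return a whole row and route slice indices through `slice_t`. A sketch (slice syntax support is inferred from the `slice_t` branches, not stated in the diff):

import warp as wp

@wp.kernel
def extract_demo(out: wp.array(dtype=wp.vec3)):
    tid = wp.tid()
    m = wp.mat33(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)
    row = m[1]      # single index on a matrix -> its row as a vec3
    elem = m[2, 0]  # (i, j) indexing -> scalar
    # e.g. m[0, 0:2] or v[1:3] would take the slice_t paths above
    out[tid] = row * elem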
@@ -6247,6 +7082,19 @@ def vector_index_dispatch_func(input_types: Mapping[str, type], return_type: Any
     return (func_args, template_args)
 
 
+def matrix_ij_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    mat_type = arg_types["a"]
+    value_type = mat_type._wp_scalar_type_
+
+    return Reference(value_type)
+
+
+def matrix_ij_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    func_args = (Reference(args["a"]), args["i"], args["j"])
+    template_args = ()
+    return (func_args, template_args)
+
+
 # implements &vector[index]
 add_builtin(
     "index",
@@ -6256,6 +7104,7 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    missing_grad=True,
 )
 # implements &quaternion[index]
 add_builtin(
@@ -6266,6 +7115,7 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    missing_grad=True,
 )
 # implements &transformation[index]
 add_builtin(
@@ -6276,6 +7126,7 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    missing_grad=True,
 )
 # implements &(*vector)[index]
 add_builtin(
@@ -6286,6 +7137,18 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    missing_grad=True,
+)
+# implements &(*matrix)[i, j]
+add_builtin(
+    "indexref",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "j": int},
+    value_func=matrix_ij_value_func,
+    dispatch_func=matrix_ij_dispatch_func,
+    hidden=True,
+    group="Utility",
+    skip_replay=True,
+    missing_grad=True,
 )
 # implements &(*quaternion)[index]
 add_builtin(
@@ -6296,6 +7159,7 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    missing_grad=True,
 )
 # implements &(*transformation)[index]
 add_builtin(
@@ -6306,14 +7170,50 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    missing_grad=True,
 )
 
 
+def vector_assign_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    vec = args["a"].type
+    idx = args["i"].type
+    value_type = strip_reference(args["value"].type)
+
+    if isinstance(idx, slice_t):
+        length = idx.get_length(vec._length_)
+
+        if type_is_vector(value_type):
+            if not types_equal(value_type._wp_scalar_type_, vec._wp_scalar_type_):
+                raise ValueError(
+                    f"The provided vector is expected to be of length {length} with dtype {type_repr(vec._wp_scalar_type_)}."
+                )
+            if value_type._length_ != length:
+                raise ValueError(
+                    f"The length of the provided vector ({args['value'].type._length_}) isn't compatible with the given slice (expected {length})."
+                )
+            template_args = (length,)
+        else:
+            # Disallow broadcasting.
+            raise ValueError(
+                f"The provided value is expected to be a vector of length {length}, with dtype {type_repr(vec._wp_scalar_type_)}."
+            )
+    else:
+        if not types_equal(value_type, vec._wp_scalar_type_):
+            raise ValueError(
+                f"The provided value is expected to be a scalar of type {type_repr(vec._wp_scalar_type_)}."
+            )
+        template_args = ()
+
+    func_args = tuple(args.values())
+    return (func_args, template_args)
+
+
 # implements vector[index] = value
 add_builtin(
     "assign_inplace",
-    input_types={"a": vector(length=Any, dtype=Scalar), "i":
+    input_types={"a": vector(length=Any, dtype=Scalar), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
@@ -6322,8 +7222,9 @@ add_builtin(
 # implements quaternion[index] = value
 add_builtin(
     "assign_inplace",
-    input_types={"a": quaternion(dtype=Scalar), "i":
+    input_types={"a": quaternion(dtype=Scalar), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
@@ -6331,15 +7232,16 @@ add_builtin(
 # implements transformation[index] = value
 add_builtin(
     "assign_inplace",
-    input_types={"a": transformation(dtype=Scalar), "i":
+    input_types={"a": transformation(dtype=Scalar), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
 )
 
 
-def
+def vector_assign_copy_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
     vec_type = arg_types["a"]
     return vec_type
 
@@ -6347,8 +7249,9 @@ def vector_assign_value_func(arg_types: Mapping[str, type], arg_values: Mapping[
 # implements vector[index] = value, performs a copy internally if wp.config.enable_vector_component_overwrites is True
 add_builtin(
     "assign_copy",
-    input_types={"a": vector(length=Any, dtype=Scalar), "i":
-    value_func=
+    input_types={"a": vector(length=Any, dtype=Scalar), "i": Any, "value": Any},
+    value_func=vector_assign_copy_value_func,
+    dispatch_func=vector_assign_dispatch_func,
@@ -6357,8 +7260,9 @@ add_builtin(
 # implements quaternion[index] = value, performs a copy internally if wp.config.enable_vector_component_overwrites is True
 add_builtin(
     "assign_copy",
-    input_types={"a": quaternion(dtype=Scalar), "i":
-    value_func=
+    input_types={"a": quaternion(dtype=Scalar), "i": Any, "value": Any},
+    value_func=vector_assign_copy_value_func,
+    dispatch_func=vector_assign_dispatch_func,
@@ -6367,8 +7271,9 @@ add_builtin(
 # implements transformation[index] = value, performs a copy internally if wp.config.enable_vector_component_overwrites is True
 add_builtin(
     "assign_copy",
-    input_types={"a": transformation(dtype=Scalar), "i":
-    value_func=
+    input_types={"a": transformation(dtype=Scalar), "i": Any, "value": Any},
+    value_func=vector_assign_copy_value_func,
+    dispatch_func=vector_assign_dispatch_func,
@@ -6377,8 +7282,9 @@ add_builtin(
 # implements vector[idx] += scalar
 add_builtin(
     "add_inplace",
-    input_types={"a": vector(length=Any, dtype=Scalar), "i":
+    input_types={"a": vector(length=Any, dtype=Scalar), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
@@ -6387,8 +7293,9 @@ add_builtin(
 # implements quaternion[idx] += scalar
 add_builtin(
     "add_inplace",
-    input_types={"a": quaternion(dtype=Scalar), "i":
+    input_types={"a": quaternion(dtype=Scalar), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
@@ -6397,8 +7304,9 @@ add_builtin(
 # implements transformation[idx] += scalar
 add_builtin(
     "add_inplace",
-    input_types={"a": transformation(dtype=Float), "i":
+    input_types={"a": transformation(dtype=Float), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
@@ -6417,8 +7325,9 @@ add_builtin(
 # implements vector[idx] -= scalar
 add_builtin(
     "sub_inplace",
-    input_types={"a": vector(length=Any, dtype=Scalar), "i":
+    input_types={"a": vector(length=Any, dtype=Scalar), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
@@ -6427,8 +7336,9 @@ add_builtin(
 # implements quaternion[idx] -= scalar
 add_builtin(
     "sub_inplace",
-    input_types={"a": quaternion(dtype=Scalar), "i":
+    input_types={"a": quaternion(dtype=Scalar), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
@@ -6437,8 +7347,9 @@ add_builtin(
 # implements transformation[idx] -= scalar
 add_builtin(
     "sub_inplace",
-    input_types={"a": transformation(dtype=
+    input_types={"a": transformation(dtype=Float), "i": Any, "value": Any},
     value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
@@ -6470,6 +7381,7 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    missing_grad=True,
 )
@@ -6488,6 +7400,7 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    missing_grad=True,
 )
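The `assign_inplace` / `add_inplace` / `sub_inplace` registrations above are what component writes on vectors, quaternions, and transforms lower to:

import warp as wp

@wp.kernel
def set_components(out: wp.array(dtype=wp.vec3)):
    tid = wp.tid()
    v = wp.vec3()
    v[0] = 1.0    # assign_inplace
    v[1] += 2.0   # add_inplace
    v[2] -= 0.5   # sub_inplace
    # with wp.config.enable_vector_component_overwrites = True the assign_copy
    # variants above are used instead, copying the vector before the write
    out[tid] = v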
@@ -6499,61 +7412,154 @@ def matrix_vector_sametype(arg_types: Mapping[str, Any]):
     return mat_size == vec_size and mat_type == vec_type
 
 
-
+def matrix_assign_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    mat = args["a"].type
+    value_type = strip_reference(args["value"].type)
+
+    idxs = tuple(args[x].type for x in "ij" if args.get(x, None) is not None)
+    has_slice = any(isinstance(x, slice_t) for x in idxs)
+
+    if has_slice:
+        # Compute the resulting shape from the slicing, with -1 being simple indexing.
+        shape = tuple(idx.get_length(mat._shape_[i]) if isinstance(idx, slice_t) else -1 for i, idx in enumerate(idxs))
+
+        # Append any non indexed slice.
+        for i in range(len(idxs), len(mat._shape_)):
+            shape += (mat._shape_[i],)
+
+        # Count how many dimensions the output value will have.
+        ndim = sum(1 for x in shape if x >= 0)
+        assert ndim > 0
+
+        if ndim == 1:
+            length = shape[0] if shape[0] != -1 else shape[1]
+
+            if type_is_vector(value_type):
+                if not types_equal(value_type._wp_scalar_type_, mat._wp_scalar_type_):
+                    raise ValueError(
+                        f"The provided vector is expected to be of length {length} with dtype {type_repr(mat._wp_scalar_type_)}."
+                    )
+
+                if value_type._length_ != length:
+                    raise ValueError(
+                        f"The length of the provided vector ({value_type._length_}) isn't compatible with the given slice (expected {length})."
+                    )
+
+                template_args = (length,)
+            else:
+                # Disallow broadcasting.
+                raise ValueError(
+                    f"The provided value is expected to be a vector of length {length}, with dtype {type_repr(mat._wp_scalar_type_)}."
+                )
+        else:
+            assert ndim == 2
+
+            # When a matrix dimension is 0, all other dimensions are also expected to be 0.
+            if any(x == 0 for x in shape):
+                shape = (0,) * len(shape)
+
+            if type_is_matrix(value_type):
+                if not types_equal(value_type._wp_scalar_type_, mat._wp_scalar_type_):
+                    raise ValueError(
+                        f"The provided matrix is expected to be of shape {shape} with dtype {type_repr(mat._wp_scalar_type_)}."
+                    )
+
+                if value_type._shape_ != shape:
+                    raise ValueError(
+                        f"The shape of the provided matrix ({value_type._shape_}) isn't compatible with the given slice (expected {shape})."
+                    )
+
+                template_args = shape
+            else:
+                # Disallow broadcasting.
+                raise ValueError(
+                    f"The provided value is expected to be a matrix of shape {shape}, with dtype {type_repr(mat._wp_scalar_type_)}."
+                )
+    elif len(idxs) == 1:
+        if not type_is_vector(value_type) or not types_equal(value_type._wp_scalar_type_, mat._wp_scalar_type_):
+            raise ValueError(
+                f"The provided value is expected to be a vector of length {mat._shape_[1]}, with dtype {type_repr(mat._wp_scalar_type_)}."
+            )
+
+        if value_type._length_ != mat._shape_[1]:
+            raise ValueError(
+                f"The length of the provided vector ({value_type._length_}) isn't compatible with the given slice (expected {mat._shape_[1]})."
+            )
+
+        template_args = ()
+    elif len(idxs) == 2:
+        if not types_equal(value_type, mat._wp_scalar_type_):
+            raise ValueError(
+                f"The provided value is expected to be a scalar of type {type_repr(mat._wp_scalar_type_)}."
+            )
+
+        template_args = ()
+    else:
+        raise AssertionError
+
+    func_args = tuple(args.values())
+    return (func_args, template_args)
+
+
+# implements matrix[i] = value
 add_builtin(
     "assign_inplace",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "value": Any},
+    constraint=matrix_vector_sametype,
     value_type=None,
+    dispatch_func=matrix_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
 )
 
 
-# implements matrix[i] =
+# implements matrix[i,j] = value
 add_builtin(
     "assign_inplace",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
-    constraint=matrix_vector_sametype,
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "j": Any, "value": Any},
     value_type=None,
+    dispatch_func=matrix_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
 )
 
 
-def
+def matrix_assign_copy_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
     mat_type = arg_types["a"]
     return mat_type
 
 
-# implements matrix[i
+# implements matrix[i] = value
 add_builtin(
     "assign_copy",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
-    value_func=
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "value": Any},
+    value_func=matrix_assign_copy_value_func,
+    dispatch_func=matrix_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
 )
 
 
-# implements matrix[i] =
+# implements matrix[i,j] = value
 add_builtin(
     "assign_copy",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
-
-
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "j": Any, "value": Any},
+    value_func=matrix_assign_copy_value_func,
+    dispatch_func=matrix_assign_dispatch_func,
     hidden=True,
     export=False,
     group="Utility",
 )
 
 
-# implements matrix[i
+# implements matrix[i] += value
 add_builtin(
     "add_inplace",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "value": Any},
+    constraint=matrix_vector_sametype,
     value_type=None,
     hidden=True,
     export=False,
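matrix_assign_dispatch_func() above validates row, element, and slice writes on matrices; in kernel code those correspond to:

import warp as wp

@wp.kernel
def set_rows(out: wp.array(dtype=wp.mat33)):
    tid = wp.tid()
    m = wp.mat33()
    m[0] = wp.vec3(1.0, 2.0, 3.0)   # matrix[i] = value: must be a length-3 vector of matching dtype
    m[1, 1] = 5.0                   # matrix[i, j] = value: scalar element write
    m[2] += wp.vec3(0.0, 0.0, 1.0)  # matrix[i] += value
    out[tid] = m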
@@ -6561,11 +7567,10 @@ add_builtin(
 )
 
 
-# implements matrix[i] +=
+# implements matrix[i,j] += value
 add_builtin(
     "add_inplace",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
-    constraint=matrix_vector_sametype,
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "j": Any, "value": Any},
     value_type=None,
     hidden=True,
     export=False,
@@ -6573,10 +7578,10 @@ add_builtin(
 )
 
 
-# implements matrix[i
+# implements matrix[i] -= value
 add_builtin(
     "sub_inplace",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "value": Any},
     value_type=None,
     hidden=True,
     export=False,
@@ -6584,10 +7589,10 @@ add_builtin(
 )
 
 
-# implements matrix[i] -=
+# implements matrix[i,j] -= value
 add_builtin(
     "sub_inplace",
-    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i":
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "j": Any, "value": Any},
     value_type=None,
     hidden=True,
     export=False,
@@ -6606,6 +7611,7 @@ for t in scalar_types + vector_types + (bool,):
     doc="Prints an error to stdout if ``a`` and ``b`` are not equal",
     group="Utility",
     hidden=True,
+    missing_grad=True,
 )
 
 add_builtin(
@@ -6616,6 +7622,7 @@ for t in scalar_types + vector_types + (bool,):
     group="Utility",
     hidden=True,
     export=False,
+    missing_grad=True,
 )
 
 
@@ -6634,6 +7641,7 @@ add_builtin(
     doc="Prints an error to stdout if ``a`` and ``b`` are not equal",
     group="Utility",
     hidden=True,
+    missing_grad=True,
 )
 add_builtin(
     "expect_neq",
@@ -6644,6 +7652,7 @@ add_builtin(
     group="Utility",
     hidden=True,
     export=False,
+    missing_grad=True,
 )
@@ -6654,6 +7663,7 @@ add_builtin(
     doc="Prints an error to stdout if ``a`` and ``b`` are not equal",
     group="Utility",
     hidden=True,
+    missing_grad=True,
 )
 add_builtin(
     "expect_neq",
@@ -6664,6 +7674,7 @@ add_builtin(
     group="Utility",
     hidden=True,
     export=False,
+    missing_grad=True,
 )
@@ -6754,6 +7765,7 @@ add_builtin(
     value_type=None,
     doc="Prints an error to stdout if ``a`` and ``b`` are not closer than tolerance in magnitude",
     group="Utility",
+    missing_grad=True,
 )
 add_builtin(
     "expect_near",
@@ -6763,6 +7775,7 @@ add_builtin(
     value_type=None,
     doc="Prints an error to stdout if any element of ``a`` and ``b`` are not closer than tolerance in magnitude",
     group="Utility",
+    missing_grad=True,
 )
 add_builtin(
     "expect_near",
@@ -6772,6 +7785,7 @@ add_builtin(
     value_type=None,
     doc="Prints an error to stdout if any element of ``a`` and ``b`` are not closer than tolerance in magnitude",
     group="Utility",
+    missing_grad=True,
 )
 add_builtin(
     "expect_near",
@@ -6785,6 +7799,7 @@ add_builtin(
     value_type=None,
     doc="Prints an error to stdout if any element of ``a`` and ``b`` are not closer than tolerance in magnitude",
     group="Utility",
+    missing_grad=True,
 )
 
 # ---------------------------------
@@ -6795,6 +7810,7 @@ add_builtin(
     input_types={"arr": array(dtype=Scalar), "value": Scalar},
     value_type=int,
     doc="Search a sorted array ``arr`` for the closest element greater than or equal to ``value``.",
+    missing_grad=True,
 )
 
 add_builtin(
@@ -6802,11 +7818,13 @@ add_builtin(
     input_types={"arr": array(dtype=Scalar), "arr_begin": int, "arr_end": int, "value": Scalar},
     value_type=int,
     doc="Search a sorted array ``arr`` in the range [arr_begin, arr_end) for the closest element greater than or equal to ``value``.",
+    missing_grad=True,
 )
 
 # ---------------------------------
 # Operators
 
+
 add_builtin(
     "add", input_types={"a": Scalar, "b": Scalar}, value_func=sametypes_create_value_func(Scalar), group="Operators"
 )
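The two `lower_bound` overloads above gain `missing_grad=True`; usage is unchanged:

import warp as wp

@wp.kernel
def bucket(edges: wp.array(dtype=float), x: wp.array(dtype=float), out: wp.array(dtype=wp.int32)):
    tid = wp.tid()
    out[tid] = wp.lower_bound(edges, x[tid])  # index of the first element >= x[tid] in the sorted array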
@@ -6876,13 +7894,36 @@ add_builtin(
 )
 
 # bitwise operators
-add_builtin(
-
-
-
-
-
-
+add_builtin(
+    "bit_and",
+    input_types={"a": Int, "b": Int},
+    value_func=sametypes_create_value_func(Int),
+    group="Operators",
+    missing_grad=True,
+)
+add_builtin(
+    "bit_or",
+    input_types={"a": Int, "b": Int},
+    value_func=sametypes_create_value_func(Int),
+    group="Operators",
+    missing_grad=True,
+)
+add_builtin(
+    "bit_xor",
+    input_types={"a": Int, "b": Int},
+    value_func=sametypes_create_value_func(Int),
+    group="Operators",
+    missing_grad=True,
+)
+add_builtin("lshift", input_types={"a": Int, "b": Int}, value_func=sametypes_create_value_func(Int), group="Operators")
+add_builtin(
+    "rshift",
+    input_types={"a": Int, "b": Int},
+    value_func=sametypes_create_value_func(Int),
+    group="Operators",
+    missing_grad=True,
+)
+add_builtin("invert", input_types={"a": Int}, value_func=sametypes_create_value_func(Int), group="Operators")
 
 add_builtin(
     "mul", input_types={"a": Scalar, "b": Scalar}, value_func=sametypes_create_value_func(Scalar), group="Operators"
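The expanded registrations above back the integer bitwise built-ins (`bit_and`, `bit_or`, `bit_xor`, `lshift`, `rshift`, `invert`), several of them now flagged `missing_grad`. In kernel code they are reached through the usual Python operators:

import warp as wp

@wp.kernel
def pack16(a: wp.array(dtype=wp.uint32), b: wp.array(dtype=wp.uint32), out: wp.array(dtype=wp.uint32)):
    tid = wp.tid()
    lo = a[tid] & wp.uint32(0xFFFF)
    hi = (b[tid] & wp.uint32(0xFFFF)) << wp.uint32(16)
    out[tid] = hi | lo  # ^, >>, and ~ map to bit_xor, rshift, and invert in the same way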
@@ -7079,9 +8120,10 @@ add_builtin(
     "mod",
     input_types={"a": vector(length=Any, dtype=Scalar), "b": vector(length=Any, dtype=Scalar)},
     constraint=sametypes,
-    value_func=sametypes_create_value_func(Scalar),
+    value_func=sametypes_create_value_func(vector(length=Any, dtype=Scalar)),
     doc="Modulo operation using truncated division.",
     group="Operators",
+    missing_grad=True,
 )
 
 add_builtin(
@@ -7141,6 +8183,7 @@ add_builtin(
     value_func=sametypes_create_value_func(Scalar),
     doc="",
     group="Operators",
+    missing_grad=True,
 )
 
 add_builtin("pos", input_types={"x": Scalar}, value_func=sametypes_create_value_func(Scalar), group="Operators")
@@ -7188,12 +8231,16 @@ add_builtin(
     group="Operators",
 )
 
-add_builtin(
+add_builtin(
+    "unot", input_types={"a": builtins.bool}, value_type=builtins.bool, doc="", group="Operators", missing_grad=True
+)
 for t in int_types:
-    add_builtin("unot", input_types={"a": t}, value_type=builtins.bool, doc="", group="Operators")
+    add_builtin("unot", input_types={"a": t}, value_type=builtins.bool, doc="", group="Operators", missing_grad=True)
 
 
-add_builtin(
+add_builtin(
+    "unot", input_types={"a": array(dtype=Any)}, value_type=builtins.bool, doc="", group="Operators", missing_grad=True
+)
 
 
 # Tile operators
@@ -7387,6 +8434,7 @@ add_builtin(
|
|
|
7387
8434
|
doc="Add a square matrix and a diagonal matrix 'd' represented as a 1D tile",
|
|
7388
8435
|
group="Tile Primitives",
|
|
7389
8436
|
export=False,
|
|
8437
|
+
missing_grad=True,
|
|
7390
8438
|
)
|
|
7391
8439
|
|
|
7392
8440
|
|
|
@@ -7481,7 +8529,7 @@ def tile_matmul_lto_dispatch_func(
     num_threads = options["block_dim"]
     arch = options["output_arch"]
 
-    if arch is None or not warp.context.runtime.core.
+    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ((0, 0, 0, a, b, out), template_args, [], 0)
     else:
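The tile dispatch functions in this file all gate their MathDx/LTO code path behind the same check on the native core, wp_is_mathdx_enabled(); otherwise they return the generic CPU/no-MathDx dispatch tuple. A simplified sketch of the guard, using a hypothetical helper name for illustration only:

def _use_mathdx(arch, core):
    # take the MathDx LTO path only when a target arch is known
    # and the native runtime reports MathDx support
    return arch is not None and core.wp_is_mathdx_enabled()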
@@ -7671,7 +8719,7 @@ def tile_fft_generic_lto_dispatch_func(
     arch = options["output_arch"]
     ept = size // num_threads
 
-    if arch is None or not warp.context.runtime.core.
+    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ([], [], [], 0)
     else:

@@ -7714,6 +8762,7 @@ add_builtin
     group="Tile Primitives",
     export=False,
     namespace="",
+    missing_grad=True,
 )
 
 add_builtin(

@@ -7735,6 +8784,7 @@ add_builtin
     group="Tile Primitives",
     export=False,
     namespace="",
+    missing_grad=True,
 )
 
 

@@ -7792,28 +8842,27 @@ def tile_cholesky_generic_lto_dispatch_func(
         raise TypeError("tile_cholesky() returns one output")
     out = return_values[0]
 
-    dtype, precision_enum = cusolver_type_map[a.type.dtype]
-
     # We already ensured a is square in tile_cholesky_generic_value_func()
     M, N = a.type.shape
     if out.type.shape[0] != M or out.type.shape[1] != M:
         raise ValueError("tile_cholesky() output tile must be square")
 
-    solver = "potrf"
-    solver_enum = cusolver_function_map[solver]
-
-    side_enum = cusolver_side_map["-"]
-    diag_enum = cusolver_diag_map["-"]
-    fill_mode = cusolver_fill_mode_map["lower"]
-
     arch = options["output_arch"]
-    num_threads = options["block_dim"]
-    parameter_list = f"({dtype}*, int*)"
 
-    if arch is None or not warp.context.runtime.core.
+    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ((0, a, out), [], [], 0)
     else:
+        solver = "potrf"
+        solver_enum = cusolver_function_map[solver]
+        side_enum = cusolver_side_map["-"]
+        diag_enum = cusolver_diag_map["-"]
+        fill_mode = cusolver_fill_mode_map["lower"]
+        dtype, precision_enum = cusolver_type_map[a.type.dtype]
+        num_threads = options["block_dim"]
+        parameter_list = f"({dtype}*, int*)"
+        req_smem_bytes = a.type.size * type_size_in_bytes(a.type.dtype)
+
         # generate the LTO
         lto_symbol, lto_code_data = warp.build.build_lto_solver(
             M,

@@ -7831,6 +8880,7 @@ def tile_cholesky_generic_lto_dispatch_func(
             num_threads,
             parameter_list,
             builder,
+            smem_estimate_bytes=req_smem_bytes,
         )
 
         return ((Var(lto_symbol, str, False, True, False), a, out), [], [lto_code_data], 0)
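The new req_smem_bytes value passed to build_lto_solver() as smem_estimate_bytes is plain arithmetic: the number of tile elements involved times the element size. A worked example with illustrative figures only (the 64x64 shape is an assumption, not taken from this diff):

# tile_cholesky on a 64x64 float64 factor tile:
tile_elems = 64 * 64       # a.type.size
elem_bytes = 8             # type_size_in_bytes(float64)
req_smem_bytes = tile_elems * elem_bytes  # 32768 bytes of shared memory estimated for the LTO solver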
@@ -7859,6 +8909,7 @@ add_builtin
     group="Tile Primitives",
     export=False,
     namespace="",
+    missing_grad=True,
 )
 
 

@@ -7918,9 +8969,7 @@ def tile_cholesky_solve_generic_lto_dispatch_func(
     if any(T not in cusolver_type_map.keys() for T in [y.type.dtype, L.type.dtype]):
         raise TypeError("tile_cholesky_solve() arguments be tiles of float64 or float32")
 
-    dtype, precision_enum = cusolver_type_map[L.type.dtype]
     M, N = L.type.shape
-    NRHS = x.type.shape[1] if len(x.type.shape) > 1 else 1
 
     if len(x.type.shape) > 2 or len(x.type.shape) < 1:
         raise TypeError(f"tile_cholesky_solve() output vector must be 1D or 2D, got {len(x.type.shape)}-D")

@@ -7931,21 +8980,23 @@ def tile_cholesky_solve_generic_lto_dispatch_func(
             f"got {x.type.shape[0]} elements in output and {M} rows in 'L'"
         )
 
-    solver = "potrs"
-    solver_enum = cusolver_function_map[solver]
-
-    side_enum = cusolver_side_map["-"]
-    diag_enum = cusolver_diag_map["-"]
-    fill_mode = cusolver_fill_mode_map["lower"]
-
     arch = options["output_arch"]
-    num_threads = options["block_dim"]
-    parameter_list = f"({dtype}*, {dtype}*)"
 
-    if arch is None or not warp.context.runtime.core.
+    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ((0, L, y, x), [], [], 0)
     else:
+        NRHS = x.type.shape[1] if len(x.type.shape) > 1 else 1
+        solver = "potrs"
+        solver_enum = cusolver_function_map[solver]
+        side_enum = cusolver_side_map["-"]
+        diag_enum = cusolver_diag_map["-"]
+        fill_mode = cusolver_fill_mode_map["lower"]
+        dtype, precision_enum = cusolver_type_map[L.type.dtype]
+        num_threads = options["block_dim"]
+        parameter_list = f"({dtype}*, {dtype}*)"
+        req_smem_bytes = (x.type.size + y.type.size + L.type.size) * type_size_in_bytes(L.type.dtype)
+
         # generate the LTO
         lto_symbol, lto_code_data = warp.build.build_lto_solver(
             M,

@@ -7963,6 +9014,7 @@ def tile_cholesky_solve_generic_lto_dispatch_func(
             num_threads,
             parameter_list,
             builder,
+            smem_estimate_bytes=req_smem_bytes,
         )
 
         return ((Var(lto_symbol, str, False, True, False), L, y, x), [], [lto_code_data], 0)
@@ -7988,6 +9040,7 @@ add_builtin
     group="Tile Primitives",
     export=False,
     namespace="",
+    missing_grad=True,
 )
 
 

@@ -8013,9 +9066,7 @@ def tile_lower_solve_generic_lto_dispatch_func(
 
     z = return_values[0]
 
-    dtype, precision_enum = cusolver_type_map[L.type.dtype]
     M, N = L.type.shape
-    NRHS = z.type.shape[1] if len(z.type.shape) > 1 else 1
 
     if len(z.type.shape) > 2 or len(z.type.shape) < 1:
         raise TypeError(f"tile_lower_solve() output vector must be 1D or 2D, got {len(z.type.shape)}-D")

@@ -8026,21 +9077,23 @@ def tile_lower_solve_generic_lto_dispatch_func(
             f"got {z.type.shape[0]} elements in output and {M} rows in 'L'"
         )
 
-    solver = "trsm"
-    solver_enum = cusolver_function_map[solver]
-
-    side_enum = cusolver_side_map["left"]
-    diag_enum = cusolver_diag_map["nounit"]
-    fill_mode = cusolver_fill_mode_map["lower"]
-
     arch = options["output_arch"]
-    num_threads = options["block_dim"]
-    parameter_list = f"({dtype}*, {dtype}*)"
 
-    if arch is None or not warp.context.runtime.core.
+    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ((0, L, y, z), [], [], 0)
     else:
+        NRHS = z.type.shape[1] if len(z.type.shape) > 1 else 1
+        solver = "trsm"
+        solver_enum = cusolver_function_map[solver]
+        side_enum = cusolver_side_map["left"]
+        diag_enum = cusolver_diag_map["nounit"]
+        fill_mode = cusolver_fill_mode_map["lower"]
+        dtype, precision_enum = cusolver_type_map[L.type.dtype]
+        num_threads = options["block_dim"]
+        parameter_list = f"({dtype}*, {dtype}*)"
+        req_smem_bytes = (z.type.size + y.type.size + L.type.size) * type_size_in_bytes(L.type.dtype)
+
         # generate the LTO
         lto_symbol, lto_code_data = warp.build.build_lto_solver(
             M,

@@ -8058,6 +9111,7 @@ def tile_lower_solve_generic_lto_dispatch_func(
             num_threads,
             parameter_list,
             builder,
+            smem_estimate_bytes=req_smem_bytes,
         )
 
         return ((Var(lto_symbol, str, False, True, False), L, y, z), [], [lto_code_data], 0)
@@ -8119,6 +9173,7 @@ add_builtin
     group="Tile Primitives",
     export=False,
     namespace="",
+    missing_grad=True,
 )
 
 

@@ -8144,9 +9199,7 @@ def tile_upper_solve_generic_lto_dispatch_func(
 
     x = return_values[0]
 
-    dtype, precision_enum = cusolver_type_map[U.type.dtype]
     M, N = U.type.shape
-    NRHS = x.type.shape[1] if len(x.type.shape) > 1 else 1
 
     if len(z.type.shape) > 2 or len(z.type.shape) < 1:
         raise TypeError(f"tile_upper_solve() output tile must be 1D or 2D, got {len(z.type.shape)}-D")

@@ -8157,21 +9210,23 @@ def tile_upper_solve_generic_lto_dispatch_func(
             f"got {z.type.shape[0]} elements in output and {M} rows in 'U'"
         )
 
-    solver = "trsm"
-    solver_enum = cusolver_function_map[solver]
-
-    side_enum = cusolver_side_map["left"]
-    diag_enum = cusolver_diag_map["nounit"]
-    fill_mode = cusolver_fill_mode_map["upper"]
-
     arch = options["output_arch"]
-    num_threads = options["block_dim"]
-    parameter_list = f"({dtype}*, {dtype}*)"
 
-    if arch is None or not warp.context.runtime.core.
+    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ((0, U, z, x), [], [], 0)
     else:
+        NRHS = x.type.shape[1] if len(x.type.shape) > 1 else 1
+        solver = "trsm"
+        solver_enum = cusolver_function_map[solver]
+        side_enum = cusolver_side_map["left"]
+        diag_enum = cusolver_diag_map["nounit"]
+        fill_mode = cusolver_fill_mode_map["upper"]
+        dtype, precision_enum = cusolver_type_map[U.type.dtype]
+        num_threads = options["block_dim"]
+        parameter_list = f"({dtype}*, {dtype}*)"
+        req_smem_bytes = (x.type.size + z.type.size + U.type.size) * type_size_in_bytes(U.type.dtype)
+
         # generate the LTO
         lto_symbol, lto_code_data = warp.build.build_lto_solver(
             M,

@@ -8189,6 +9244,7 @@ def tile_upper_solve_generic_lto_dispatch_func(
             num_threads,
             parameter_list,
             builder,
+            smem_estimate_bytes=req_smem_bytes,
        )
 
         return ((Var(lto_symbol, str, False, True, False), U, z, x), [], [lto_code_data], 0)
@@ -8250,6 +9306,7 @@ add_builtin
     group="Tile Primitives",
     export=False,
     namespace="",
+    missing_grad=True,
 )
 
 

@@ -8269,6 +9326,7 @@ add_builtin
     The return type of the expression must be either a Warp function, a string, or a type that is supported inside Warp kernels and functions
     (excluding Warp arrays since they cannot be created in a Warp kernel at the moment).""",
     group="Code Generation",
+    missing_grad=True,
 )
 
 

@@ -8293,6 +9351,7 @@ add_builtin
     doc="Return the number of elements in a vector.",
     group="Utility",
     export=False,
+    missing_grad=True,
 )
 
 add_builtin(

@@ -8302,6 +9361,7 @@ add_builtin
     doc="Return the number of elements in a quaternion.",
     group="Utility",
     export=False,
+    missing_grad=True,
 )
 
 add_builtin(

@@ -8311,6 +9371,7 @@ add_builtin
     doc="Return the number of rows in a matrix.",
     group="Utility",
     export=False,
+    missing_grad=True,
 )
 
 add_builtin(

@@ -8320,6 +9381,7 @@ add_builtin
     doc="Return the number of elements in a transformation.",
     group="Utility",
     export=False,
+    missing_grad=True,
 )
 
 add_builtin(

@@ -8329,6 +9391,7 @@ add_builtin
     doc="Return the size of the first dimension in an array.",
     group="Utility",
     export=False,
+    missing_grad=True,
 )
 
 add_builtin(

@@ -8338,6 +9401,7 @@ add_builtin
     doc="Return the number of rows in a tile.",
     group="Utility",
     export=False,
+    missing_grad=True,
 )
 
 

@@ -8412,4 +9476,24 @@ add_builtin
     doc="Return the number of elements in a tuple.",
     group="Utility",
     export=False,
+    missing_grad=True,
+)
+
+
+# ---------------------------------
+# Slicing
+
+
+def slice_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    return slice_t(**arg_values)
+
+
+add_builtin(
+    "slice",
+    input_types={"start": int, "stop": int, "step": int},
+    value_func=slice_value_func,
+    native_func="slice_t",
+    export=False,
+    group="Utility",
+    hidden=True,
+    missing_grad=True,
 )
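The new hidden slice builtin forwards its compile-time start/stop/step arguments into Warp's native slice_t type via slice_value_func(). Semantically this mirrors Python's own slice object; a short sketch in standard Python for illustration only, not taken from this diff:

# a call resolved with start=0, stop=8, step=2 carries the same fields as Python's
s = slice(0, 8, 2)
print(s.start, s.stop, s.step)  # 0 8 2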