PyPI - warp-lang - Versions diffs - 1.7.2__py3-none-manylinux_2_34_aarch64.whl → 1.8.0__py3-none-manylinux_2_34_aarch64.whl - Mend

warp-lang 1.7.2__py3-none-manylinux_2_34_aarch64.whl → 1.8.0__py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (180)

warp/__init__.py +3 -1
warp/__init__.pyi +3489 -1
warp/autograd.py +45 -122
warp/bin/warp.so +0 -0
warp/build.py +241 -252
warp/build_dll.py +125 -26
warp/builtins.py +1907 -384
warp/codegen.py +257 -101
warp/config.py +12 -1
warp/constants.py +1 -1
warp/context.py +657 -223
warp/dlpack.py +1 -1
warp/examples/benchmarks/benchmark_cloth.py +2 -2
warp/examples/benchmarks/benchmark_tile_sort.py +155 -0
warp/examples/core/example_sample_mesh.py +1 -1
warp/examples/core/example_spin_lock.py +93 -0
warp/examples/core/example_work_queue.py +118 -0
warp/examples/fem/example_adaptive_grid.py +5 -5
warp/examples/fem/example_apic_fluid.py +1 -1
warp/examples/fem/example_burgers.py +1 -1
warp/examples/fem/example_convection_diffusion.py +9 -6
warp/examples/fem/example_darcy_ls_optimization.py +489 -0
warp/examples/fem/example_deformed_geometry.py +1 -1
warp/examples/fem/example_diffusion.py +2 -2
warp/examples/fem/example_diffusion_3d.py +1 -1
warp/examples/fem/example_distortion_energy.py +1 -1
warp/examples/fem/example_elastic_shape_optimization.py +387 -0
warp/examples/fem/example_magnetostatics.py +5 -3
warp/examples/fem/example_mixed_elasticity.py +5 -3
warp/examples/fem/example_navier_stokes.py +11 -9
warp/examples/fem/example_nonconforming_contact.py +5 -3
warp/examples/fem/example_streamlines.py +8 -3
warp/examples/fem/utils.py +9 -8
warp/examples/interop/example_jax_ffi_callback.py +2 -2
warp/examples/optim/example_drone.py +1 -1
warp/examples/sim/example_cloth.py +1 -1
warp/examples/sim/example_cloth_self_contact.py +48 -54
warp/examples/tile/example_tile_block_cholesky.py +502 -0
warp/examples/tile/example_tile_cholesky.py +2 -1
warp/examples/tile/example_tile_convolution.py +1 -1
warp/examples/tile/example_tile_filtering.py +1 -1
warp/examples/tile/example_tile_matmul.py +1 -1
warp/examples/tile/example_tile_mlp.py +2 -0
warp/fabric.py +7 -7
warp/fem/__init__.py +5 -0
warp/fem/adaptivity.py +1 -1
warp/fem/cache.py +152 -63
warp/fem/dirichlet.py +2 -2
warp/fem/domain.py +136 -6
warp/fem/field/field.py +141 -99
warp/fem/field/nodal_field.py +85 -39
warp/fem/field/virtual.py +97 -52
warp/fem/geometry/adaptive_nanogrid.py +91 -86
warp/fem/geometry/closest_point.py +13 -0
warp/fem/geometry/deformed_geometry.py +102 -40
warp/fem/geometry/element.py +56 -2
warp/fem/geometry/geometry.py +323 -22
warp/fem/geometry/grid_2d.py +157 -62
warp/fem/geometry/grid_3d.py +116 -20
warp/fem/geometry/hexmesh.py +86 -20
warp/fem/geometry/nanogrid.py +166 -86
warp/fem/geometry/partition.py +59 -25
warp/fem/geometry/quadmesh.py +86 -135
warp/fem/geometry/tetmesh.py +47 -119
warp/fem/geometry/trimesh.py +77 -270
warp/fem/integrate.py +107 -52
warp/fem/linalg.py +25 -58
warp/fem/operator.py +124 -27
warp/fem/quadrature/pic_quadrature.py +36 -14
warp/fem/quadrature/quadrature.py +40 -16
warp/fem/space/__init__.py +1 -1
warp/fem/space/basis_function_space.py +66 -46
warp/fem/space/basis_space.py +17 -4
warp/fem/space/dof_mapper.py +1 -1
warp/fem/space/function_space.py +2 -2
warp/fem/space/grid_2d_function_space.py +4 -1
warp/fem/space/hexmesh_function_space.py +4 -2
warp/fem/space/nanogrid_function_space.py +3 -1
warp/fem/space/partition.py +11 -2
warp/fem/space/quadmesh_function_space.py +4 -1
warp/fem/space/restriction.py +5 -2
warp/fem/space/shape/__init__.py +10 -8
warp/fem/space/tetmesh_function_space.py +4 -1
warp/fem/space/topology.py +52 -21
warp/fem/space/trimesh_function_space.py +4 -1
warp/fem/utils.py +53 -8
warp/jax.py +1 -2
warp/jax_experimental/ffi.py +12 -17
warp/jax_experimental/xla_ffi.py +37 -24
warp/math.py +171 -1
warp/native/array.h +99 -0
warp/native/builtin.h +174 -31
warp/native/coloring.cpp +1 -1
warp/native/exports.h +118 -63
warp/native/intersect.h +3 -3
warp/native/mat.h +5 -10
warp/native/mathdx.cpp +11 -5
warp/native/matnn.h +1 -123
warp/native/quat.h +28 -4
warp/native/sparse.cpp +121 -258
warp/native/sparse.cu +181 -274
warp/native/spatial.h +305 -17
warp/native/tile.h +583 -72
warp/native/tile_radix_sort.h +1108 -0
warp/native/tile_reduce.h +237 -2
warp/native/tile_scan.h +240 -0
warp/native/tuple.h +189 -0
warp/native/vec.h +6 -16
warp/native/warp.cpp +36 -4
warp/native/warp.cu +574 -51
warp/native/warp.h +47 -74
warp/optim/linear.py +5 -1
warp/paddle.py +7 -8
warp/py.typed +0 -0
warp/render/render_opengl.py +58 -29
warp/render/render_usd.py +124 -61
warp/sim/__init__.py +9 -0
warp/sim/collide.py +252 -78
warp/sim/graph_coloring.py +8 -1
warp/sim/import_mjcf.py +4 -3
warp/sim/import_usd.py +11 -7
warp/sim/integrator.py +5 -2
warp/sim/integrator_euler.py +1 -1
warp/sim/integrator_featherstone.py +1 -1
warp/sim/integrator_vbd.py +751 -320
warp/sim/integrator_xpbd.py +1 -1
warp/sim/model.py +265 -260
warp/sim/utils.py +10 -7
warp/sparse.py +303 -166
warp/tape.py +52 -51
warp/tests/cuda/test_conditional_captures.py +1046 -0
warp/tests/cuda/test_streams.py +1 -1
warp/tests/geometry/test_volume.py +2 -2
warp/tests/interop/test_dlpack.py +9 -9
warp/tests/interop/test_jax.py +0 -1
warp/tests/run_coverage_serial.py +1 -1
warp/tests/sim/disabled_kinematics.py +2 -2
warp/tests/sim/{test_vbd.py → test_cloth.py} +296 -113
warp/tests/sim/test_collision.py +159 -51
warp/tests/sim/test_coloring.py +15 -1
warp/tests/test_array.py +254 -2
warp/tests/test_array_reduce.py +2 -2
warp/tests/test_atomic_cas.py +299 -0
warp/tests/test_codegen.py +142 -19
warp/tests/test_conditional.py +47 -1
warp/tests/test_ctypes.py +0 -20
warp/tests/test_devices.py +8 -0
warp/tests/test_fabricarray.py +4 -2
warp/tests/test_fem.py +58 -25
warp/tests/test_func.py +42 -1
warp/tests/test_grad.py +1 -1
warp/tests/test_lerp.py +1 -3
warp/tests/test_map.py +481 -0
warp/tests/test_mat.py +1 -24
warp/tests/test_quat.py +6 -15
warp/tests/test_rounding.py +10 -38
warp/tests/test_runlength_encode.py +7 -7
warp/tests/test_smoothstep.py +1 -1
warp/tests/test_sparse.py +51 -2
warp/tests/test_spatial.py +507 -1
warp/tests/test_struct.py +2 -2
warp/tests/test_tuple.py +265 -0
warp/tests/test_types.py +2 -2
warp/tests/test_utils.py +24 -18
warp/tests/tile/test_tile.py +420 -1
warp/tests/tile/test_tile_mathdx.py +518 -14
warp/tests/tile/test_tile_reduce.py +213 -0
warp/tests/tile/test_tile_shared_memory.py +130 -1
warp/tests/tile/test_tile_sort.py +117 -0
warp/tests/unittest_suites.py +4 -6
warp/types.py +462 -308
warp/utils.py +647 -86
{warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/METADATA +20 -6
{warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/RECORD +177 -165
warp/stubs.py +0 -3381
warp/tests/sim/test_xpbd.py +0 -399
warp/tests/test_mlp.py +0 -282
{warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/WHEEL +0 -0
{warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/licenses/LICENSE.md +0 -0
{warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/top_level.txt +0 -0

warp/builtins.py CHANGED

@@ -13,13 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 import builtins
 import functools
 from typing import Any, Callable, Mapping, Sequence
 import warp.build
 import warp.context
-from warp.codegen import Reference, Var, strip_reference
+from warp.codegen import Reference, Var, get_arg_value, strip_reference
 from warp.types import *
 from .context import add_builtin
@@ -55,6 +57,33 @@ def sametypes_create_value_func(default: TypeVar):
     return fn
+def extract_tuple(arg, as_constant=False):
+    if isinstance(arg, Var):
+        if isinstance(arg.type, warp.types.tuple_t):
+            out = arg.type.values
+        else:
+            out = (arg,)
+    elif isinstance(arg, warp.types.tuple_t):
+        out = arg.values
+    elif not isinstance(arg, Sequence):
+        out = (arg,)
+    else:
+        out = arg
+    if as_constant:
+        return tuple(x.constant if isinstance(x, Var) else x for x in out)
+    return out
+def static_len_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    if arg_types is None:
+        return int
+    length = warp.types.type_length(arg_types["a"])
+    return Var(None, type=int, constant=length)
 # ---------------------------------
 # Scalar Math
@@ -399,7 +428,7 @@ add_builtin(
 )
-def scalar_infer_type(arg_types: Union[Mapping[str, type], Tuple[type, ...], None]):
+def scalar_infer_type(arg_types: Mapping[str, type] | tuple[type, ...] | None):
     if arg_types is None:
         return Scalar
@@ -1155,6 +1184,11 @@ add_builtin(
 def matrix_transform_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    warp.utils.warn(
+        "the built-in `wp.matrix()` function to construct a 4x4 matrix from a 3D position, quaternion, "
+        "and 3D scale vector will be deprecated in favor of `wp.transform_compose()`.",
+        DeprecationWarning,
+    )
     if arg_types is None:
         return matrix(shape=(4, 4), dtype=Float)
@@ -1204,21 +1238,47 @@ add_builtin(
     dispatch_func=matrix_transform_dispatch_func,
     native_func="mat_t",
     doc="""Construct a 4x4 transformation matrix that applies the transformations as
-    Translation(pos)*Rotation(rot)*Scaling(scale) when applied to column vectors, i.e.: y = (TRS)*x""",
+    Translation(pos)*Rotation(rot)*Scaling(scale) when applied to column vectors, i.e.: y = (TRS)*x
+    .. warning::
+       This function has been deprecated in favor of :func:`warp.math.transform_compose()`.""",
     group="Vector Math",
     export=False,
 )
-# not making these functions available outside kernels (export=False) as they
-# return data via references, which we don't currently support:
+def svd3_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    if arg_types is None:
+        return (
+            matrix(shape=(3, 3), dtype=Float),
+            vector(length=3, dtype=Float),
+            matrix(shape=(3, 3), dtype=Float),
+        )
+    dtype = arg_types["A"]._wp_scalar_type_
+    return (
+        matrix(shape=(3, 3), dtype=dtype),
+        vector(length=3, dtype=dtype),
+        matrix(shape=(3, 3), dtype=dtype),
+    )
+add_builtin(
+    "svd3",
+    input_types={"A": matrix(shape=(3, 3), dtype=Float)},
+    value_func=svd3_value_func,
+    group="Vector Math",
+    doc="""Compute the SVD of a 3x3 matrix ``A``. The singular values are returned in ``sigma``,
+    while the left and right basis vectors are returned in ``U`` and ``V``.""",
+)
 add_builtin(
     "svd3",
     input_types={
         "A": matrix(shape=(3, 3), dtype=Float),
         "U": matrix(shape=(3, 3), dtype=Float),
         "sigma": vector(length=3, dtype=Float),
-        "V": matrix(shape=(3, 3), dtype=Scalar),
+        "V": matrix(shape=(3, 3), dtype=Float),
     },
     value_type=None,
     group="Vector Math",
@@ -1227,13 +1287,39 @@ add_builtin(
     while the left and right basis vectors are returned in ``U`` and ``V``.""",
 )
+def svd2_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    if arg_types is None:
+        return (
+            matrix(shape=(2, 2), dtype=Float),
+            vector(length=2, dtype=Float),
+            matrix(shape=(2, 2), dtype=Float),
+        )
+    dtype = arg_types["A"]._wp_scalar_type_
+    return (
+        matrix(shape=(2, 2), dtype=dtype),
+        vector(length=2, dtype=dtype),
+        matrix(shape=(2, 2), dtype=dtype),
+    )
+add_builtin(
+    "svd2",
+    input_types={"A": matrix(shape=(2, 2), dtype=Float)},
+    value_func=svd2_value_func,
+    group="Vector Math",
+    doc="""Compute the SVD of a 2x2 matrix ``A``. The singular values are returned in ``sigma``,
+    while the left and right basis vectors are returned in ``U`` and ``V``.""",
+)
 add_builtin(
     "svd2",
     input_types={
         "A": matrix(shape=(2, 2), dtype=Float),
         "U": matrix(shape=(2, 2), dtype=Float),
         "sigma": vector(length=2, dtype=Float),
-        "V": matrix(shape=(2, 2), dtype=Scalar),
+        "V": matrix(shape=(2, 2), dtype=Float),
     },
     value_type=None,
     group="Vector Math",
@@ -1242,6 +1328,30 @@ add_builtin(
     while the left and right basis vectors are returned in ``U`` and ``V``.""",
 )
+def qr3_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    if arg_types is None:
+        return (
+            matrix(shape=(3, 3), dtype=Float),
+            matrix(shape=(3, 3), dtype=Float),
+        )
+    dtype = arg_types["A"]._wp_scalar_type_
+    return (
+        matrix(shape=(3, 3), dtype=dtype),
+        matrix(shape=(3, 3), dtype=dtype),
+    )
+add_builtin(
+    "qr3",
+    input_types={"A": matrix(shape=(3, 3), dtype=Float)},
+    value_func=qr3_value_func,
+    group="Vector Math",
+    doc="""Compute the QR decomposition of a 3x3 matrix ``A``. The orthogonal matrix is returned in ``Q``,
+    while the upper triangular matrix is returned in ``R``.""",
+)
 add_builtin(
     "qr3",
     input_types={
@@ -1256,6 +1366,27 @@ add_builtin(
     while the upper triangular matrix is returned in ``R``.""",
 )
+def eig3_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    if arg_types is None:
+        return (matrix(shape=(3, 3), dtype=Float), vector(length=3, dtype=Float))
+    dtype = arg_types["A"]._wp_scalar_type_
+    return (
+        matrix(shape=(3, 3), dtype=dtype),
+        vector(length=3, dtype=dtype),
+    )
+add_builtin(
+    "eig3",
+    input_types={"A": matrix(shape=(3, 3), dtype=Float)},
+    value_func=eig3_value_func,
+    group="Vector Math",
+    doc="""Compute the eigendecomposition of a 3x3 matrix ``A``. The eigenvectors are returned as the columns of ``Q``,
+    while the corresponding eigenvalues are returned in ``d``.""",
+)
 add_builtin(
     "eig3",
     input_types={
@@ -1422,13 +1553,34 @@ add_builtin(
     group="Quaternion Math",
     doc="Construct a quaternion representing a rotation of angle radians around the given axis.",
 )
+def quat_to_axis_angle_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    if arg_types is None:
+        return (vector(length=3, dtype=Float), Float)
+    dtype = arg_types["quat"]._wp_scalar_type_
+    return (vector(length=3, dtype=dtype), dtype)
+add_builtin(
+    "quat_to_axis_angle",
+    input_types={"quat": quaternion(dtype=Float)},
+    value_func=quat_to_axis_angle_value_func,
+    group="Quaternion Math",
+    doc="Extract the rotation axis and angle radians a quaternion represents.",
+)
 add_builtin(
     "quat_to_axis_angle",
     input_types={"quat": quaternion(dtype=Float), "axis": vector(length=3, dtype=Float), "angle": Float},
     value_type=None,
     group="Quaternion Math",
     doc="Extract the rotation axis and angle radians a quaternion represents.",
+    export=False,
 )
 add_builtin(
     "quat_from_matrix",
     input_types={"mat": matrix(shape=(3, 3), dtype=Float)},
@@ -1506,6 +1658,48 @@ def transformation_value_func(arg_types: Mapping[str, type], arg_values: Mapping
     if arg_types is None:
         return transformation(dtype=Float)
+    dtype = arg_values.get("dtype", None)
+    variadic_arg_types = arg_types.get("args", ())
+    variadic_arg_count = len(variadic_arg_types)
+    if variadic_arg_count == 0:
+        # Zero-initialization, e.g.: `wp.transform()`, `wp.transformation(dtype=wp.float16)`.
+        if dtype is None:
+            dtype = float32
+    elif variadic_arg_count == 1:
+        # Initialization by filling a value, e.g.: `wp.transform(123)`,
+        # `wp.transformation(123)`.
+        value_type = strip_reference(variadic_arg_types[0])
+        if dtype is None:
+            dtype = value_type
+        elif not warp.types.scalars_equal(value_type, dtype):
+            raise RuntimeError(
+                f"the value used to fill this transform is expected to be of the type `{dtype.__name__}`"
+            )
+    elif variadic_arg_count == 7:
+        # Initializing by value, e.g.: `wp.transform(1, 2, 3, 4, 5, 6, 7)`.
+        try:
+            value_type = scalar_infer_type(variadic_arg_types)
+        except RuntimeError:
+            raise RuntimeError("all values given when constructing a transform must have the same type") from None
+        if dtype is None:
+            dtype = value_type
+        elif not warp.types.scalars_equal(value_type, dtype):
+            raise RuntimeError(
+                f"all values used to initialize this transform are expected to be of the type `{dtype.__name__}`"
+            )
+    if dtype is None:
+        raise RuntimeError("could not infer the `dtype` argument when calling the `wp.transform()` function")
+    return transformation(dtype=dtype)
+def transformation_pq_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    if arg_types is None:
+        return transformation(dtype=Float)
     try:
         value_type = float_infer_type(arg_types)
     except RuntimeError:
@@ -1540,20 +1734,35 @@ def transformation_dispatch_func(input_types: Mapping[str, type], return_type: A
 add_builtin(
     "transformation",
-    input_types={"pos": vector(length=3, dtype=Float), "rot": quaternion(dtype=Float), "dtype": Float},
+    input_types={"p": vector(length=3, dtype=Float), "q": quaternion(dtype=Float), "dtype": Float},
     defaults={"dtype": None},
-    value_func=transformation_value_func,
+    value_func=transformation_pq_value_func,
     export_func=lambda input_types: {k: v for k, v in input_types.items() if k != "dtype"},
     dispatch_func=transformation_dispatch_func,
     native_func="transform_t",
     group="Transformations",
-    doc="Construct a rigid-body transformation with translation part ``pos`` and rotation ``rot``.",
+    doc="Construct a rigid-body transformation with translation part ``p`` and rotation ``q``.",
+    export=False,
+)
+add_builtin(
+    "transformation",
+    input_types={"*args": Float, "dtype": Float},
+    defaults={"dtype": None},
+    variadic=True,
+    initializer_list_func=lambda arg_types, arg_values: len(arg_types.get("args", ())) > 1,
+    value_func=transformation_value_func,
+    export_func=lambda input_types: {k: v for k, v in input_types.items() if k not in ("dtype")},
+    dispatch_func=transformation_dispatch_func,
+    native_func="transform_t",
+    doc="Construct a spatial transform vector of given dtype.",
+    group="Spatial Math",
     export=False,
 )
 def transform_identity_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
-    # if arg_types is None then we are in 'export' mode
     if arg_types is None:
         # return transformation(dtype=Float)
         return transformf
@@ -1600,6 +1809,40 @@ add_builtin(
     group="Transformations",
     doc="Return the rotational part of a transform ``xform``.",
 )
+add_builtin(
+    "transform_set_translation",
+    input_types={"xform": transformation(dtype=Float), "p": vector(length=3, dtype=Float)},
+    value_type=None,
+    group="Transformations",
+    doc="Set the translational part of a transform ``xform``.",
+)
+add_builtin(
+    "transform_set_rotation",
+    input_types={"xform": transformation(dtype=Float), "q": quaternion(dtype=Float)},
+    value_type=None,
+    group="Transformations",
+    doc="Set the rotational part of a transform ``xform``.",
+)
+# performs a copy internally if wp.config.enable_vector_component_overwrites is True
+add_builtin(
+    "transform_set_translation_copy",
+    input_types={"xform": transformation(dtype=Float), "p": vector(length=3, dtype=Float)},
+    value_type=transformation(dtype=Float),
+    group="Transformations",
+    doc="Set the translational part of a transform ``xform``.",
+    hidden=True,
+    export=False,
+)
+# performs a copy internally if wp.config.enable_vector_component_overwrites is True
+add_builtin(
+    "transform_set_rotation_copy",
+    input_types={"xform": transformation(dtype=Float), "q": quaternion(dtype=Float)},
+    value_type=transformation(dtype=Float),
+    group="Transformations",
+    doc="Set the rotational part of a transform ``xform``.",
+    hidden=True,
+    export=False,
+)
 add_builtin(
     "transform_multiply",
     input_types={"a": transformation(dtype=Float), "b": transformation(dtype=Float)},
@@ -1831,40 +2074,15 @@ add_builtin(
 # Tile-based primitives
-def tile_unpack_shape(arg_values):
-    shape = arg_values["shape"]
-    if not isinstance(shape, tuple):
-        # promote to tuple
-        shape = (shape,)
-    # check that components are constants
-    for d in shape:
-        if d is None:
-            raise ValueError("Tile functions require shape to be a compile time constant.")
-    return shape
-def tile_unpack_offset(arg_values, ndim=0):
-    if "offset" in arg_values:
-        offset = arg_values["offset"]
-    else:
-        offset = (0,) * ndim
-    if isinstance(offset, tuple):
-        return offset
-    else:
-        # promote to tuple
-        return (offset,)
 def tile_zeros_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
     # return generic type (for doc builds)
     if arg_types is None:
-        return Tile(dtype=Any, shape=Any)
+        return tile(dtype=Any, shape=Tuple[int, ...])
+    shape = extract_tuple(arg_values["shape"], as_constant=True)
-    shape = tile_unpack_shape(arg_values)
+    if None in shape:
+        raise ValueError("Tile functions require shape to be a compile time constant.")
     if "dtype" not in arg_values:
         raise TypeError("tile_zeros() missing required keyword argument 'dtype'")
@@ -1877,17 +2095,20 @@ def tile_zeros_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str
     dtype = arg_values["dtype"]
-    return TileZeros(dtype=dtype, shape=shape, storage=arg_values["storage"])
+    return tile(dtype=dtype, shape=shape, storage=arg_values["storage"])
 def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
-    shape = tile_unpack_shape(arg_values)
+    shape = extract_tuple(arg_values["shape"], as_constant=True)
+    if None in shape:
+        raise ValueError("Tile functions require shape to be a compile time constant.")
     dtype = arg_values["dtype"]
     template_args = []
     template_args.append(dtype)
-    for d in shape:
-        template_args.append(d.constant)
+    template_args.extend(shape)
     return ([], template_args)
@@ -1929,9 +2150,12 @@ add_builtin(
 def tile_ones_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
     # return generic type (for doc builds)
     if arg_types is None:
-        return Tile(dtype=Any, shape=Any)
+        return tile(dtype=Any, shape=Tuple[int, ...])
-    shape = tile_unpack_shape(arg_values)
+    shape = extract_tuple(arg_values["shape"], as_constant=True)
+    if None in shape:
+        raise ValueError("Tile functions require shape to be a compile time constant.")
     if "dtype" not in arg_values:
         raise TypeError("tile_ones() missing required keyword argument 'dtype'")
@@ -1944,17 +2168,20 @@ def tile_ones_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str,
     dtype = arg_values["dtype"]
-    return TileOnes(dtype=dtype, shape=shape, storage=arg_values["storage"])
+    return tile(dtype=dtype, shape=shape, storage=arg_values["storage"])
 def tile_ones_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
-    shape = tile_unpack_shape(arg_values)
+    shape = extract_tuple(arg_values["shape"], as_constant=True)
+    if None in shape:
+        raise ValueError("Tile functions require shape to be a compile time constant.")
     dtype = arg_values["dtype"]
     template_args = []
     template_args.append(dtype)
-    for d in shape:
-        template_args.append(d.constant)
+    template_args.extend(shape)
     return ([], template_args)
@@ -1994,7 +2221,7 @@ add_builtin(
 def tile_arange_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
     # return generic type (for doc builds)
     if arg_types is None:
-        return Tile(dtype=Any, shape=Any)
+        return tile(dtype=Scalar, shape=Tuple[int])
     if "args" not in arg_values:
         raise TypeError("tile_arange() requires at least one positional argument specifying the range")
@@ -2029,7 +2256,8 @@ def tile_arange_value_func(arg_types: Mapping[str, type], arg_values: Mapping[st
     if arg_values["storage"] not in {"shared", "register"}:
         raise ValueError(f"Invalid value for 'storage': {arg_values['storage']!r}. Expected 'shared' or 'register'.")
-    return TileRange(dtype=dtype, start=start, stop=stop, step=step, storage=arg_values["storage"])
+    n = int((stop - start) / step)
+    return tile(dtype=dtype, shape=(n,), storage=arg_values["storage"])
 def tile_arange_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
@@ -2045,13 +2273,13 @@ def tile_arange_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a
     args = arg_values["args"]
     if len(args) == 1:
-        start = warp.codegen.Var(label=None, type=return_type.dtype, constant=return_type.start)
+        start = warp.codegen.Var(label=None, type=return_type.dtype, constant=0)
         stop = args[0]
-        step = warp.codegen.Var(label=None, type=return_type.dtype, constant=return_type.step)
+        step = warp.codegen.Var(label=None, type=return_type.dtype, constant=1)
     elif len(args) == 2:
         start = args[0]
         stop = args[1]
-        step = warp.codegen.Var(label=None, type=return_type.dtype, constant=return_type.step)
+        step = warp.codegen.Var(label=None, type=return_type.dtype, constant=1)
     elif len(args) == 3:
         start = args[0]
         stop = args[1]
@@ -2069,7 +2297,7 @@ def tile_arange_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a
 add_builtin(
     "tile_arange",
-    input_types={"*args": Scalar, "dtype": Any, "storage": str},
+    input_types={"*args": Scalar, "dtype": Scalar, "storage": str},
     defaults={"dtype": None, "storage": "register"},
     value_func=tile_arange_value_func,
     dispatch_func=tile_arange_dispatch_func,
@@ -2094,12 +2322,19 @@ add_builtin(
 def tile_load_tuple_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
     if arg_types is None:
-        return array(dtype=Scalar)
+        return tile(dtype=Any, shape=Tuple[int, ...])
     a = arg_types["a"]
-    shape = tile_unpack_shape(arg_values)
-    offset = tile_unpack_offset(arg_values, a.ndim)
+    shape = extract_tuple(arg_values["shape"], as_constant=True)
+    if None in shape:
+        raise ValueError("Tile functions require shape to be a compile time constant.")
+    if "offset" in arg_values:
+        offset = extract_tuple(arg_values["offset"])
+    else:
+        offset = (0,) * a.ndim
     if a.ndim != len(shape):
         raise ValueError(
@@ -2114,16 +2349,23 @@ def tile_load_tuple_value_func(arg_types: Mapping[str, type], arg_values: Mappin
     if arg_values["storage"] not in {"shared", "register"}:
         raise ValueError(f"Invalid value for 'storage': {arg_values['storage']!r}. Expected 'shared' or 'register'.")
-    return Tile(dtype=a.dtype, shape=shape, storage=arg_values["storage"])
+    return tile(dtype=a.dtype, shape=shape, storage=arg_values["storage"])
 def tile_load_tuple_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
     a = args["a"]
-    shape = tile_unpack_shape(args)
-    offset = tile_unpack_offset(args, a.type.ndim)
+    shape = extract_tuple(args["shape"], as_constant=True)
+    if None in shape:
+        raise ValueError("Tile functions require shape to be a compile time constant.")
+    if "offset" in args:
+        offset = extract_tuple(args["offset"])
+    else:
+        offset = (0,) * a.type.ndim
     func_args = (a, *offset)
-    template_args = (d.constant for d in shape)
+    template_args = shape
     return (func_args, template_args)
@@ -2170,7 +2412,10 @@ def tile_store_value_func(arg_types, arg_values):
     a = arg_types["a"]
     t = arg_types["t"]
-    c = tile_unpack_offset(arg_types, a.ndim)
+    if "offset" in arg_types:
+        c = extract_tuple(arg_values["offset"])
+    else:
+        c = (0,) * a.ndim
     if len(c) != a.ndim:
         raise ValueError(
@@ -2196,7 +2441,10 @@ def tile_store_dispatch_func(input_types: Mapping[str, type], return_type: Any,
     a = args["a"]
     t = args["t"]
-    offset = tile_unpack_offset(args, a.type.ndim)
+    if "offset" in args:
+        offset = extract_tuple(args["offset"])
+    else:
+        offset = (0,) * a.type.ndim
     func_args = (a, *offset, t)
     template_args = []
@@ -2206,7 +2454,7 @@ def tile_store_dispatch_func(input_types: Mapping[str, type], return_type: Any,
 add_builtin(
     "tile_store",
-    input_types={"a": array(dtype=Any), "t": Tile(dtype=Any, shape=Any), "offset": Tuple[int, ...]},
+    input_types={"a": array(dtype=Any), "t": tile(dtype=Any, shape=Tuple[int, ...]), "offset": Tuple[int, ...]},
     value_func=tile_store_value_func,
     dispatch_func=tile_store_dispatch_func,
     defaults={"offset": None},
@@ -2226,7 +2474,7 @@ add_builtin(
 # overload for scalar offset
 add_builtin(
     "tile_store",
-    input_types={"a": array(dtype=Any), "t": Tile(dtype=Any, shape=Any), "offset": int},
+    input_types={"a": array(dtype=Any), "t": tile(dtype=Any, shape=Tuple[int, ...]), "offset": int},
     value_func=tile_store_value_func,
     dispatch_func=tile_store_dispatch_func,
     defaults={"offset": None},
@@ -2241,12 +2489,16 @@ add_builtin(
 def tile_atomic_add_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
-        return Tile(dtype=Any, shape=Any)
+        return tile(dtype=Any, shape=Tuple[int, ...])
     a = arg_types["a"]
     t = arg_types["t"]
-    c = tile_unpack_offset(arg_types, a.ndim)
+    if "offset" in arg_types:
+        c = extract_tuple(arg_values["offset"])
+    else:
+        c = (0,) * a.ndim
     if len(c) != a.ndim:
         raise ValueError(
             f"tile_atomic_add() 'a' argument must have {len(c)} dimensions, "
@@ -2264,14 +2516,21 @@ def tile_atomic_add_value_func(arg_types, arg_values):
             f"tile_atomic_add() 'a' and 't' arguments must have the same dtype, got {arg_types['a'].dtype} and {arg_types['t'].dtype}"
         )
-    return Tile(dtype=arg_types["t"].dtype, shape=arg_types["t"].shape, storage=arg_types["t"].storage)
+    return tile(
+        dtype=arg_types["t"].dtype,
+        shape=arg_types["t"].shape,
+        storage=arg_types["t"].storage,
+    )
 def tile_atomic_add_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
     a = args["a"]
     t = args["t"]
-    offset = tile_unpack_offset(args, a.type.ndim)
+    if "offset" in args:
+        offset = extract_tuple(args["offset"])
+    else:
+        offset = (0,) * a.type.ndim
     func_args = (a, *offset, t)
     template_args = []
@@ -2281,13 +2540,13 @@ def tile_atomic_add_dispatch_func(input_types: Mapping[str, type], return_type:
 add_builtin(
     "tile_atomic_add",
-    input_types={"a": array(dtype=Any), "t": Tile(dtype=Any, shape=Any), "offset": Tuple[int, ...]},
+    input_types={"a": array(dtype=Any), "t": tile(dtype=Any, shape=Tuple[int, ...]), "offset": Tuple[int, ...]},
     value_func=tile_atomic_add_value_func,
     dispatch_func=tile_atomic_add_dispatch_func,
     defaults={"offset": None},
     variadic=False,
     skip_replay=True,
-    doc="""Atomically add a 1D tile to the array `a`, each element will be updated atomically.
+    doc="""Atomically add a tile onto the array `a`, each element will be updated atomically.
     :param a: Array in global memory, should have the same ``dtype`` as the input tile
     :param t: Source tile to add to the destination array
@@ -2300,7 +2559,7 @@ add_builtin(
 # overload for scalar offset
 add_builtin(
     "tile_atomic_add",
-    input_types={"a": array(dtype=Any), "t": Tile(dtype=Any, shape=Any), "offset": int},
+    input_types={"a": array(dtype=Any), "t": tile(dtype=Any, shape=Tuple[int, ...]), "offset": int},
     value_func=tile_atomic_add_value_func,
     dispatch_func=tile_atomic_add_dispatch_func,
     defaults={"offset": None},
@@ -2315,54 +2574,59 @@ add_builtin(
 def tile_view_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
-        return Tile(dtype=Any, shape=Any)
+        return tile(dtype=Any, shape=Tuple[int, ...])
-    tile = arg_types["t"]
-    offset = arg_types["offset"]
+    tile_type = arg_types["t"]
+    offset = extract_tuple(arg_values["offset"])
-    if len(offset) > len(tile.shape):
-        raise ValueError(f"tile_view() specified too many offset coordinates {len(offset)} > {len(tile.shape)}")
+    if len(offset) > len(tile_type.shape):
+        raise ValueError(f"tile_view() specified too many offset coordinates {len(offset)} > {len(tile_type.shape)}")
     if "shape" in arg_values:
         # if shape is specified take it directly, e.g.:
         # tile_view(t, offset=(i,j), shape=(m,n))
-        shape = arg_values["shape"]
-        strides = tile.strides
+        shape = extract_tuple(arg_values["shape"], as_constant=True)
+        strides = tile_type.strides
-        if len(shape) != len(tile.shape):
+        if len(shape) != len(tile_type.shape):
             raise ValueError(
-                f"tile_view() if shape is specified it must have same number of dimensions as source tile, expected {len(tile.shape)}, got {len(shape)}"
+                f"tile_view() if shape is specified it must have same number of dimensions as source tile, expected {len(tile_type.shape)}, got {len(shape)}"
             )
     else:
         # if not specified, then take output shape from unspecified src dimensions
         # e.g.: tile[i] will return a whole row of a 2D tile
-        shape = tile.shape[len(offset) :]
-        strides = tile.strides[len(offset) :]
+        shape = tile_type.shape[len(offset) :]
+        strides = tile_type.strides[len(offset) :]
     assert len(shape) == len(strides)
     # force source tile to shared memory
-    tile.storage = "shared"
+    tile_type.storage = "shared"
-    output = Tile(dtype=tile.dtype, shape=shape, strides=strides, layout=tile.layout, storage="shared", owner=False)
+    output = tile(
+        dtype=tile_type.dtype, shape=shape, strides=strides, layout=tile_type.layout, storage="shared", owner=False
+    )
     return output
 def tile_view_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
     tile = arg_values["t"]
-    coord = arg_values["offset"]
+    coord = extract_tuple(arg_values["offset"])
     # zero-pad coord to match source array
     view_coord = [0] * len(tile.type.shape)
     for i in range(len(coord)):
         view_coord[i] = coord[i]
-    return ((tile, *view_coord), (return_type,))
+    func_args = (tile, *view_coord)
+    template_args = (return_type,)
+    return (func_args, template_args)
 add_builtin(
     "tile_view",
-    input_types={"t": Tile(dtype=Any, shape=Any), "offset": Tuple[int, ...], "shape": Tuple[int, ...]},
+    input_types={"t": tile(dtype=Any, shape=Tuple[int, ...]), "offset": Tuple[int, ...], "shape": Tuple[int, ...]},
     value_func=tile_view_value_func,
     dispatch_func=tile_view_dispatch_func,
     defaults={"shape": None},
@@ -2379,116 +2643,363 @@ add_builtin(
 )
-def tile_assign_value_func(arg_types, arg_values):
+def tile_squeeze_value_func(arg_types, arg_values):
+    # return generic type (for doc builds)
     if arg_types is None:
-        return None
+        return tile(dtype=Any, shape=Tuple[int, ...])
-    # force the destination tile to shared memory
-    arg_types["dst"].storage = "shared"
-    return None
+    tile_type = arg_types["t"]
+    shape = tile_type.shape
+    strides = tile_type.strides
+    ndim = len(shape)
+    if "axis" in arg_values:
+        axis = arg_values["axis"]
-def tile_assign_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
-    dst = args["dst"]
-    src = args["src"]
+        if not isinstance(axis, Sequence):
+            # promote to tuple
+            axis = (axis,)
-    offset = tile_unpack_offset(args, len(dst.type.shape))
+        # promote negative indices to their positive equivalents
+        axis = tuple([a if a >= 0 else a + ndim for a in axis])
-    func_args = (dst, src, *offset)
-    template_args = []
+        # validate that specified axes are size 1
+        for a in axis:
+            if shape[a] != 1:
+                raise ValueError(
+                    f"Cannot select an axis to squeeze out which has size not equal to one, axis={a}, size={shape[a]}"
+                )
-    return (func_args, template_args)
+        # build new shape by skipping specified axes (if size is 1)
+        new_shape = tuple(dim for i, dim in enumerate(shape) if i not in axis)
+        new_strides = tuple(stride for i, stride in enumerate(strides) if i not in axis)
+    else:
+        # no axis specified: remove all singleton dimensions
+        new_shape = tuple(dim for dim in shape if dim != 1)
+        new_strides = tuple(stride for i, stride in enumerate(strides) if shape[i] != 1)
-add_builtin(
-    "tile_assign",
-    input_types={"dst": Tile(dtype=Any, shape=Any), "src": Tile(dtype=Any, shape=Any), "offset": Tuple[int, ...]},
-    value_func=tile_assign_value_func,
-    dispatch_func=tile_assign_dispatch_func,
-    defaults={"offset": None},
-    doc="""Assign a tile to a subrange of a destination tile.
+    # force source tile to shared memory
+    tile_type.storage = "shared"
+    output = tile(
+        dtype=tile_type.dtype,
+        shape=new_shape,
+        strides=new_strides,
+        layout=tile_type.layout,
+        storage="shared",
+        owner=False,
+    )
+    return output
-    :param dst: The destination tile to assign to
-    :param src: The source tile to read values from
-    :param offset: Offset in the destination tile to write to""",
-    group="Tile Primitives",
-    export=False,
-)
-# handles expressions like tile[i,j] = 1.0
-add_builtin(
-    "assign",
-    input_types={"dst": Tile(dtype=Any, shape=Any), "i": int, "src": Scalar},
-    value_func=tile_assign_value_func,
-    group="Tile Primitives",
-    export=False,
-    hidden=True,
-)
+def tile_squeeze_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
+    source_tile = arg_values["t"]
-add_builtin(
-    "assign",
-    input_types={"dst": Tile(dtype=Any, shape=Any), "i": int, "j": int, "src": Scalar},
-    value_func=tile_assign_value_func,
-    group="Tile Primitives",
-    export=False,
-    hidden=True,
-)
+    return ((source_tile,), (return_type,))
-add_builtin(
-    "assign",
-    input_types={"dst": Tile(dtype=Any, shape=Any), "i": int, "j": int, "k": int, "src": Scalar},
-    value_func=tile_assign_value_func,
-    group="Tile Primitives",
-    export=False,
-    hidden=True,
-)
 add_builtin(
-    "assign",
-    input_types={"dst": Tile(dtype=Any, shape=Any), "i": int, "j": int, "k": int, "l": int, "src": Scalar},
-    value_func=tile_assign_value_func,
+    "tile_squeeze",
+    input_types={"t": tile(dtype=Any, shape=Tuple[int, ...]), "axis": Tuple[int, ...]},
+    defaults={"axis": None},
+    value_func=tile_squeeze_value_func,
+    dispatch_func=tile_squeeze_dispatch_func,
+    variadic=False,
+    doc="""Return a squeezed view of a tile with the same data.
+    :param t: Input tile to squeeze
+    :param axis: A subset of the entries of length one in the shape (optional)
+    :returns: The input tile but with all or a subset of the dimensions of length one removed.""",
     group="Tile Primitives",
     export=False,
-    hidden=True,
 )
-def tile_value_func(arg_types, arg_values):
+def tile_reshape_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
-        return Tile
-    if len(arg_types) != 1:
-        raise TypeError(f"tile() takes exactly 1 positional argument but {len(arg_types)} were given")
+        return tile(dtype=Any, shape=Tuple[int, ...])
-    dtype = None
-    length = None
+    tile_type = arg_types["t"]
-    if type_is_vector(arg_types["x"]):
-        dtype = arg_types["x"]._wp_scalar_type_
-        length = arg_types["x"]._shape_[0]
-        shape = (length, warp.codegen.options["block_dim"])
-    else:
-        dtype = arg_types["x"]
-        shape = (warp.codegen.options["block_dim"],)
+    # calculate total size of tile_type
+    size = 1
+    for s in tile_type.shape:
+        size *= int(s)
-    return Tile(dtype=dtype, shape=shape, op="tile")
+    shape = extract_tuple(arg_values["shape"], as_constant=True)
+    if None in shape:
+        raise ValueError("Tile functions require shape to be a compile time constant.")
-add_builtin(
-    "tile",
-    input_types={"x": Any},
-    value_func=tile_value_func,
-    variadic=True,
-    doc="""Construct a new tile from per-thread kernel values.
+    # check for -1 dimension and reformat
+    if -1 in shape:
+        idx = size
+        denom = 1
+        minus_one_count = 0
+        for i, d in enumerate(shape):
+            if d == -1:
+                idx = i
+                minus_one_count += 1
+            else:
+                denom *= d
+        if minus_one_count > 1:
+            raise RuntimeError("Cannot infer shape if more than one index is -1.")
+        new_shape = list(shape)
+        new_shape[idx] = int(size / denom)
+        shape = tuple(new_shape)
+    # calculate total size of new shape
+    new_size = 1
+    for s in shape:
+        new_size *= int(s)
+    if new_size != size:
+        raise ValueError(f"New shape {shape} has total size {new_size} which does not match original size {size}")
+    # compute new strides matching shape
+    strides = []
+    stride = 1
+    for s in reversed(shape):
+        strides.append(stride)
+        stride *= s
+    strides = tuple(reversed(strides))
-    This function converts values computed using scalar kernel code to a tile representation for input into collective operations.
+    # force source tile to shared memory
+    tile_type.storage = "shared"
+    output = tile(
+        dtype=tile_type.dtype, shape=shape, strides=strides, layout=tile_type.layout, storage="shared", owner=False
+    )
+    return output
+def tile_reshape_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
+    tile = arg_values["t"]
+    return ((tile,), (return_type,))
+add_builtin(
+    "tile_reshape",
+    input_types={"t": tile(dtype=Any, shape=Tuple[int, ...]), "shape": Tuple[int, ...]},
+    value_func=tile_reshape_value_func,
+    dispatch_func=tile_reshape_dispatch_func,
+    variadic=False,
+    doc="""Return a reshaped view of a tile with the same data.
+    :param t: Input tile to reshape
+    :param shape: New shape for the tile
+    :returns: A tile containing the same data as the input tile, but arranged in a new shape.""",
+    group="Tile Primitives",
+    export=False,
+)
+def tile_astype_value_func(arg_types, arg_values):
+    # return generic type (for doc builds)
+    if arg_types is None:
+        return tile(dtype=Any, shape=Tuple[int, ...])
+    tile_type = arg_types["t"]
+    dtype = arg_values["dtype"]
+    return tile(dtype=dtype, shape=tile_type.shape)
+def tile_astype_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
+    tile = arg_values["t"]
+    return ((tile,), (return_type,))
+add_builtin(
+    "tile_astype",
+    input_types={"t": tile(dtype=Scalar, shape=Tuple[int, ...]), "dtype": Scalar},
+    value_func=tile_astype_value_func,
+    dispatch_func=tile_astype_dispatch_func,
+    variadic=False,
+    doc="""Return a new tile with the same data as the input tile, but with a different data type.
+    :param t: Input tile
+    :param dtype: New data type for the tile
+    :returns: A tile with the same data as the input tile, but with a different data type""",
+    group="Tile Primitives",
+    export=False,
+)
+def tile_assign_value_func(arg_types, arg_values):
+    if arg_types is None:
+        return None
+    # force the destination tile to shared memory
+    arg_types["dst"].storage = "shared"
+    return None
+def tile_assign_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    dst = args["dst"]
+    src = args["src"]
+    if "offset" in args:
+        offset = extract_tuple(args["offset"])
+    else:
+        offset = (0,) * len(dst.type.shape)
+    func_args = (dst, src, *offset)
+    template_args = []
+    return (func_args, template_args)
+add_builtin(
+    "tile_assign",
+    input_types={
+        "dst": tile(dtype=Any, shape=Tuple[int, ...]),
+        "src": tile(dtype=Any, shape=Tuple[int, ...]),
+        "offset": Tuple[int, ...],
+    },
+    value_func=tile_assign_value_func,
+    dispatch_func=tile_assign_dispatch_func,
+    defaults={"offset": None},
+    doc="""Assign a tile to a subrange of a destination tile.
+    :param dst: The destination tile to assign to
+    :param src: The source tile to read values from
+    :param offset: Offset in the destination tile to write to""",
+    group="Tile Primitives",
+    export=False,
+)
+# handles expressions like tile[i,j] = 1.0
+add_builtin(
+    "assign",
+    input_types={"dst": tile(dtype=Any, shape=Tuple[int]), "i": int, "src": Any},
+    value_func=tile_assign_value_func,
+    group="Tile Primitives",
+    export=False,
+    hidden=True,
+)
+add_builtin(
+    "assign",
+    input_types={"dst": tile(dtype=Any, shape=Tuple[int, int]), "i": int, "j": int, "src": Any},
+    value_func=tile_assign_value_func,
+    group="Tile Primitives",
+    export=False,
+    hidden=True,
+)
+add_builtin(
+    "assign",
+    input_types={"dst": tile(dtype=Any, shape=Tuple[int, int, int]), "i": int, "j": int, "k": int, "src": Any},
+    value_func=tile_assign_value_func,
+    group="Tile Primitives",
+    export=False,
+    hidden=True,
+)
+add_builtin(
+    "assign",
+    input_types={
+        "dst": tile(dtype=Any, shape=Tuple[int, int, int, int]),
+        "i": int,
+        "j": int,
+        "k": int,
+        "l": int,
+        "src": Any,
+    },
+    value_func=tile_assign_value_func,
+    group="Tile Primitives",
+    export=False,
+    hidden=True,
+)
+def tile_value_func(arg_types, arg_values):
+    # return generic type (for doc builds)
+    if arg_types is None:
+        return tile(dtype=Any, shape=Tuple)
+    if len(arg_types) > 2:
+        raise TypeError(f"tile() takes 1 positional argument and 1 optional argument but {len(arg_types)} were given")
+    preserve_type = arg_values["preserve_type"]
+    if preserve_type:
+        dtype = arg_types["x"]
+        shape = (warp.codegen.options["block_dim"],)
+        return tile(dtype=dtype, shape=shape)
+    else:
+        if type_is_vector(arg_types["x"]):
+            dtype = arg_types["x"]._wp_scalar_type_
+            length = arg_types["x"]._shape_[0]
+            shape = (length, warp.codegen.options["block_dim"])
+        elif type_is_quaternion(arg_types["x"]):
+            dtype = arg_types["x"]._wp_scalar_type_
+            shape = (4, warp.codegen.options["block_dim"])
+        elif type_is_matrix(arg_types["x"]):
+            dtype = arg_types["x"]._wp_scalar_type_
+            rows = arg_types["x"]._shape_[0]
+            cols = arg_types["x"]._shape_[1]
+            shape = (rows, cols, warp.codegen.options["block_dim"])
+        else:
+            dtype = arg_types["x"]
+            shape = (warp.codegen.options["block_dim"],)
+        return tile(dtype=dtype, shape=shape)
+def tile_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
+    x = arg_values["x"]
+    preserve_type = get_arg_value(arg_values["preserve_type"])
+    if preserve_type:
+        dtype = x.type
+        return ((x,), (dtype,))
+    else:
+        if type_is_vector(x.type):
+            dtype = x.type._wp_scalar_type_
+            length = x.type._shape_[0]
+            return ((x,), (dtype, length))
+        elif type_is_quaternion(x.type):
+            dtype = x.type._wp_scalar_type_
+            return ((x,), (dtype, 4))
+        elif type_is_matrix(x.type):
+            dtype = x.type._wp_scalar_type_
+            rows = x.type._shape_[0]
+            cols = x.type._shape_[1]
+            return ((x,), (rows, cols, dtype))
+        else:
+            dtype = x.type
+            return ((x,), (dtype,))
+add_builtin(
+    "tile",
+    input_types={"x": Any, "preserve_type": bool},
+    value_func=tile_value_func,
+    dispatch_func=tile_dispatch_func,
+    variadic=True,
+    defaults={"preserve_type": False},
+    doc="""Construct a new tile from per-thread kernel values.
+    This function converts values computed using scalar kernel code to a tile representation for input into collective operations.
     * If the input value is a scalar, then the resulting tile has ``shape=(block_dim,)``
     * If the input value is a vector, then the resulting tile has ``shape=(length(vector), block_dim)``
+    * If the input value is a vector, and ``preserve_type=True``, then the resulting tile has ``dtype=vector`` and ``shape=(block_dim,)``
+    * If the input value is a matrix, then the resulting tile has ``shape=(rows, cols, block_dim)``
+    * If the input value is a matrix, and ``preserve_type=True``, then the resulting tile has ``dtype=matrix`` and ``shape=(block_dim,)``
     :param x: A per-thread local value, e.g. scalar, vector, or matrix.
-    :returns: A tile with first dimension according to the value type length and a second dimension equal to ``block_dim``
+    :param preserve_type: If true, the tile will have the same data type as the input value.
+    :returns: If ``preserve_type=True``, a tile of type ``x.type`` of length ``block_dim``. Otherwise, an N-dimensional tile such that the first N-1 dimensions match the shape of ``x`` and the final dimension is of size ``block_dim``.
     This example shows how to create a linear sequence from thread variables:
@@ -2511,13 +3022,14 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    hidden=True,
 )
 def untile_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
-        return Scalar
+        return Any
     if len(arg_types) != 1:
         raise TypeError(f"untile() takes exactly 1 positional argument but {len(arg_types)} were given")
@@ -2536,13 +3048,15 @@ def untile_value_func(arg_types, arg_values):
         return t.dtype
     elif len(t.shape) == 2:
         return warp.types.vector(t.shape[0], t.dtype)
+    elif len(t.shape) == 3:
+        return warp.types.matrix((t.shape[0], t.shape[1]), t.dtype)
     else:
         raise ValueError(f"untile() argument must have a positive size in dimension 0, but got {t.shape[0]}")
 add_builtin(
     "untile",
-    input_types={"a": Tile(dtype=Any, shape=Any)},
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...])},
     value_func=untile_value_func,
     variadic=True,
     doc="""Convert a tile back to per-thread values.
@@ -2592,7 +3106,7 @@ add_builtin(
 def tile_extract_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
-        return Scalar
+        return Any
     # force the input tile to shared memory
     arg_types["a"].storage = "shared"
@@ -2602,10 +3116,10 @@ def tile_extract_value_func(arg_types, arg_values):
 add_builtin(
     "tile_extract",
-    input_types={"a": Tile(dtype=Any, shape=Any), "i": int},
+    input_types={"a": tile(dtype=Any, shape=Tuple[int]), "i": int},
     value_func=tile_extract_value_func,
     variadic=False,
-    doc="""Extract a single element from the tile and return it as a scalar type.
+    doc="""Extract a single element from the tile.
     This function will extract an element from the tile and broadcast its value to all threads in the block.
@@ -2619,13 +3133,12 @@ add_builtin(
     export=False,
 )
 add_builtin(
     "tile_extract",
-    input_types={"a": Tile(dtype=Any, shape=Any), "i": int, "j": int},
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, int]), "i": int, "j": int},
     value_func=tile_extract_value_func,
     variadic=False,
-    doc="""Extract a single element from the tile and return it as a scalar type.
+    doc="""Extract a single element from the tile.
     This function will extract an element from the tile and broadcast its value to all threads in the block.
@@ -2642,10 +3155,10 @@ add_builtin(
 add_builtin(
     "tile_extract",
-    input_types={"a": Tile(dtype=Any, shape=Any), "i": int, "j": int, "k": int},
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, int, int]), "i": int, "j": int, "k": int},
     value_func=tile_extract_value_func,
     variadic=False,
-    doc="""Extract a single element from the tile and return it as a scalar type.
+    doc="""Extract a single element from the tile.
     This function will extract an element from the tile and broadcast its value to all threads in the block.
@@ -2663,10 +3176,10 @@ add_builtin(
 add_builtin(
     "tile_extract",
-    input_types={"a": Tile(dtype=Any, shape=Any), "i": int, "j": int, "k": int, "l": int},
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, int, int, int]), "i": int, "j": int, "k": int, "l": int},
     value_func=tile_extract_value_func,
     variadic=False,
-    doc="""Extract a single element from the tile and return it as a scalar type.
+    doc="""Extract a single element from the tile.
     This function will extract an element from the tile and broadcast its value to all threads in the block.
@@ -2684,10 +3197,90 @@ add_builtin(
 )
+def tile_inplace_value_func(arg_types, arg_values):
+    if not types_equal(arg_types["a"].dtype, arg_types["value"]):
+        raise TypeError(
+            f"'value' must have the same dtype as target tile for inplace ops, got {arg_types['a'].dtype} and {arg_types['value']}"
+        )
+    # force the input tile to shared memory
+    # as inplace addition/subtraction relies on shared memory atomics
+    arg_types["a"].storage = "shared"
+    return None
+add_builtin(
+    "tile_add_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+)
+add_builtin(
+    "tile_add_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+)
+add_builtin(
+    "tile_add_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "k": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+)
+add_builtin(
+    "tile_add_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "k": int, "l": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+)
+add_builtin(
+    "tile_sub_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+)
+add_builtin(
+    "tile_sub_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+)
+add_builtin(
+    "tile_sub_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "k": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+)
+add_builtin(
+    "tile_sub_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "k": int, "l": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+)
 def tile_transpose_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
-        return Tile
+        return tile(dtype=Any, shape=Tuple[int, int])
     if len(arg_types) != 1:
         raise TypeError(f"tile_transpose() takes exactly 1 positional argument but {len(arg_types)} were given")
@@ -2708,10 +3301,9 @@ def tile_transpose_value_func(arg_types, arg_values):
     # force the input tile to shared memory
     t.storage = "shared"
-    return Tile(
+    return tile(
         dtype=t.dtype,
         shape=t.shape[::-1],
-        op="transpose",
         storage=t.storage,
         strides=t.strides[::-1],
         layout=layout,
@@ -2721,7 +3313,7 @@ def tile_transpose_value_func(arg_types, arg_values):
 add_builtin(
     "tile_transpose",
-    input_types={"a": Tile(dtype=Any, shape=Any)},
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, int])},
     value_func=tile_transpose_value_func,
     variadic=True,
     doc="""Transpose a tile.
@@ -2739,12 +3331,16 @@ add_builtin(
 def tile_broadcast_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
-        return Tile
+        return tile(dtype=Any, shape=Tuple[int, ...])
     t = arg_types["a"]
     # target shape and strides
-    target_shape = tile_unpack_shape(arg_values)
+    target_shape = extract_tuple(arg_values["shape"], as_constant=True)
+    if None in target_shape:
+        raise ValueError("Tile functions require shape to be a compile time constant.")
     target_strides = [0] * len(target_shape)
     offset = len(target_shape) - len(t.shape)
@@ -2769,10 +3365,7 @@ def tile_broadcast_value_func(arg_types, arg_values):
     # force the input tile to shared memory
     t.storage = "shared"
-    tile_type = Tile(
-        dtype=t.dtype, shape=target_shape, op="broadcast", storage=t.storage, strides=target_strides, owner=False
-    )
-    return tile_type
+    return tile(dtype=t.dtype, shape=target_shape, storage=t.storage, strides=target_strides, owner=False)
 def tile_broadcast_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
@@ -2787,7 +3380,7 @@ def tile_broadcast_dispatch_func(arg_types: Mapping[str, type], return_type: Any
 add_builtin(
     "tile_broadcast",
-    input_types={"a": Tile(dtype=Any, shape=Any), "shape": Tuple[int, ...]},
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "shape": Tuple[int, ...]},
     value_func=tile_broadcast_value_func,
     dispatch_func=tile_broadcast_dispatch_func,
     variadic=False,
@@ -2807,7 +3400,7 @@ add_builtin(
 def tile_sum_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
-        return Tile(dtype=Any, shape=(1,))
+        return tile(dtype=Scalar, shape=(1,))
     if len(arg_types) != 1:
         raise TypeError(f"tile_sum() takes exactly 1 positional argument but {len(arg_types)} were given")
@@ -2817,12 +3410,12 @@ def tile_sum_value_func(arg_types, arg_values):
     if not is_tile(a):
         raise TypeError(f"tile_sum() argument must be a tile, got {a!r}")
-    return Tile(dtype=a.dtype, shape=(1,), op="sum")
+    return tile(dtype=a.dtype, shape=(1,))
 add_builtin(
     "tile_sum",
-    input_types={"a": Tile},
+    input_types={"a": tile(dtype=Scalar, shape=Tuple[int, ...])},
     value_func=tile_sum_value_func,
     variadic=True,
     doc="""Cooperatively compute the sum of the tile elements using all threads in the block.
@@ -2856,10 +3449,89 @@ add_builtin(
 )
+def tile_sort_value_func(arg_types, arg_values):
+    # return generic type (for doc builds)
+    if arg_types is None:
+        return None
+    if len(arg_types) != 2:
+        raise TypeError(
+            f"tile_sort() takes exactly 2 positional arguments (keys and values) but {len(arg_types)} were given"
+        )
+    a = arg_types["keys"]
+    b = arg_types["values"]
+    if not is_tile(a):
+        raise TypeError(f"First tile_sort() argument must be a tile, got {a!r}")
+    if not is_tile(b):
+        raise TypeError(f"Second tile_sort() argument must be a tile, got {b!r}")
+    if not (a.dtype is warp.float32 or a.dtype is warp.int32 or a.dtype is warp.uint32):
+        raise TypeError(f"First tile_sort() argument must be a tile of type float or int, got {a.dtype}")
+    # set the storage type to the inputs to shared
+    a.storage = "shared"
+    b.storage = "shared"
+    if len(a.shape) != len(b.shape):
+        raise ValueError(
+            f"tile_sort() shapes must have the same number of dimensions, got {len(a.shape)} and {len(b.shape)}"
+        )
+    for i in range(len(a.shape)):
+        if a.shape[i] != b.shape[i]:
+            raise ValueError(f"tile_sort() shapes do not match on dimension {i}, got {a.shape} and {b.shape}")
+    return None
+add_builtin(
+    "tile_sort",
+    input_types={"keys": tile(dtype=Any, shape=Tuple[int]), "values": tile(dtype=Any, shape=Tuple[int])},
+    value_func=tile_sort_value_func,
+    variadic=True,
+    doc="""Cooperatively sort the elements of two tiles in ascending order based on the keys, using all threads in the block.
+    :param keys: Keys to sort by. Supported key types: :class:`float32`, :class:`int32`, :class:`uint32`. Must be in shared memory.
+    :param values: Values to sort along with keys. No type restrictions. Must be in shared memory.
+    :returns: No return value. Sorts both tiles in-place.
+    Example:
+    .. code-block:: python
+        @wp.kernel
+        def compute():
+            keys = wp.tile_arange(32, 0, -1, dtype=int, storage="shared")
+            values = wp.tile_arange(0, 32, 1, dtype=int, storage="shared")
+            wp.tile_sort(keys, values)
+            print(keys)
+            print(values)
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
+    Prints:
+    .. code-block:: text
+        [1, 2, ..., 32] = tile(shape=(32), storage=shared)
+        [31, 30, 29, ..., 0] = tile(shape=(32), storage=shared)
+    """,
+    group="Tile Primitives",
+    export=False,
+)
 def tile_min_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
-        return Tile(dtype=Any, shape=(1,))
+        return tile(dtype=Scalar, shape=(1,))
     if len(arg_types) != 1:
         raise TypeError(f"tile_min() takes exactly 1 positional argument but {len(arg_types)} were given")
@@ -2869,12 +3541,12 @@ def tile_min_value_func(arg_types, arg_values):
     if not is_tile(a):
         raise TypeError(f"tile_min() argument must be a tile, got {a!r}")
-    return Tile(dtype=a.dtype, shape=(1,), op="min")
+    return tile(dtype=a.dtype, shape=(1,))
 add_builtin(
     "tile_min",
-    input_types={"a": Tile},
+    input_types={"a": tile(dtype=Scalar, shape=Tuple[int, ...])},
     value_func=tile_min_value_func,
     variadic=True,
     doc="""Cooperatively compute the minimum of the tile elements using all threads in the block.
@@ -2909,10 +3581,63 @@ add_builtin(
 )
+def tile_argmin_value_func(arg_types, arg_values):
+    """Resolve the return type of ``tile_argmin()`` from its argument types.
+    :param arg_types: Mapping of argument name to type, or ``None`` to request the generic signature.
+    :param arg_values: Not used by this function.
+    :returns: A single-element tile type: ``Int``-typed for the generic signature, ``warp.int32`` otherwise.
+    :raises TypeError: If the number of arguments is not exactly one, or if ``a`` is not a tile.
+    """
+    if arg_types is None:
+        # no concrete types available: return generic type (for doc builds)
+        return tile(dtype=Int, shape=(1,))
+    num_args = len(arg_types)
+    if num_args != 1:
+        raise TypeError(f"tile_argmin() takes exactly 1 positional argument but {num_args} were given")
+    tile_type = arg_types["a"]
+    if is_tile(tile_type):
+        # the resulting index is always reported as a 32-bit integer, whatever the input dtype
+        return tile(dtype=warp.int32, shape=(1,))
+    raise TypeError(f"tile_argmin() argument must be a tile, got {tile_type!r}")
+# Register the ``tile_argmin`` builtin. It accepts one tile ``a`` declared with a ``Scalar`` dtype and
+# an integer shape tuple of any length; the return type is resolved by ``tile_argmin_value_func``.
+add_builtin(
+    "tile_argmin",
+    input_types={"a": tile(dtype=Scalar, shape=Tuple[int, ...])},
+    value_func=tile_argmin_value_func,
+    variadic=True,
+    doc="""Cooperatively compute the index of the minimum element in the tile using all threads in the block.
+    :param a: The tile to compute the argmin from
+    :returns: A single-element tile holding the index of the minimum value
+    Example:
+    .. code-block:: python
+        @wp.kernel
+        def compute():
+            t = wp.tile_arange(64, 128)
+            s = wp.tile_argmin(t)
+            print(s)
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
+    Prints:
+    .. code-block:: text
+        [0] = tile(shape=(1), storage=register)
+    """,
+    group="Tile Primitives",
+    export=False,
+)
 def tile_max_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
-        return Tile(dtype=Any, shape=(1,))
+        return tile(dtype=Scalar, shape=(1,))
     if len(arg_types) != 1:
         raise TypeError(f"tile_max() takes exactly 1 positional argument but {len(arg_types)} were given")
@@ -2922,12 +3647,12 @@ def tile_max_value_func(arg_types, arg_values):
     if not is_tile(a):
         raise TypeError(f"tile_max() argument must be a tile, got {a!r}")
-    return Tile(dtype=a.dtype, shape=(1,), op="min")
+    return tile(dtype=a.dtype, shape=(1,))
 add_builtin(
     "tile_max",
-    input_types={"a": Tile(dtype=Any, shape=Any)},
+    input_types={"a": tile(dtype=Scalar, shape=Tuple[int, ...])},
     value_func=tile_max_value_func,
     variadic=False,
     doc="""Cooperatively compute the maximum of the tile elements using all threads in the block.
@@ -2961,17 +3686,69 @@ add_builtin(
 )
+def tile_argmax_value_func(arg_types, arg_values):
+    """Resolve the return type of ``tile_argmax()`` from its argument types.
+    :param arg_types: Mapping of argument name to type, or ``None`` to request the generic signature.
+    :param arg_values: Not used by this function.
+    :returns: A single-element tile type: ``Int``-typed for the generic signature, ``warp.int32`` otherwise.
+    :raises TypeError: If the number of arguments is not exactly one, or if ``a`` is not a tile.
+    """
+    if arg_types is None:
+        # no concrete types available: return generic type (for doc builds)
+        return tile(dtype=Int, shape=(1,))
+    num_args = len(arg_types)
+    if num_args != 1:
+        raise TypeError(f"tile_argmax() takes exactly 1 positional argument but {num_args} were given")
+    tile_type = arg_types["a"]
+    if is_tile(tile_type):
+        # the resulting index is always reported as a 32-bit integer, whatever the input dtype
+        return tile(dtype=warp.int32, shape=(1,))
+    raise TypeError(f"tile_argmax() argument must be a tile, got {tile_type!r}")
+# Register the ``tile_argmax`` builtin. It accepts one tile ``a`` declared with a ``Scalar`` dtype and
+# an integer shape tuple of any length; the return type is resolved by ``tile_argmax_value_func``.
+add_builtin(
+    "tile_argmax",
+    input_types={"a": tile(dtype=Scalar, shape=Tuple[int, ...])},
+    value_func=tile_argmax_value_func,
+    variadic=False,
+    doc="""Cooperatively compute the index of the maximum element in the tile using all threads in the block.
+    :param a: The tile to compute the argmax from
+    :returns: A single-element tile holding the index of the maximum value
+    Example:
+    .. code-block:: python
+        @wp.kernel
+        def compute():
+            t = wp.tile_arange(64, 128)
+            s = wp.tile_argmax(t)
+            print(s)
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
+    Prints:
+    .. code-block:: text
+        [63] = tile(shape=(1), storage=register)
+    """,
+    group="Tile Primitives",
+    export=False,
+)
 # does type propagation for tile_reduce()
 def tile_reduce_value_func(arg_types, arg_values):
     if arg_types is None:
-        return Tile(dtype=Any, shape=(1,))
+        return tile(dtype=Scalar, shape=(1,))
     a = arg_types["a"]
     if not is_tile(a):
         raise TypeError(f"tile_reduce() 'a' argument must be a tile, got {a!r}")
-    return Tile(dtype=a.dtype, shape=(1,), op="reduce")
+    return tile(dtype=a.dtype, shape=(1,))
 def tile_reduce_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
@@ -2982,7 +3759,7 @@ def tile_reduce_dispatch_func(input_types: Mapping[str, type], return_type: Any,
 add_builtin(
     "tile_reduce",
-    input_types={"op": Callable, "a": Tile(dtype=Any, shape=Any)},
+    input_types={"op": Callable, "a": tile(dtype=Scalar, shape=Tuple[int, ...])},
     value_func=tile_reduce_value_func,
     native_func="tile_reduce",
     doc="""Apply a custom reduction operator across the tile.
@@ -3005,37 +3782,164 @@ add_builtin(
             print(s)
-        wp.launch_tiled(factorial, dim=[1], inputs=[], block_dim=16)
+        wp.launch_tiled(factorial, dim=[1], inputs=[], block_dim=16)
+    Prints:
+    .. code-block:: text
+        [362880] = tile(shape=(1), storage=register)
+    """,
+    group="Tile Primitives",
+    export=False,
+)
+def tile_scan_inclusive_value_func(arg_types, arg_values):
+    """Resolve the return type of ``tile_scan_inclusive()``; the result has the input's dtype and shape.
+    :param arg_types: Mapping of argument name to type, or ``None`` to request the generic signature.
+    :param arg_values: Not used by this function.
+    :returns: A tile type with the same dtype and shape as ``a`` (generic scalar tile when ``arg_types`` is ``None``).
+    :raises TypeError: If the number of arguments is not exactly one, if ``a`` is not a tile,
+        or if its dtype is not one of ``float32``, ``int32``, ``uint32``.
+    """
+    if arg_types is None:
+        # generic signature, used when no concrete argument types are supplied
+        return tile(dtype=Scalar, shape=Tuple[int, ...])
+    num_args = len(arg_types)
+    if num_args != 1:
+        raise TypeError(f"tile_scan_inclusive() takes exactly 1 positional argument but {num_args} were given")
+    tile_type = arg_types["a"]
+    if not is_tile(tile_type):
+        raise TypeError(f"tile_scan_inclusive() argument must be a tile, got {tile_type!r}")
+    elem_type = tile_type.dtype
+    # scans are restricted to float32, int32 and uint32 (like tile_sort); matched by identity
+    if elem_type is not warp.float32 and elem_type is not warp.int32 and elem_type is not warp.uint32:
+        raise TypeError(
+            f"tile_scan_inclusive() argument must be a tile of type float32, int32, or uint32, got {elem_type}"
+        )
+    return tile(dtype=elem_type, shape=tile_type.shape)
+def tile_scan_inclusive_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    """Build the call arguments for ``tile_scan_inclusive``.
+    :param input_types: Not used by this function.
+    :param return_type: Not used by this function.
+    :param args: Mapping of argument name to variable; only ``args["a"]`` is read.
+    :returns: A ``(func_args, template_args)`` pair: the tile ``a`` as the only function argument and no template arguments.
+    """
+    return ((args["a"],), ())
+# Register the ``tile_scan_inclusive`` builtin. The declared input is any ``Scalar`` tile; the stricter
+# dtype check lives in ``tile_scan_inclusive_value_func``. Maps to the native ``tile_scan_inclusive``.
+add_builtin(
+    "tile_scan_inclusive",
+    input_types={"a": tile(dtype=Scalar, shape=Tuple[int, ...])},
+    value_func=tile_scan_inclusive_value_func,
+    native_func="tile_scan_inclusive",
+    doc="""Inclusive scan (prefix sum) across the tile.
+    This function cooperatively performs an inclusive scan (cumulative sum) across the tile.
+    :param a: The input tile. Must be a tile of type float32, int32, or uint32.
+    :returns: A new tile containing the inclusive scan result.
+    Example:
+    .. code-block:: python
+        @wp.kernel
+        def scan_example():
+            t = wp.tile_arange(1, 5, dtype=int)
+            s = wp.tile_scan_inclusive(t)
+            print(s)
+        wp.launch_tiled(scan_example, dim=[1], inputs=[], block_dim=16)
+    Prints:
+    .. code-block:: text
+        [1, 3, 6, 10] = tile(shape=(4), storage=register)
+    """,
+    group="Tile Primitives",
+    export=False,
+)
+def tile_scan_exclusive_value_func(arg_types, arg_values):
+    """Resolve the return type of ``tile_scan_exclusive()``; the result has the input's dtype and shape.
+    :param arg_types: Mapping of argument name to type, or ``None`` to request the generic signature.
+    :param arg_values: Not used by this function.
+    :returns: A tile type with the same dtype and shape as ``a`` (generic scalar tile when ``arg_types`` is ``None``).
+    :raises TypeError: If the number of arguments is not exactly one, if ``a`` is not a tile,
+        or if its dtype is not one of ``float32``, ``int32``, ``uint32``.
+    """
+    if arg_types is None:
+        # no concrete types available: return generic type (for doc builds)
+        return tile(dtype=Scalar, shape=Tuple[int, ...])
+    num_args = len(arg_types)
+    if num_args != 1:
+        raise TypeError(f"tile_scan_exclusive() takes exactly 1 positional argument but {num_args} were given")
+    tile_type = arg_types["a"]
+    if not is_tile(tile_type):
+        raise TypeError(f"tile_scan_exclusive() argument must be a tile, got {tile_type!r}")
+    elem_type = tile_type.dtype
+    # scans are restricted to float32, int32 and uint32 (like tile_sort); matched by identity
+    if elem_type is not warp.float32 and elem_type is not warp.int32 and elem_type is not warp.uint32:
+        raise TypeError(
+            f"tile_scan_exclusive() argument must be a tile of type float32, int32, or uint32, got {elem_type}"
+        )
+    return tile(dtype=elem_type, shape=tile_type.shape)
+def tile_scan_exclusive_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    """Build the call arguments for ``tile_scan_exclusive``.
+    :param input_types: Not used by this function.
+    :param return_type: Not used by this function.
+    :param args: Mapping of argument name to variable; only ``args["a"]`` is read.
+    :returns: A ``(func_args, template_args)`` pair: the tile ``a`` as the only function argument and no template arguments.
+    """
+    return ((args["a"],), ())
+add_builtin(
+    "tile_scan_exclusive",
+    input_types={"a": tile(dtype=Scalar, shape=Tuple[int, ...])},
+    value_func=tile_scan_exclusive_value_func,
+    native_func="tile_scan_exclusive",
+    doc="""Exclusive scan (prefix sum) across the tile.
+    This function cooperatively performs an exclusive scan (cumulative sum) across the tile.
+    :param a: The input tile. Must be a tile of type float32, int32, or uint32.
+    :returns: A new tile containing the exclusive scan result.
+    Example:
+    .. code-block:: python
+        @wp.kernel
+        def scan_example():
+            t = wp.tile_arange(1, 5, dtype=int)
+            s = wp.tile_scan_exclusive(t)
+            print(s)
+        wp.launch_tiled(scan_example, dim=[1], inputs=[], block_dim=16)
     Prints:
     .. code-block:: text
-        [362880] = tile(shape=(1), storage=register)
+        [0, 1, 3, 6] = tile(shape=(4), storage=register)
     """,
     group="Tile Primitives",
     export=False,
 )
 # maps
 # does type propagation for tile_map() with a single tile argument
 def tile_unary_map_value_func(arg_types, arg_values):
     if arg_types is None:
-        return Tile(dtype=Any, shape=Any)
+        return tile(dtype=Scalar, shape=Tuple[int, ...])
     a = arg_types["a"]
     if not is_tile(a):
         raise TypeError(f"tile_map() 'a' argument must be a tile, got {a!r}")
-    return TileUnaryMap(a)
+    return tile(dtype=a.dtype, shape=a.shape)
 add_builtin(
     "tile_map",
-    input_types={"op": Callable, "a": Tile(dtype=Any, shape=Any)},
+    input_types={"op": Callable, "a": tile(dtype=Scalar, shape=Tuple[int, ...])},
     value_func=tile_unary_map_value_func,
     # dispatch_func=tile_map_dispatch_func,
     # variadic=True,
@@ -3075,7 +3979,7 @@ add_builtin(
 def tile_binary_map_value_func(arg_types, arg_values):
     if arg_types is None:
-        return Tile(dtype=Any, shape=Any)
+        return tile(dtype=Scalar, shape=Tuple[int, ...])
     a = arg_types["a"]
     b = arg_types["b"]
@@ -3100,12 +4004,16 @@ def tile_binary_map_value_func(arg_types, arg_values):
         if a.shape[i] != b.shape[i]:
             raise ValueError(f"tile_map() shapes do not match on dimension {i}, got {a.shape} and {b.shape}")
-    return TileBinaryMap(a, b)
+    return tile(dtype=a.dtype, shape=a.shape)
 add_builtin(
     "tile_map",
-    input_types={"op": Callable, "a": Tile(dtype=Any, shape=Any), "b": Tile(dtype=Any, shape=Any)},
+    input_types={
+        "op": Callable,
+        "a": tile(dtype=Scalar, shape=Tuple[int, ...]),
+        "b": tile(dtype=Scalar, shape=Tuple[int, ...]),
+    },
     value_func=tile_binary_map_value_func,
     # dispatch_func=tile_map_dispatch_func,
     # variadic=True,
@@ -3255,57 +4163,13 @@ add_builtin(
 )
-def mlp_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
-    warp.utils.warn(
-        "wp.mlp() is deprecated and will be removed in a future\nversion. Use tile primitives instead.",
-        category=DeprecationWarning,
-    )
-    func_args = tuple(args.values())
-    template_args = ()
-    return (func_args, template_args)
-add_builtin(
-    "mlp",
-    input_types={
-        "weights": array(dtype=float, ndim=2),
-        "bias": array(dtype=float, ndim=1),
-        "activation": Callable,
-        "index": int,
-        "x": array(dtype=float, ndim=2),
-        "out": array(dtype=float, ndim=2),
-    },
-    value_type=None,
-    dispatch_func=mlp_dispatch_func,
-    skip_replay=True,
-    doc="""Evaluate a multi-layer perceptron (MLP) layer in the form: ``out = act(weights*x + bias)``.
-    .. deprecated:: 1.6
-        Use :doc:`tile primitives </modules/tiles>` instead.
-    :param weights: A layer's network weights with dimensions ``(m, n)``.
-    :param bias: An array with dimensions ``(n)``.
-    :param activation: A ``wp.func`` function that takes a single scalar float as input and returns a scalar float as output
-    :param index: The batch item to process, typically each thread will process one item in the batch, in which case
-                  index should be ``wp.tid()``
-    :param x: The feature matrix with dimensions ``(n, b)``
-    :param out: The network output with dimensions ``(m, b)``
-    :note: Feature and output matrices are transposed compared to some other frameworks such as PyTorch.
-           All matrices are assumed to be stored in flattened row-major memory layout (NumPy default).""",
-    group="Utility",
-)
 # ---------------------------------
 # Geometry
 add_builtin(
     "bvh_query_aabb",
     input_types={"id": uint64, "low": vec3, "high": vec3},
-    value_func=lambda arg_types, _: BvhQuery if arg_types is None else bvh_query_t,
+    value_type=BvhQuery,
     group="Geometry",
     doc="""Construct an axis-aligned bounding box query against a BVH object.
@@ -3320,7 +4184,7 @@ add_builtin(
 add_builtin(
     "bvh_query_ray",
     input_types={"id": uint64, "start": vec3, "dir": vec3},
-    value_func=lambda arg_types, _: BvhQuery if arg_types is None else bvh_query_t,
+    value_type=BvhQuery,
     group="Geometry",
     doc="""Construct a ray query against a BVH object.
@@ -3380,7 +4244,7 @@ add_builtin(
         "point": vec3,
         "max_dist": float,
     },
-    value_func=lambda arg_types, _: MeshQueryPoint if arg_types is None else mesh_query_point_t,
+    value_type=MeshQueryPoint,
     group="Geometry",
     doc="""Computes the closest point on the :class:`Mesh` with identifier ``id`` to the given ``point`` in space.
@@ -3428,7 +4292,7 @@ add_builtin(
         "point": vec3,
         "max_dist": float,
     },
-    value_func=lambda arg_types, _: MeshQueryPoint if arg_types is None else mesh_query_point_t,
+    value_type=MeshQueryPoint,
     group="Geometry",
     doc="""Computes the closest point on the :class:`Mesh` with identifier ``id`` to the given ``point`` in space.
@@ -3474,7 +4338,7 @@ add_builtin(
         "point": vec3,
         "min_dist": float,
     },
-    value_func=lambda arg_types, _: MeshQueryPoint if arg_types is None else mesh_query_point_t,
+    value_type=MeshQueryPoint,
     group="Geometry",
     doc="""Computes the furthest point on the mesh with identifier `id` to the given point in space.
@@ -3531,7 +4395,7 @@ add_builtin(
         "epsilon": float,
     },
     defaults={"epsilon": 1.0e-3},
-    value_func=lambda arg_types, _: MeshQueryPoint if arg_types is None else mesh_query_point_t,
+    value_type=MeshQueryPoint,
     group="Geometry",
     doc="""Computes the closest point on the :class:`Mesh` with identifier ``id`` to the given ``point`` in space.
@@ -3596,7 +4460,7 @@ add_builtin(
         "threshold": float,
     },
     defaults={"accuracy": 2.0, "threshold": 0.5},
-    value_func=lambda arg_types, _: MeshQueryPoint if arg_types is None else mesh_query_point_t,
+    value_type=MeshQueryPoint,
     group="Geometry",
     doc="""Computes the closest point on the :class:`Mesh` with identifier ``id`` to the given point in space.
@@ -3655,7 +4519,7 @@ add_builtin(
         "dir": vec3,
         "max_t": float,
     },
-    value_func=lambda arg_types, _: MeshQueryRay if arg_types is None else mesh_query_ray_t,
+    value_type=MeshQueryRay,
     group="Geometry",
     doc="""Computes the closest ray hit on the :class:`Mesh` with identifier ``id``.
@@ -3670,7 +4534,7 @@ add_builtin(
 add_builtin(
     "mesh_query_aabb",
     input_types={"id": uint64, "low": vec3, "high": vec3},
-    value_func=lambda arg_types, _: MeshQueryAABB if arg_types is None else mesh_query_aabb_t,
+    value_type=MeshQueryAABB,
     group="Geometry",
     doc="""Construct an axis-aligned bounding box query against a :class:`Mesh`.
@@ -3714,7 +4578,7 @@ add_builtin(
 add_builtin(
     "hash_grid_query",
     input_types={"id": uint64, "point": vec3, "max_dist": float},
-    value_func=lambda arg_types, _: HashGridQuery if arg_types is None else hash_grid_query_t,
+    value_type=HashGridQuery,
     group="Geometry",
     doc="""Construct a point query against a :class:`HashGrid`.
@@ -3843,10 +4707,10 @@ add_builtin(
 add_builtin("iter_next", input_types={"range": range_t}, value_type=int, group="Utility", export=False, hidden=True)
 add_builtin(
-    "iter_next", input_types={"query": hash_grid_query_t}, value_type=int, group="Utility", export=False, hidden=True
+    "iter_next", input_types={"query": HashGridQuery}, value_type=int, group="Utility", export=False, hidden=True
 )
 add_builtin(
-    "iter_next", input_types={"query": mesh_query_aabb_t}, value_type=int, group="Utility", export=False, hidden=True
+    "iter_next", input_types={"query": MeshQueryAABB}, value_type=int, group="Utility", export=False, hidden=True
 )
 add_builtin(
@@ -3889,7 +4753,7 @@ def _check_volume_type_is_supported(dtype):
 def check_volume_value_grad_compatibility(dtype, grad_dtype):
     if type_is_vector(dtype):
-        expected = matrix(shape=(type_length(dtype), 3), dtype=type_scalar_type(dtype))
+        expected = matrix(shape=(type_size(dtype), 3), dtype=type_scalar_type(dtype))
     else:
         expected = vector(length=3, dtype=dtype)
@@ -4062,6 +4926,7 @@ add_builtin(
     input_types={"id": uint64, "i": int, "j": int, "k": int, "value": float},
     group="Volumes",
     doc="""Store ``value`` at the voxel with coordinates ``i``, ``j``, ``k``.""",
+    export=False,
 )
 add_builtin(
@@ -4089,6 +4954,7 @@ add_builtin(
     input_types={"id": uint64, "i": int, "j": int, "k": int, "value": vec3},
     group="Volumes",
     doc="""Store ``value`` at the voxel with coordinates ``i``, ``j``, ``k``.""",
+    export=False,
 )
 add_builtin(
@@ -4114,6 +4980,7 @@ add_builtin(
     input_types={"id": uint64, "i": int, "j": int, "k": int, "value": int},
     group="Volumes",
     doc="""Store ``value`` at the voxel with coordinates ``i``, ``j``, ``k``.""",
+    export=False,
 )
@@ -4527,6 +5394,16 @@ add_builtin(
     native_func="builtin_tid1d",
 )
+# Register the ``block_dim`` builtin: no arguments, returns an ``int``, and is bound to the native
+# function ``builtin_block_dim`` with an empty namespace.
+add_builtin(
+    "block_dim",
+    input_types={},
+    value_type=int,
+    group="Utility",
+    doc="Returns the number of threads in the current block.",
+    namespace="",
+    native_func="builtin_block_dim",
+)
 add_builtin(
     "tid",
     input_types={},
@@ -4667,7 +5544,7 @@ def array_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any
         return array(dtype=Scalar)
     dtype = arg_values["dtype"]
-    shape = arg_values["shape"]
+    shape = extract_tuple(arg_values["shape"], as_constant=True)
     return array(dtype=dtype, ndim=len(shape))
@@ -4677,8 +5554,9 @@ def array_dispatch_func(input_types: Mapping[str, type], return_type: Any, args:
     # to the underlying C++ function's runtime and template params.
     dtype = return_type.dtype
+    shape = extract_tuple(args["shape"], as_constant=True)
-    func_args = (args["ptr"], *args["shape"])
+    func_args = (args["ptr"], *shape)
     template_args = (dtype,)
     return (func_args, template_args)
@@ -4958,6 +5836,12 @@ def create_atomic_op_value_func(op: str):
                     f"atomic_{op}() operations only work on arrays with [u]int32, [u]int64, float32, or float64 "
                     f"as the underlying scalar types, but got {type_repr(arr_type.dtype)} (with scalar type {type_repr(scalar_type)})"
                 )
+        elif op in ("cas", "exch"):
+            if not any(types_equal(scalar_type, x, match_generic=True) for x in SUPPORTED_ATOMIC_TYPES):
+                raise RuntimeError(
+                    f"atomic_{op}() operations only work on arrays with [u]int32, [u]int64, float32, or float64 "
+                    f"as the underlying scalar types, but got {type_repr(arr_type.dtype)} (with scalar type {type_repr(scalar_type)})"
+                )
         else:
             raise NotImplementedError
@@ -5187,6 +6071,120 @@ for array_type in array_types:
         skip_replay=True,
     )
+    add_builtin(
+        "atomic_cas",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "compare": Any, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("cas"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically compare and swap ``value`` with ``arr[i]`` if ``arr[i]`` equals ``compare``, and return the old value.
+    The operation is only atomic on a per-component basis for vectors and matrices.""",
+        group="Utility",
+        skip_replay=True,
+    )
+    add_builtin(
+        "atomic_cas",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "compare": Any, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("cas"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically compare and swap ``value`` with ``arr[i,j]`` if ``arr[i,j]`` equals ``compare``, and return the old value.
+    The operation is only atomic on a per-component basis for vectors and matrices.""",
+        group="Utility",
+        skip_replay=True,
+    )
+    add_builtin(
+        "atomic_cas",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "compare": Any, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("cas"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically compare and swap ``value`` with ``arr[i,j,k]`` if ``arr[i,j,k]`` equals ``compare``, and return the old value.
+    The operation is only atomic on a per-component basis for vectors and matrices.""",
+        group="Utility",
+        skip_replay=True,
+    )
+    add_builtin(
+        "atomic_cas",
+        hidden=hidden,
+        input_types={
+            "arr": array_type(dtype=Any),
+            "i": Int,
+            "j": Int,
+            "k": Int,
+            "l": Int,
+            "compare": Any,
+            "value": Any,
+        },
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("cas"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically compare and swap ``value`` with ``arr[i,j,k,l]`` if ``arr[i,j,k,l]`` equals ``compare``, and return the old value.
+    The operation is only atomic on a per-component basis for vectors and matrices.""",
+        group="Utility",
+        skip_replay=True,
+    )
+    add_builtin(
+        "atomic_exch",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("exch"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically exchange ``value`` with ``arr[i]`` and return the old value.
+    The operation is only atomic on a per-component basis for vectors and matrices.""",
+        group="Utility",
+        skip_replay=True,
+    )
+    add_builtin(
+        "atomic_exch",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("exch"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically exchange ``value`` with ``arr[i,j]`` and return the old value.
+    The operation is only atomic on a per-component basis for vectors and matrices.""",
+        group="Utility",
+        skip_replay=True,
+    )
+    add_builtin(
+        "atomic_exch",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("exch"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically exchange ``value`` with ``arr[i,j,k]`` and return the old value.
+    The operation is only atomic on a per-component basis for vectors and matrices.""",
+        group="Utility",
+        skip_replay=True,
+    )
+    add_builtin(
+        "atomic_exch",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "l": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("exch"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically exchange ``value`` with ``arr[i,j,k,l]`` and return the old value.
+    The operation is only atomic on a per-component basis for vectors and matrices.""",
+        group="Utility",
+        skip_replay=True,
+    )
 # used to index into builtin types, i.e.: y = vec3[1]
 def extract_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
@@ -5269,6 +6267,16 @@ add_builtin(
     group="Utility",
     skip_replay=True,
 )
+# implements &transformation[index]
+# (hidden overload of "index" for float transformations; reuses the vector index value/dispatch functions)
+add_builtin(
+    "index",
+    input_types={"a": transformation(dtype=Float), "i": int},
+    value_func=vector_index_value_func,
+    dispatch_func=vector_index_dispatch_func,
+    hidden=True,
+    group="Utility",
+    skip_replay=True,
+)
 # implements &(*vector)[index]
 add_builtin(
     "indexref",
@@ -5289,6 +6297,16 @@ add_builtin(
     group="Utility",
     skip_replay=True,
 )
+# implements &(*transformation)[index]
+# (hidden overload of "indexref" for float transformations; reuses the vector index value/dispatch functions)
+add_builtin(
+    "indexref",
+    input_types={"a": transformation(dtype=Float), "i": int},
+    value_func=vector_index_value_func,
+    dispatch_func=vector_index_dispatch_func,
+    hidden=True,
+    group="Utility",
+    skip_replay=True,
+)
 # implements vector[index] = value
@@ -5297,6 +6315,7 @@ add_builtin(
     input_types={"a": vector(length=Any, dtype=Scalar), "i": int, "value": Scalar},
     value_type=None,
     hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5306,6 +6325,16 @@ add_builtin(
     input_types={"a": quaternion(dtype=Scalar), "i": int, "value": Scalar},
     value_type=None,
     hidden=True,
+    export=False,
+    group="Utility",
+)
+# implements transformation[index] = value
+add_builtin(
+    "assign_inplace",
+    input_types={"a": transformation(dtype=Scalar), "i": int, "value": Scalar},
+    value_type=None,
+    hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5321,6 +6350,7 @@ add_builtin(
     input_types={"a": vector(length=Any, dtype=Scalar), "i": int, "value": Scalar},
     value_func=vector_assign_value_func,
     hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5330,6 +6360,17 @@ add_builtin(
     input_types={"a": quaternion(dtype=Scalar), "i": int, "value": Scalar},
     value_func=vector_assign_value_func,
     hidden=True,
+    export=False,
+    group="Utility",
+)
+# implements transformation[index] = value, performs a copy internally if wp.config.enable_vector_component_overwrites is True
+add_builtin(
+    "assign_copy",
+    input_types={"a": transformation(dtype=Scalar), "i": int, "value": Scalar},
+    value_func=vector_assign_value_func,
+    hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5339,6 +6380,7 @@ add_builtin(
     input_types={"a": vector(length=Any, dtype=Scalar), "i": int, "value": Scalar},
     value_type=None,
     hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5348,6 +6390,27 @@ add_builtin(
     input_types={"a": quaternion(dtype=Scalar), "i": int, "value": Scalar},
     value_type=None,
     hidden=True,
+    export=False,
+    group="Utility",
+)
+# implements transformation[idx] += scalar
+# (hidden overload with no return value; both the transformation dtype and ``value`` are declared as Float)
+add_builtin(
+    "add_inplace",
+    input_types={"a": transformation(dtype=Float), "i": int, "value": Float},
+    value_type=None,
+    hidden=True,
+    export=False,
+    group="Utility",
+)
+# implements transformation.p += vec3
+add_builtin(
+    "transform_add_inplace",
+    input_types={"a": transformation(dtype=Float), "value": vector(length=3, dtype=Float)},
+    value_type=None,
+    hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5357,6 +6420,7 @@ add_builtin(
     input_types={"a": vector(length=Any, dtype=Scalar), "i": int, "value": Scalar},
     value_type=None,
     hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5366,6 +6430,27 @@ add_builtin(
     input_types={"a": quaternion(dtype=Scalar), "i": int, "value": Scalar},
     value_type=None,
     hidden=True,
+    export=False,
+    group="Utility",
+)
+# implements transformation[idx] -= scalar
+# (hidden overload with no return value)
+# NOTE(review): this overload is declared with Scalar, while the "add_inplace" overload for
+# transformations is declared with Float -- confirm whether the difference is intentional.
+add_builtin(
+    "sub_inplace",
+    input_types={"a": transformation(dtype=Scalar), "i": int, "value": Scalar},
+    value_type=None,
+    hidden=True,
+    export=False,
+    group="Utility",
+)
+# implements transformation.p -= vec3
+add_builtin(
+    "transform_sub_inplace",
+    input_types={"a": transformation(dtype=Float), "value": vector(length=3, dtype=Float)},
+    value_type=None,
+    hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5407,7 +6492,7 @@ add_builtin(
 def matrix_vector_sametype(arg_types: Mapping[str, Any]):
-    mat_size = arg_types["a"]._shape_[0]
+    mat_size = arg_types["a"]._shape_[1]
     vec_size = arg_types["value"]._length_
     mat_type = arg_types["a"]._type_
     vec_type = arg_types["value"]._type_
@@ -5420,6 +6505,7 @@ add_builtin(
     input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "j": int, "value": Scalar},
     value_type=None,
     hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5431,6 +6517,7 @@ add_builtin(
     constraint=matrix_vector_sametype,
     value_type=None,
     hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5446,6 +6533,7 @@ add_builtin(
     input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "j": int, "value": Scalar},
     value_func=matrix_assign_value_func,
     hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5457,6 +6545,7 @@ add_builtin(
     constraint=matrix_vector_sametype,
     value_func=matrix_assign_value_func,
     hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5467,6 +6556,7 @@ add_builtin(
     input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "j": int, "value": Scalar},
     value_type=None,
     hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5478,6 +6568,7 @@ add_builtin(
     constraint=matrix_vector_sametype,
     value_type=None,
     hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5488,6 +6579,7 @@ add_builtin(
     input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "j": int, "value": Scalar},
     value_type=None,
     hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5498,6 +6590,7 @@ add_builtin(
     input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "value": vector(length=Any, dtype=Scalar)},
     value_type=None,
     hidden=True,
+    export=False,
     group="Utility",
 )
@@ -5522,6 +6615,7 @@ for t in scalar_types + vector_types + (bool,):
         doc="Prints an error to stdout if ``a`` and ``b`` are not equal",
         group="Utility",
         hidden=True,
+        export=False,
     )
@@ -5549,6 +6643,7 @@ add_builtin(
     doc="Prints an error to stdout if ``a`` and ``b`` are equal",
     group="Utility",
     hidden=True,
+    export=False,
 )
 add_builtin(
@@ -5568,6 +6663,7 @@ add_builtin(
     doc="Prints an error to stdout if ``a`` and ``b`` are equal",
     group="Utility",
     hidden=True,
+    export=False,
 )
 add_builtin(
@@ -5638,11 +6734,23 @@ add_builtin(
     group="Utility",
 )
 # fuzzy compare for float values
+def expect_near_constraint(arg_types: Mapping[str, type]):
+    """Overload constraint for ``expect_near``.
+
+    Accepts the overload only when ``a`` and ``b`` have equal types and the
+    ``tolerance`` type equals either ``a`` itself or, when ``a`` exposes a
+    ``_wp_scalar_type_`` attribute, the type stored in that attribute.
+    """
+    operand = arg_types["a"]
+    if not types_equal(operand, arg_types["b"]):
+        return False
+    # Compare the tolerance against the operand's scalar type when it has one.
+    expected = operand._wp_scalar_type_ if hasattr(operand, "_wp_scalar_type_") else operand
+    return types_equal(expected, arg_types["tolerance"])
 add_builtin(
     "expect_near",
     input_types={"a": Float, "b": Float, "tolerance": Float},
     defaults={"tolerance": 1.0e-6},
+    constraint=expect_near_constraint,
     value_type=None,
     doc="Prints an error to stdout if ``a`` and ``b`` are not closer than tolerance in magnitude",
     group="Utility",
@@ -5651,6 +6759,7 @@ add_builtin(
     "expect_near",
     input_types={"a": vector(length=Any, dtype=Float), "b": vector(length=Any, dtype=Float), "tolerance": Float},
     defaults={"tolerance": 1.0e-6},
+    constraint=expect_near_constraint,
     value_type=None,
     doc="Prints an error to stdout if any element of ``a`` and ``b`` are not closer than tolerance in magnitude",
     group="Utility",
@@ -5659,6 +6768,7 @@ add_builtin(
     "expect_near",
     input_types={"a": quaternion(dtype=Float), "b": quaternion(dtype=Float), "tolerance": Float},
     defaults={"tolerance": 1.0e-6},
+    constraint=expect_near_constraint,
     value_type=None,
     doc="Prints an error to stdout if any element of ``a`` and ``b`` are not closer than tolerance in magnitude",
     group="Utility",
@@ -5671,6 +6781,7 @@ add_builtin(
         "tolerance": Float,
     },
     defaults={"tolerance": 1.0e-6},
+    constraint=expect_near_constraint,
     value_type=None,
     doc="Prints an error to stdout if any element of ``a`` and ``b`` are not closer than tolerance in magnitude",
     group="Utility",
@@ -6088,19 +7199,19 @@ add_builtin("unot", input_types={"a": array(dtype=Any)}, value_type=builtins.boo
 # Tile operators
 def tile_unary_value_func(arg_types, arg_values):
     if arg_types is None:
-        return Tile(dtype=Any, shape=Any)
+        return tile(dtype=Scalar, shape=Tuple[int, ...])
     t = arg_types["x"]
     if not is_tile(t):
         raise TypeError(f"Expected tile for unary expression, got {t}")
-    return TileUnaryMap(t)
+    return tile(dtype=t.dtype, shape=t.shape)
 def tile_scalar_mul_value_func(arg_types, arg_values):
     if arg_types is None:
-        return Tile(dtype=Any, shape=Any)
+        return tile(dtype=Any, shape=Tuple[int, ...])
     x = arg_types["x"]
     y = arg_types["y"]
@@ -6110,19 +7221,19 @@ def tile_scalar_mul_value_func(arg_types, arg_values):
         if x.dtype != y:
             raise TypeError(f"Scalar factor type {y} does not match tile type {x.dtype} for tile*scalar")
-        return TileBinaryMap(x, TileConstant(y, x.shape))
+        return tile(dtype=x.dtype, shape=x.shape)
     # scalar*tile
     if is_tile(y):
         if y.dtype != x:
             raise TypeError(f"Scalar factor type {x} does not match tile type {y.dtype} for scalar*tile")
-        return TileBinaryMap(TileConstant(x, y.shape), y)
+        return tile(dtype=y.dtype, shape=y.shape)
 add_builtin(
     "neg",
-    input_types={"x": Tile(dtype=Any, shape=Any)},
+    input_types={"x": tile(dtype=Any, shape=Tuple[int, ...])},
     value_func=tile_unary_value_func,
     doc="Negate each element of a tile",
     export=False,
@@ -6132,7 +7243,7 @@ add_builtin(
 add_builtin(
     "add",
-    input_types={"a": Tile(dtype=Any, shape=Any), "b": Tile(dtype=Any, shape=Any)},
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "b": tile(dtype=Any, shape=Tuple[int, ...])},
     value_func=tile_binary_map_value_func,
     # dispatch_func=tile_map_dispatch_func,
     # variadic=True,
@@ -6144,7 +7255,7 @@ add_builtin(
 add_builtin(
     "sub",
-    input_types={"a": Tile(dtype=Any, shape=Any), "b": Tile(dtype=Any, shape=Any)},
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "b": tile(dtype=Any, shape=Tuple[int, ...])},
     value_func=tile_binary_map_value_func,
     # dispatch_func=tile_map_dispatch_func,
     # variadic=True,
@@ -6157,7 +7268,7 @@ add_builtin(
 add_builtin(
     "mul",
-    input_types={"x": Tile(dtype=Any, shape=Any), "y": Scalar},
+    input_types={"x": tile(dtype=Any, shape=Tuple[int, ...]), "y": Scalar},
     value_func=tile_scalar_mul_value_func,
     doc="Multiply each element of a tile by a scalar",
     export=False,
@@ -6167,7 +7278,7 @@ add_builtin(
 add_builtin(
     "mul",
-    input_types={"x": Scalar, "y": Tile(dtype=Any, shape=Any)},
+    input_types={"x": Scalar, "y": tile(dtype=Any, shape=Tuple[int, ...])},
     value_func=tile_scalar_mul_value_func,
     doc="Multiply each element of a tile by a scalar",
     export=False,
@@ -6176,9 +7287,48 @@ add_builtin(
 )
+def tile_inplace_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    """Dispatch function for in-place tile operations taking tiles ``a`` and ``b``.
+
+    :raises ValueError: if the shapes of the two tile types differ.
+    :returns: ``((a, b), ())`` - the call arguments and an empty template-argument tuple.
+    """
+    operands = (args["a"], args["b"])
+    a_type, b_type = input_types["a"], input_types["b"]
+    if a_type.shape != b_type.shape:
+        raise ValueError(f"Tile inplace arguments must have the same shape, got {a_type.shape} and {b_type.shape}")
+    # no template arguments are needed for these operations
+    return (operands, ())
+# In-place tile addition: a hidden, non-exported builtin that maps to the native
+# ``tile_add_inplace`` function. It returns nothing (``value_type=None``); the
+# shape agreement of ``a`` and ``b`` is checked by ``tile_inplace_dispatch_func``.
+add_builtin(
+    "add_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "b": tile(dtype=Any, shape=Tuple[int, ...])},
+    value_type=None,
+    dispatch_func=tile_inplace_dispatch_func,
+    export=False,
+    hidden=True,
+    native_func="tile_add_inplace",
+    group="Operators",
+)
+# In-place tile subtraction: same registration as ``add_inplace`` above, but
+# mapped to the native ``tile_sub_inplace`` function.
+add_builtin(
+    "sub_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "b": tile(dtype=Any, shape=Tuple[int, ...])},
+    value_type=None,
+    dispatch_func=tile_inplace_dispatch_func,
+    export=False,
+    hidden=True,
+    native_func="tile_sub_inplace",
+    group="Operators",
+)
 def tile_diag_add_value_func(arg_types, arg_values):
     if arg_types is None:
-        return Tile(dtype=Any, shape=Any)
+        return tile(dtype=Any, shape=Tuple[int, int])
     a = arg_types["a"]
     d = arg_types["d"]
@@ -6208,7 +7358,7 @@ def tile_diag_add_value_func(arg_types, arg_values):
         )
     # use first argument to define output type
-    return Tile(dtype=a.dtype, shape=a.shape, storage="shared")
+    return tile(dtype=a.dtype, shape=a.shape, layout=a.layout, strides=a.strides, storage="shared")
 def tile_diag_add_lto_dispatch_func(
@@ -6230,7 +7380,7 @@ def tile_diag_add_lto_dispatch_func(
 add_builtin(
     "tile_diag_add",
-    input_types={"a": Tile(dtype=Any, shape=Any), "d": Tile(dtype=Any, shape=Any)},
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, int]), "d": tile(dtype=Any, shape=Tuple[int])},
     value_func=tile_diag_add_value_func,
     lto_dispatch_func=tile_diag_add_lto_dispatch_func,
     native_func="tile_diag_add",
@@ -6240,18 +7390,40 @@ add_builtin(
 )
-##
-## MathDx, LTOIR-based, Tile functions
-##
+##
+## MathDx, LTOIR-based, Tile functions
+##
+##
+## Matmul
+##
+def tile_matmul_out_value_func(arg_types, arg_values):
+    """Value function for the ``tile_matmul(a, b, out)`` overload.
+
+    The overload has no return value, so this always yields ``None``; when
+    argument types are supplied it only verifies that all three are tiles.
+
+    :raises TypeError: if ``a``, ``b`` or ``out`` is not a tile.
+    """
+    # argument types are absent for generic queries (for doc builds)
+    if arg_types is not None:
+        a, b = arg_types["a"], arg_types["b"]
+        if not is_tile(a):
+            raise TypeError(f"tile_matmul() 'a' argument must be a tile, got {a!r}")
+        if not is_tile(b):
+            raise TypeError(f"tile_matmul() 'b' argument must be a tile, got {b!r}")
+        out = arg_types["out"]
+        if not is_tile(out):
+            raise TypeError(f"tile_matmul() 'out' argument must be a tile, got {arg_types['out']!r}")
+    return None
-##
-## Matmul
-##
 def tile_matmul_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
-        return Tile(dtype=Any, shape=Any)
+        return tile(dtype=Float, shape=Tuple[int, int])
     a = arg_types["a"]
     b = arg_types["b"]
@@ -6262,16 +7434,7 @@ def tile_matmul_value_func(arg_types, arg_values):
     if not is_tile(b):
         raise TypeError(f"tile_matmul() 'b' argument must be a tile, got {b!r}")
-    # out = wp.tile_matmul(a, b)
-    if len(arg_types) == 2:
-        return Tile(dtype=a.dtype, shape=(a.shape[0], b.shape[1]), storage="shared")
-    # wp.tile_matmul(a, b, out)
-    elif len(arg_types) == 3:
-        if not is_tile(arg_types["out"]):
-            raise TypeError(f"tile_matmul() 'out' argument must be a tile, got {arg_types['out']!r}")
-    return None
+    return tile(dtype=a.dtype, shape=(a.shape[0], b.shape[1]), storage="shared")
 def tile_matmul_lto_dispatch_func(
@@ -6345,36 +7508,41 @@ def tile_matmul_lto_dispatch_func(
             num_threads,
             builder,
         )
-        # adjA += adjC * B^T - Transpose ~= flipped layout
-        (fun_backward_A, lto_backward_A) = warp.build.build_lto_dot(
-            M,
-            K,
-            N,
-            out.type.dtype,
-            b.type.dtype,
-            a.type.dtype,
-            out.type.layout,
-            tile_flip_layout(b.type.layout),
-            a.type.layout,
-            arch,
-            num_threads,
-            builder,
-        )
-        # adjB += A^T * adjC - Transpose ~= flipped layout
-        (fun_backward_B, lto_backward_B) = warp.build.build_lto_dot(
-            K,
-            N,
-            M,
-            a.type.dtype,
-            out.type.dtype,
-            b.type.dtype,
-            tile_flip_layout(a.type.layout),
-            out.type.layout,
-            b.type.layout,
-            arch,
-            num_threads,
-            builder,
-        )
+        if warp.config.enable_backward:
+            # adjA += adjC * B^T - Transpose ~= flipped layout
+            (fun_backward_A, lto_backward_A) = warp.build.build_lto_dot(
+                M,
+                K,
+                N,
+                out.type.dtype,
+                b.type.dtype,
+                a.type.dtype,
+                out.type.layout,
+                tile_flip_layout(b.type.layout),
+                a.type.layout,
+                arch,
+                num_threads,
+                builder,
+            )
+            # adjB += A^T * adjC - Transpose ~= flipped layout
+            (fun_backward_B, lto_backward_B) = warp.build.build_lto_dot(
+                K,
+                N,
+                M,
+                a.type.dtype,
+                out.type.dtype,
+                b.type.dtype,
+                tile_flip_layout(a.type.layout),
+                out.type.layout,
+                b.type.layout,
+                arch,
+                num_threads,
+                builder,
+            )
+        else:
+            # adjoints aren't computed, so we reuse fun_forward as a dummy arg
+            (fun_backward_A, lto_backward_A) = (fun_forward, None)
+            (fun_backward_B, lto_backward_B) = (fun_forward, None)
         return (
             (
@@ -6394,11 +7562,11 @@ def tile_matmul_lto_dispatch_func(
 add_builtin(
     "tile_matmul",
     input_types={
-        "a": Tile(dtype=Any, shape=Any),
-        "b": Tile(dtype=Any, shape=Any),
-        "out": Tile(dtype=Any, shape=Any),
+        "a": tile(dtype=Float, shape=Tuple[int, int]),
+        "b": tile(dtype=Float, shape=Tuple[int, int]),
+        "out": tile(dtype=Float, shape=Tuple[int, int]),
     },
-    value_func=tile_matmul_value_func,
+    value_func=tile_matmul_out_value_func,
     lto_dispatch_func=tile_matmul_lto_dispatch_func,
     variadic=False,
     doc="""Computes the matrix product and accumulates ``out += a*b``.
@@ -6420,7 +7588,7 @@ add_builtin(
 add_builtin(
     "tile_matmul",
-    input_types={"a": Tile(dtype=Any, shape=Any), "b": Tile(dtype=Any, shape=Any)},
+    input_types={"a": tile(dtype=Float, shape=Tuple[int, int]), "b": tile(dtype=Float, shape=Tuple[int, int])},
     value_func=tile_matmul_value_func,
     lto_dispatch_func=tile_matmul_lto_dispatch_func,
     variadic=False,
@@ -6447,7 +7615,7 @@ add_builtin(
 ##
 def tile_fft_generic_value_func(arg_types, arg_values):
     if arg_types is None:
-        return Tile(dtype=Any, shape=Any)
+        return tile(dtype=vector(length=2, dtype=Float), shape=Tuple[int, int])
     if len(arg_types) != 1:
         raise TypeError(f"tile_fft() takes exactly 1 positional argument but {len(arg_types)} were given")
@@ -6475,7 +7643,7 @@ def tile_fft_generic_lto_dispatch_func(
     arg_values: Mapping[str, Var],
     options: Mapping[str, Any],
     builder: warp.context.ModuleBuilder,
-    direction: str = None,
+    direction: str | None = None,
 ):
     inout = arg_values["inout"]
     inout.type.storage = "register"
@@ -6529,7 +7697,7 @@ def tile_fft_generic_lto_dispatch_func(
 add_builtin(
     "tile_fft",
-    input_types={"inout": Tile},
+    input_types={"inout": tile(dtype=vector(length=2, dtype=Float), shape=Tuple[int, int])},
     value_func=tile_fft_generic_value_func,
     lto_dispatch_func=functools.partial(tile_fft_generic_lto_dispatch_func, direction="forward"),
     variadic=True,
@@ -6550,7 +7718,7 @@ add_builtin(
 add_builtin(
     "tile_ifft",
-    input_types={"inout": Tile},
+    input_types={"inout": tile(dtype=vector(length=2, dtype=Float), shape=Tuple[int, int])},
     value_func=tile_fft_generic_value_func,
     lto_dispatch_func=functools.partial(tile_fft_generic_lto_dispatch_func, direction="inverse"),
     variadic=True,
@@ -6575,7 +7743,7 @@ add_builtin(
 ##
 def tile_cholesky_generic_value_func(arg_types, arg_values):
     if arg_types is None:
-        return Tile(dtype=Any, shape=Any)
+        return tile(dtype=Float, shape=Tuple[int, int])
     if len(arg_types) != 1:
         raise TypeError("tile_cholesky() requires 1 positional args")
@@ -6591,15 +7759,19 @@ def tile_cholesky_generic_value_func(arg_types, arg_values):
     if a.shape[0] != a.shape[1]:
         raise ValueError("tile_cholesky() argument must be square")
-    return Tile(dtype=a.dtype, shape=a.shape, storage="shared")
+    return tile(dtype=a.dtype, shape=a.shape, layout=a.layout, strides=a.strides, storage="shared")
-cusolver_function_map = {"getrf": 0, "getrf_no_pivot": 1, "potrf": 2, "potrs": 3}
+cusolver_function_map = {"getrf": 0, "getrf_no_pivot": 1, "potrf": 2, "potrs": 3, "trsm": 4}
 cusolver_type_map = {float32: ("wp::float32", 5), float64: ("wp::float64", 6)}
 cusolver_fill_mode_map = {"upper": 0, "lower": 1}
+cusolver_side_map = {"-": -1, "left": 0, "right": 1}
+cusolver_diag_map = {"-": -1, "unit": 0, "nounit": 1}
 def tile_cholesky_generic_lto_dispatch_func(
     arg_types: Mapping[str, type],
@@ -6623,20 +7795,20 @@ def tile_cholesky_generic_lto_dispatch_func(
     dtype, precision_enum = cusolver_type_map[a.type.dtype]
     # We already ensured a is square in tile_cholesky_generic_value_func()
-    M, N = a.type.shape[0], a.type.shape[1]
+    M, N = a.type.shape
     if out.type.shape[0] != M or out.type.shape[1] != M:
         raise ValueError("tile_cholesky() output tile must be square")
     solver = "potrf"
     solver_enum = cusolver_function_map[solver]
-    # cuSOLVERDx only supports col-major input/outputs,
-    # so we use upper to mimic a row-major input
-    fill_mode = cusolver_fill_mode_map["upper"]
+    side_enum = cusolver_side_map["-"]
+    diag_enum = cusolver_diag_map["-"]
+    fill_mode = cusolver_fill_mode_map["lower"]
     arch = options["output_arch"]
     num_threads = options["block_dim"]
-    parameter_list = f"({dtype}*, unsigned)"
+    parameter_list = f"({dtype}*, int*)"
     if arch is None or not warp.context.runtime.core.is_mathdx_enabled():
         # CPU/no-MathDx dispatch
@@ -6646,8 +7818,13 @@ def tile_cholesky_generic_lto_dispatch_func(
         lto_symbol, lto_code_data = warp.build.build_lto_solver(
             M,
             N,
+            1,
             solver,
             solver_enum,
+            side_enum,
+            diag_enum,
+            a.type.layout,
+            out.type.layout,
             fill_mode,
             arch,
             precision_enum,
@@ -6661,20 +7838,23 @@ def tile_cholesky_generic_lto_dispatch_func(
 add_builtin(
     "tile_cholesky",
-    input_types={"A": Tile},
+    input_types={"A": tile(dtype=Float, shape=Tuple[int, int])},
     value_func=tile_cholesky_generic_value_func,
     lto_dispatch_func=tile_cholesky_generic_lto_dispatch_func,
     variadic=True,
     doc="""Compute the Cholesky factorization L of a matrix A.
     L is lower triangular and satisfies LL^T = A.
+    Only the lower triangular portion of A is used for the decomposition;
+    the upper triangular part may be left unspecified.
     Note that computing the adjoint is not yet supported.
     Supported datatypes are:
         * float32
         * float64
-    :param A: A square, symmetric positive-definite, matrix.
+    :param A: A square, symmetric positive-definite, matrix. Only the lower triangular part of A is needed; the upper part is ignored.
     :returns L: A square, lower triangular, matrix, such that LL^T = A""",
     group="Tile Primitives",
     export=False,
@@ -6690,30 +7870,30 @@ def tile_cholesky_solve_generic_value_func(arg_types, arg_values):
         raise TypeError("tile_cholesky_solve() requires exactly 2 positional args")
     l = arg_types["L"]
-    x = arg_types["x"]
+    y = arg_types["y"]
     if not is_tile(l):
         raise TypeError(f"tile_cholesky_solve() 'L' argument must be a tile, got {l!r}")
-    if not is_tile(x):
-        raise TypeError(f"tile_cholesky_solve() 'x' argument must be a tile, got {l!r}")
+    if not is_tile(y):
+        raise TypeError(f"tile_cholesky_solve() 'y' argument must be a tile, got {l!r}")
-    if not types_equal(l.dtype, x.dtype):
-        raise TypeError(f"tile_cholesky_solve() arguments must have the same dtype, got {l.dtype} and {x.dtype}")
+    if not types_equal(l.dtype, y.dtype):
+        raise TypeError(f"tile_cholesky_solve() arguments must have the same dtype, got {l.dtype} and {y.dtype}")
     if l.shape[0] != l.shape[1]:
         raise ValueError("tile_cholesky_solve() 'L' argument must be square")
-    if len(x.shape) != 1:
-        raise TypeError("tile_cholesky_solve() 'x' argument must be a 1D tile")
+    if len(y.shape) > 2 or len(y.shape) < 1:
+        raise TypeError("tile_cholesky_solve() 'y' argument must be a 1D or 2D tile")
-    if x.shape[0] != l.shape[0]:
+    if y.shape[0] != l.shape[0]:
         raise ValueError(
-            f"tile_cholesky_solve() 'x' argument must have the same number of elements as the number of rows in 'L', "
-            f"got {x.shape[0]} elements in 'x' and {l.shape[0]} rows in 'L'"
+            f"tile_cholesky_solve() 'y' argument must have the same number of elements as the number of rows in 'L', "
+            f"got {y.shape[0]} elements in 'x' and {l.shape[0]} rows in 'L'"
         )
-    return Tile(dtype=l.dtype, shape=x.shape, storage="shared")
+    return tile(dtype=l.dtype, shape=y.shape, layout=y.layout, strides=y.strides, storage="shared")
 def tile_cholesky_solve_generic_lto_dispatch_func(
@@ -6725,37 +7905,38 @@ def tile_cholesky_solve_generic_lto_dispatch_func(
     builder: warp.context.ModuleBuilder,
 ):
     L = arg_values["L"]
-    x = arg_values["x"]
+    y = arg_values["y"]
     # force the storage type of the input variables to shared memory
     L.type.storage = "shared"
-    x.type.storage = "shared"
+    y.type.storage = "shared"
     if len(return_values) != 1:
         raise TypeError(f"tile_cholesky_solve() must return exactly one value, got {len(return_values)}")
-    y = return_values[0]
+    x = return_values[0]
-    if any(T not in cusolver_type_map.keys() for T in [x.type.dtype, L.type.dtype]):
+    if any(T not in cusolver_type_map.keys() for T in [y.type.dtype, L.type.dtype]):
         raise TypeError("tile_cholesky_solve() arguments be tiles of float64 or float32")
     dtype, precision_enum = cusolver_type_map[L.type.dtype]
-    M, N = L.type.shape[0], L.type.shape[1]
+    M, N = L.type.shape
+    NRHS = x.type.shape[1] if len(x.type.shape) > 1 else 1
-    if len(y.type.shape) != 1:
-        raise TypeError("tile_cholesky_solve() output vector must be 1D")
+    if len(x.type.shape) > 2 or len(x.type.shape) < 1:
+        raise TypeError(f"tile_cholesky_solve() output vector must be 1D or 2D, got {len(x.type.shape)}-D")
-    if y.type.shape[0] != M:
+    if x.type.shape[0] != M:
         raise ValueError(
             "tile_cholesky_solve() output vector must have same number of elements as the number of rows in 'L' "
-            f"got {y.type.shape[0]} elements in output and {M} rows in 'L'"
+            f"got {x.type.shape[0]} elements in output and {M} rows in 'L'"
         )
     solver = "potrs"
     solver_enum = cusolver_function_map[solver]
-    # cuSOLVERDx only supports col-major input/outputs,
-    # so we use upper to mimic a row-major input
-    fill_mode = cusolver_fill_mode_map["upper"]
+    side_enum = cusolver_side_map["-"]
+    diag_enum = cusolver_diag_map["-"]
+    fill_mode = cusolver_fill_mode_map["lower"]
     arch = options["output_arch"]
     num_threads = options["block_dim"]
@@ -6763,14 +7944,19 @@ def tile_cholesky_solve_generic_lto_dispatch_func(
     if arch is None or not warp.context.runtime.core.is_mathdx_enabled():
         # CPU/no-MathDx dispatch
-        return ((0, L, x, y), [], [], 0)
+        return ((0, L, y, x), [], [], 0)
     else:
         # generate the LTO
         lto_symbol, lto_code_data = warp.build.build_lto_solver(
             M,
             N,
+            NRHS,
             solver,
             solver_enum,
+            side_enum,
+            diag_enum,
+            L.type.layout,
+            y.type.layout,
             fill_mode,
             arch,
             precision_enum,
@@ -6779,12 +7965,12 @@ def tile_cholesky_solve_generic_lto_dispatch_func(
             builder,
         )
-        return ((Var(lto_symbol, str, False, True, False), L, x, y), [], [lto_code_data], 0)
+        return ((Var(lto_symbol, str, False, True, False), L, y, x), [], [lto_code_data], 0)
 add_builtin(
     "tile_cholesky_solve",
-    input_types={"L": Tile, "x": Tile},
+    input_types={"L": tile(dtype=Float, shape=Tuple[int, int]), "y": tile(dtype=Float, shape=Tuple[int])},
     value_func=tile_cholesky_solve_generic_value_func,
     lto_dispatch_func=tile_cholesky_solve_generic_lto_dispatch_func,
     variadic=True,
@@ -6797,13 +7983,276 @@ add_builtin(
         * float64
     :param L: A square, lower triangular, matrix, such that LL^T = A
-    :param x: An 1D tile of length M
-    :returns y: An 1D tile of length M such that LL^T y = x""",
+    :param y: A 1D or 2D tile of length M
+    :returns x: A tile of the same shape as y such that LL^T x = y""",
+    group="Tile Primitives",
+    export=False,
+    namespace="",
+)
+def tile_lower_solve_generic_lto_dispatch_func(
+    arg_types: Mapping[str, type],
+    return_type: Any,
+    return_values: List[Var],
+    arg_values: Mapping[str, Var],
+    options: Mapping[str, Any],
+    builder: warp.context.ModuleBuilder,
+):
+    """LTO dispatch for ``tile_lower_solve()``.
+
+    Marks ``L`` and ``y`` as shared-storage tiles, validates dtypes and the
+    shape of the single output tile, then returns a 4-tuple whose first
+    element holds the native call arguments ``(symbol, L, y, z)``. On the
+    CPU / no-MathDx path the symbol slot is ``0`` and no LTO is built.
+
+    :raises TypeError: unsupported dtype, wrong number of return values, or
+        an output tile that is not 1D or 2D.
+    :raises ValueError: output row count differs from the rows of ``L``.
+    """
+    L, y = arg_values["L"], arg_values["y"]
+    # force the storage type of the input variables to shared memory
+    for operand in (L, y):
+        operand.type.storage = "shared"
+    if not all(T in cusolver_type_map for T in (y.type.dtype, L.type.dtype)):
+        raise TypeError("tile_lower_solve() arguments must be tiles of float64 or float32")
+    if len(return_values) != 1:
+        raise TypeError(f"tile_lower_solve() must return exactly one value, got {len(return_values)}")
+    z = return_values[0]
+    dtype, precision_enum = cusolver_type_map[L.type.dtype]
+    M, N = L.type.shape
+    rank = len(z.type.shape)
+    # a 1D right-hand side is treated as a single column
+    NRHS = z.type.shape[1] if rank > 1 else 1
+    if not 1 <= rank <= 2:
+        raise TypeError(f"tile_lower_solve() output vector must be 1D or 2D, got {len(z.type.shape)}-D")
+    if z.type.shape[0] != M:
+        raise ValueError(
+            "tile_lower_solve() output vector must have same number of elements as the number of rows in 'L' "
+            f"got {z.type.shape[0]} elements in output and {M} rows in 'L'"
+        )
+    arch = options["output_arch"]
+    num_threads = options["block_dim"]
+    if arch is None or not warp.context.runtime.core.is_mathdx_enabled():
+        # CPU/no-MathDx dispatch
+        return ((0, L, y, z), [], [], 0)
+    # generate the LTO: left-sided, non-unit-diagonal, lower-triangular "trsm"
+    solver = "trsm"
+    lto_symbol, lto_code_data = warp.build.build_lto_solver(
+        M,
+        N,
+        NRHS,
+        solver,
+        cusolver_function_map[solver],
+        cusolver_side_map["left"],
+        cusolver_diag_map["nounit"],
+        L.type.layout,
+        y.type.layout,
+        cusolver_fill_mode_map["lower"],
+        arch,
+        precision_enum,
+        num_threads,
+        f"({dtype}*, {dtype}*)",
+        builder,
+    )
+    return ((Var(lto_symbol, str, False, True, False), L, y, z), [], [lto_code_data], 0)
+def tile_lower_solve_generic_value_func(arg_types, arg_values):
+    """Compute the return type of ``tile_lower_solve(L, y)``.
+
+    With no argument types, returns a generic 1D float tile type. Otherwise
+    validates ``L`` and ``y`` and returns a shared-storage tile type with the
+    dtype of ``L`` and the shape, layout and strides of ``y``.
+
+    :raises TypeError: wrong argument count, non-tile argument, dtype
+        mismatch, or ``y`` not 1D/2D.
+    :raises ValueError: ``L`` not square, or row count of ``y`` differs from ``L``.
+    """
+    if arg_types is None:
+        return tile(dtype=Float, shape=Tuple[int])
+    if len(arg_types) != 2:
+        raise TypeError("tile_lower_solve() requires exactly 2 positional args")
+    l, y = arg_types["L"], arg_types["y"]
+    # both operands must be tiles ...
+    if not is_tile(l):
+        raise TypeError(f"tile_lower_solve() 'L' argument must be a tile, got {l!r}")
+    if not is_tile(y):
+        raise TypeError(f"tile_lower_solve() 'y' argument must be a tile, got {y!r}")
+    # ... of the same element type
+    if not types_equal(l.dtype, y.dtype):
+        raise TypeError(f"tile_lower_solve() arguments must have the same dtype, got {l.dtype} and {y.dtype}")
+    if l.shape[0] != l.shape[1]:
+        raise ValueError("tile_lower_solve() 'L' argument must be square")
+    if not 1 <= len(y.shape) <= 2:
+        raise TypeError("tile_lower_solve() 'y' argument must be a 1D or 2D tile")
+    if y.shape[0] != l.shape[0]:
+        raise ValueError(
+            f"tile_lower_solve() 'y' argument must have the same number of elements as the number of rows in 'L', "
+            f"got {y.shape[0]} elements in 'y' and {l.shape[0]} rows in 'L'"
+        )
+    # the result mirrors the right-hand side
+    return tile(dtype=l.dtype, shape=y.shape, layout=y.layout, strides=y.strides, storage="shared")
+# Registers the ``tile_lower_solve`` builtin in the "Tile Primitives" group
+# (not exported), wiring it to tile_lower_solve_generic_value_func for type
+# checking and tile_lower_solve_generic_lto_dispatch_func for dispatch.
+add_builtin(
+    "tile_lower_solve",
+    input_types={"L": tile(dtype=Float, shape=Tuple[int, int]), "y": tile(dtype=Float, shape=Tuple[int])},
+    value_func=tile_lower_solve_generic_value_func,
+    lto_dispatch_func=tile_lower_solve_generic_lto_dispatch_func,
+    variadic=True,
+    doc="""Solve for z in Lz = y, where L is a lower triangular matrix.
+    This performs general forward substitution for a lower triangular system.
+    Note that computing the adjoint is not yet supported.
+    Supported datatypes are:
+        * float32
+        * float64
+    :param L: A square, non-singular, lower triangular matrix
+    :param y: A 1D or 2D tile with compatible shape
+    :returns z: A tile of the same shape as y such that Lz = y""",
+    group="Tile Primitives",
+    export=False,
+    namespace="",
+)
+def tile_upper_solve_generic_lto_dispatch_func(
+    arg_types: Mapping[str, type],
+    return_type: Any,
+    return_values: List[Var],
+    arg_values: Mapping[str, Var],
+    options: Mapping[str, Any],
+    builder: warp.context.ModuleBuilder,
+):
+    """LTO dispatch for ``tile_upper_solve()``.
+
+    Marks ``U`` and ``z`` as shared-storage tiles, validates dtypes and the
+    shape of the single output tile ``x``, then returns a 4-tuple whose first
+    element holds the native call arguments ``(symbol, U, z, x)``. On the
+    CPU / no-MathDx path the symbol slot is ``0`` and no LTO is built.
+
+    :raises TypeError: unsupported dtype, wrong number of return values, or
+        an output tile that is not 1D or 2D.
+    :raises ValueError: output row count differs from the rows of ``U``.
+    """
+    U = arg_values["U"]
+    z = arg_values["z"]
+    # force the storage type of the input variables to shared memory
+    U.type.storage = "shared"
+    z.type.storage = "shared"
+    if any(T not in cusolver_type_map.keys() for T in [z.type.dtype, U.type.dtype]):
+        raise TypeError("tile_upper_solve() arguments must be tiles of float64 or float32")
+    if len(return_values) != 1:
+        raise TypeError(f"tile_upper_solve() must return exactly one value, got {len(return_values)}")
+    x = return_values[0]
+    dtype, precision_enum = cusolver_type_map[U.type.dtype]
+    M, N = U.type.shape
+    # a 1D right-hand side is treated as a single column
+    NRHS = x.type.shape[1] if len(x.type.shape) > 1 else 1
+    # Validate the OUTPUT tile `x` (the input `z` is already checked by the
+    # value function), so the checks agree with their error messages and with
+    # the lower-solve dispatcher.
+    if len(x.type.shape) > 2 or len(x.type.shape) < 1:
+        raise TypeError(f"tile_upper_solve() output tile must be 1D or 2D, got {len(x.type.shape)}-D")
+    if x.type.shape[0] != M:
+        raise ValueError(
+            "tile_upper_solve() output tile must have same number of elements as the number of rows in 'U' "
+            f"got {x.type.shape[0]} elements in output and {M} rows in 'U'"
+        )
+    # left-sided, non-unit-diagonal, upper-triangular "trsm"
+    solver = "trsm"
+    solver_enum = cusolver_function_map[solver]
+    side_enum = cusolver_side_map["left"]
+    diag_enum = cusolver_diag_map["nounit"]
+    fill_mode = cusolver_fill_mode_map["upper"]
+    arch = options["output_arch"]
+    num_threads = options["block_dim"]
+    parameter_list = f"({dtype}*, {dtype}*)"
+    if arch is None or not warp.context.runtime.core.is_mathdx_enabled():
+        # CPU/no-MathDx dispatch
+        return ((0, U, z, x), [], [], 0)
+    else:
+        # generate the LTO
+        lto_symbol, lto_code_data = warp.build.build_lto_solver(
+            M,
+            N,
+            NRHS,
+            solver,
+            solver_enum,
+            side_enum,
+            diag_enum,
+            U.type.layout,
+            z.type.layout,
+            fill_mode,
+            arch,
+            precision_enum,
+            num_threads,
+            parameter_list,
+            builder,
+        )
+        return ((Var(lto_symbol, str, False, True, False), U, z, x), [], [lto_code_data], 0)
+def tile_upper_solve_generic_value_func(arg_types, arg_values):
+    """Compute the return type of ``tile_upper_solve(U, z)``.
+
+    With no argument types, returns a generic 1D float tile type. Otherwise
+    validates ``U`` and ``z`` and returns a shared-storage tile type with the
+    dtype of ``U`` and the shape, layout and strides of ``z``.
+
+    :raises TypeError: wrong argument count, non-tile argument, dtype
+        mismatch, or ``z`` not 1D/2D.
+    :raises ValueError: ``U`` not square, or row count of ``z`` differs from ``U``.
+    """
+    if arg_types is None:
+        return tile(dtype=Float, shape=Tuple[int])
+    if len(arg_types) != 2:
+        raise TypeError("tile_upper_solve() requires exactly 2 positional args")
+    u, z = arg_types["U"], arg_types["z"]
+    # both operands must be tiles ...
+    if not is_tile(u):
+        raise TypeError(f"tile_upper_solve() 'U' argument must be a tile, got {u!r}")
+    if not is_tile(z):
+        raise TypeError(f"tile_upper_solve() 'z' argument must be a tile, got {z!r}")
+    # ... of the same element type
+    if not types_equal(u.dtype, z.dtype):
+        raise TypeError(f"tile_upper_solve() arguments must have the same dtype, got {u.dtype} and {z.dtype}")
+    if u.shape[0] != u.shape[1]:
+        raise ValueError("tile_upper_solve() 'U' argument must be square")
+    if not 1 <= len(z.shape) <= 2:
+        raise TypeError("tile_upper_solve() 'z' argument must be a 1D or 2D tile")
+    if z.shape[0] != u.shape[0]:
+        raise ValueError(
+            f"tile_upper_solve() 'z' argument must have the same number of elements as the number of rows in 'U', "
+            f"got {z.shape[0]} elements in 'z' and {u.shape[0]} rows in 'U'"
+        )
+    # the result mirrors the right-hand side
+    return tile(dtype=u.dtype, shape=z.shape, layout=z.layout, strides=z.strides, storage="shared")
+# Registers the ``tile_upper_solve`` builtin in the "Tile Primitives" group
+# (not exported), wiring it to tile_upper_solve_generic_value_func for type
+# checking and tile_upper_solve_generic_lto_dispatch_func for dispatch.
+add_builtin(
+    "tile_upper_solve",
+    input_types={"U": tile(dtype=Float, shape=Tuple[int, int]), "z": tile(dtype=Float, shape=Tuple[int])},
+    value_func=tile_upper_solve_generic_value_func,
+    lto_dispatch_func=tile_upper_solve_generic_lto_dispatch_func,
+    variadic=True,
+    doc="""Solve for x in U x = z, where U is an upper triangular matrix.
+    This performs general back substitution for upper triangular systems.
+    Note that computing the adjoint is not yet supported.
+    Supported datatypes are:
+        * float32
+        * float64
+    :param U: A square, non-singular, upper triangular matrix
+    :param z: A 1D or 2D tile with compatible shape
+    :returns x: A tile of the same shape as z such that U x = z""",
     group="Tile Primitives",
     export=False,
     namespace="",
 )
 # ---------------------------------
 # Code Generation
@@ -6840,7 +8289,7 @@ def static(expr):
 add_builtin(
     "len",
     input_types={"a": vector(length=Any, dtype=Scalar)},
-    value_type=int,
+    value_func=static_len_value_func,
     doc="Return the number of elements in a vector.",
     group="Utility",
     export=False,
@@ -6849,7 +8298,7 @@ add_builtin(
 add_builtin(
     "len",
     input_types={"a": quaternion(dtype=Scalar)},
-    value_type=int,
+    value_func=static_len_value_func,
     doc="Return the number of elements in a quaternion.",
     group="Utility",
     export=False,
@@ -6858,7 +8307,7 @@ add_builtin(
 add_builtin(
     "len",
     input_types={"a": matrix(shape=(Any, Any), dtype=Scalar)},
-    value_type=int,
+    value_func=static_len_value_func,
     doc="Return the number of rows in a matrix.",
     group="Utility",
     export=False,
@@ -6867,7 +8316,7 @@ add_builtin(
 add_builtin(
     "len",
     input_types={"a": transformation(dtype=Float)},
-    value_type=int,
+    value_func=static_len_value_func,
     doc="Return the number of elements in a transformation.",
     group="Utility",
     export=False,
@@ -6884,9 +8333,83 @@ add_builtin(
+# `len()` overload for tiles (any dtype, shape declared as Tuple[int, ...]); the result
+# type is computed by `static_len_value_func`.
 add_builtin(
     "len",
-    input_types={"a": Tile(dtype=Any, shape=Any)},
-    value_type=int,
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...])},
+    value_func=static_len_value_func,
     doc="Return the number of rows in a tile.",
     group="Utility",
     export=False,
 )
+# ---------------------------------
+# Tuple
+def tuple_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    """Compute the return type of the ``tuple`` builtin.
+    :param arg_types: Argument types by name; the variadic entries are read from ``"args"``
+    :param arg_values: Argument values by name; the variadic entries are read from ``"args"``
+    :returns: A ``tuple_t`` built from the ``"args"`` types and values"""
+    element_types = arg_types["args"]
+    element_values = arg_values["args"]
+    return tuple_t(element_types, element_values)
+def tuple_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    """Lay out the call arguments of the ``tuple`` builtin.
+    :param input_types: Unused
+    :param return_type: Unused
+    :param args: Call arguments by name; the variadic entries are read from ``"args"``
+    :returns: ``(func_args, template_args)`` where ``func_args`` is ``args["args"]`` (an empty
+        tuple when the key is absent) and ``template_args`` is always empty"""
+    # The tuple constructor takes no template arguments, only the forwarded values.
+    return (args.get("args", ()), ())
+# Register the `tuple` builtin: a variadic constructor (`*args` of any type) whose return
+# type is computed by `tuple_value_func` and whose call arguments are laid out by
+# `tuple_dispatch_func`. Registered with hidden=True, missing_grad=True and export=False.
+add_builtin(
+    "tuple",
+    input_types={"*args": Any},
+    value_func=tuple_value_func,
+    dispatch_func=tuple_dispatch_func,
+    variadic=True,
+    doc="Construct a tuple from a list of values",
+    group="Utility",
+    hidden=True,
+    missing_grad=True,
+    export=False,
+)
+def tuple_extract_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    """Compute the return type of ``extract(a, i)``: the type of element ``i`` of tuple ``a``.
+    :param arg_types: Argument types by name; ``"a"`` is the tuple type (or a plain sequence of element types)
+    :param arg_values: Argument values by name; ``"i"`` must be present and must not be a ``Var``
+    :returns: The element type stored at position ``i``
+    :raises RuntimeError: If ``i`` is not a compile-time value, or if it lies outside
+        ``[-len, len)`` for the tuple's element list"""
+    tuple_type = arg_types["a"]
+    # Tuple types expose their element types via `.types`; anything else is indexed directly.
+    elements = tuple_type.types if is_tuple(tuple_type) else tuple_type
+    if "i" not in arg_values:
+        raise RuntimeError("Tuple index must be a compile time expression.")
+    index = arg_values["i"]
+    if isinstance(index, Var):
+        raise RuntimeError("Tuple index must be a compile time expression.")
+    length = len(elements)
+    if index >= length:
+        raise RuntimeError(f"Tuple index out of bounds, {index} >= {length}")
+    # Mirror the upper-bound check so an index below -length reports the same
+    # RuntimeError instead of escaping as a bare IndexError from the lookup below.
+    if index < -length:
+        raise RuntimeError(f"Tuple index out of bounds, {index} < {-length}")
+    return elements[index]
+def tuple_extract_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    """Lay out the call arguments of ``extract(a, i)``.
+    :param input_types: Unused
+    :param return_type: Unused
+    :param args: Call arguments by name; reads ``"a"`` and the ``constant`` attribute of ``"i"``
+    :returns: ``(func_args, template_args)`` with the tuple as the only function argument
+        and the index constant as the only template argument"""
+    tuple_arg = args["a"]
+    index_constant = args["i"].constant
+    return ((tuple_arg,), (index_constant,))
+# Register `extract(a, i)` for tuples. `tuple_extract_value_func` rejects an index that is
+# missing or a `Var`, and `tuple_extract_dispatch_func` forwards the index constant as a
+# template argument.
+# NOTE(review): unlike the neighbouring registrations, this one passes neither `doc=` nor
+# `export=False` — confirm that is intentional.
+add_builtin(
+    "extract",
+    input_types={"a": Tuple, "i": int},
+    value_func=tuple_extract_value_func,
+    dispatch_func=tuple_extract_dispatch_func,
+    group="Utility",
+    hidden=True,
+    missing_grad=True,
+)
+# `len()` overload for tuples; the result type is computed by `static_len_value_func`.
+add_builtin(
+    "len",
+    input_types={"a": Tuple},
+    value_func=static_len_value_func,
+    doc="Return the number of elements in a tuple.",
+    group="Utility",
+    export=False,
+)