warp_lang-1.6.2-py3-none-win_amd64.whl → warp_lang-1.7.1-py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of warp-lang was flagged as possibly problematic.

Files changed (191)
  1. warp/__init__.py +7 -1
  2. warp/autograd.py +12 -2
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +410 -0
  6. warp/build_dll.py +6 -14
  7. warp/builtins.py +463 -372
  8. warp/codegen.py +196 -124
  9. warp/config.py +42 -6
  10. warp/context.py +496 -271
  11. warp/dlpack.py +8 -6
  12. warp/examples/assets/nonuniform.usd +0 -0
  13. warp/examples/assets/nvidia_logo.png +0 -0
  14. warp/examples/benchmarks/benchmark_cloth.py +1 -1
  15. warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
  16. warp/examples/core/example_sample_mesh.py +300 -0
  17. warp/examples/distributed/example_jacobi_mpi.py +507 -0
  18. warp/examples/fem/example_apic_fluid.py +1 -1
  19. warp/examples/fem/example_burgers.py +2 -2
  20. warp/examples/fem/example_deformed_geometry.py +1 -1
  21. warp/examples/fem/example_distortion_energy.py +1 -1
  22. warp/examples/fem/example_magnetostatics.py +6 -6
  23. warp/examples/fem/utils.py +9 -3
  24. warp/examples/interop/example_jax_callable.py +116 -0
  25. warp/examples/interop/example_jax_ffi_callback.py +132 -0
  26. warp/examples/interop/example_jax_kernel.py +205 -0
  27. warp/examples/optim/example_fluid_checkpoint.py +497 -0
  28. warp/examples/tile/example_tile_matmul.py +2 -4
  29. warp/fem/__init__.py +11 -1
  30. warp/fem/adaptivity.py +4 -4
  31. warp/fem/field/field.py +11 -1
  32. warp/fem/field/nodal_field.py +56 -88
  33. warp/fem/field/virtual.py +62 -23
  34. warp/fem/geometry/adaptive_nanogrid.py +16 -13
  35. warp/fem/geometry/closest_point.py +1 -1
  36. warp/fem/geometry/deformed_geometry.py +5 -2
  37. warp/fem/geometry/geometry.py +5 -0
  38. warp/fem/geometry/grid_2d.py +12 -12
  39. warp/fem/geometry/grid_3d.py +12 -15
  40. warp/fem/geometry/hexmesh.py +5 -7
  41. warp/fem/geometry/nanogrid.py +9 -11
  42. warp/fem/geometry/quadmesh.py +13 -13
  43. warp/fem/geometry/tetmesh.py +3 -4
  44. warp/fem/geometry/trimesh.py +7 -20
  45. warp/fem/integrate.py +262 -93
  46. warp/fem/linalg.py +5 -5
  47. warp/fem/quadrature/pic_quadrature.py +37 -22
  48. warp/fem/quadrature/quadrature.py +194 -25
  49. warp/fem/space/__init__.py +1 -1
  50. warp/fem/space/basis_function_space.py +4 -2
  51. warp/fem/space/basis_space.py +25 -18
  52. warp/fem/space/hexmesh_function_space.py +2 -2
  53. warp/fem/space/partition.py +6 -2
  54. warp/fem/space/quadmesh_function_space.py +8 -8
  55. warp/fem/space/shape/cube_shape_function.py +23 -23
  56. warp/fem/space/shape/square_shape_function.py +12 -12
  57. warp/fem/space/shape/triangle_shape_function.py +1 -1
  58. warp/fem/space/tetmesh_function_space.py +3 -3
  59. warp/fem/space/trimesh_function_space.py +2 -2
  60. warp/fem/utils.py +12 -6
  61. warp/jax.py +14 -1
  62. warp/jax_experimental/__init__.py +16 -0
  63. warp/{jax_experimental.py → jax_experimental/custom_call.py} +28 -29
  64. warp/jax_experimental/ffi.py +702 -0
  65. warp/jax_experimental/xla_ffi.py +602 -0
  66. warp/math.py +89 -0
  67. warp/native/array.h +13 -0
  68. warp/native/builtin.h +29 -3
  69. warp/native/bvh.cpp +3 -1
  70. warp/native/bvh.cu +42 -14
  71. warp/native/bvh.h +2 -1
  72. warp/native/clang/clang.cpp +30 -3
  73. warp/native/cuda_util.cpp +14 -0
  74. warp/native/cuda_util.h +2 -0
  75. warp/native/exports.h +68 -63
  76. warp/native/intersect.h +26 -26
  77. warp/native/intersect_adj.h +33 -33
  78. warp/native/marching.cu +1 -1
  79. warp/native/mat.h +513 -9
  80. warp/native/mesh.h +10 -10
  81. warp/native/quat.h +99 -11
  82. warp/native/rand.h +6 -0
  83. warp/native/sort.cpp +122 -59
  84. warp/native/sort.cu +152 -15
  85. warp/native/sort.h +8 -1
  86. warp/native/sparse.cpp +43 -22
  87. warp/native/sparse.cu +52 -17
  88. warp/native/svd.h +116 -0
  89. warp/native/tile.h +312 -116
  90. warp/native/tile_reduce.h +46 -3
  91. warp/native/vec.h +68 -7
  92. warp/native/volume.cpp +85 -113
  93. warp/native/volume_builder.cu +25 -10
  94. warp/native/volume_builder.h +6 -0
  95. warp/native/warp.cpp +5 -6
  96. warp/native/warp.cu +100 -11
  97. warp/native/warp.h +19 -10
  98. warp/optim/linear.py +10 -10
  99. warp/render/render_opengl.py +19 -17
  100. warp/render/render_usd.py +93 -3
  101. warp/sim/articulation.py +4 -4
  102. warp/sim/collide.py +32 -19
  103. warp/sim/import_mjcf.py +449 -155
  104. warp/sim/import_urdf.py +32 -12
  105. warp/sim/inertia.py +189 -156
  106. warp/sim/integrator_euler.py +8 -5
  107. warp/sim/integrator_featherstone.py +3 -10
  108. warp/sim/integrator_vbd.py +207 -2
  109. warp/sim/integrator_xpbd.py +8 -5
  110. warp/sim/model.py +71 -25
  111. warp/sim/render.py +4 -0
  112. warp/sim/utils.py +2 -2
  113. warp/sparse.py +642 -555
  114. warp/stubs.py +217 -20
  115. warp/tests/__main__.py +0 -15
  116. warp/tests/assets/torus.usda +1 -1
  117. warp/tests/cuda/__init__.py +0 -0
  118. warp/tests/{test_mempool.py → cuda/test_mempool.py} +39 -0
  119. warp/tests/{test_streams.py → cuda/test_streams.py} +71 -0
  120. warp/tests/geometry/__init__.py +0 -0
  121. warp/tests/{test_mesh_query_point.py → geometry/test_mesh_query_point.py} +66 -63
  122. warp/tests/{test_mesh_query_ray.py → geometry/test_mesh_query_ray.py} +1 -1
  123. warp/tests/{test_volume.py → geometry/test_volume.py} +41 -6
  124. warp/tests/interop/__init__.py +0 -0
  125. warp/tests/{test_dlpack.py → interop/test_dlpack.py} +28 -5
  126. warp/tests/sim/__init__.py +0 -0
  127. warp/tests/{disabled_kinematics.py → sim/disabled_kinematics.py} +9 -10
  128. warp/tests/{test_collision.py → sim/test_collision.py} +236 -205
  129. warp/tests/sim/test_inertia.py +161 -0
  130. warp/tests/{test_model.py → sim/test_model.py} +40 -0
  131. warp/tests/{flaky_test_sim_grad.py → sim/test_sim_grad.py} +4 -0
  132. warp/tests/{test_sim_kinematics.py → sim/test_sim_kinematics.py} +2 -1
  133. warp/tests/sim/test_vbd.py +597 -0
  134. warp/tests/sim/test_xpbd.py +399 -0
  135. warp/tests/test_bool.py +1 -1
  136. warp/tests/test_codegen.py +24 -3
  137. warp/tests/test_examples.py +40 -38
  138. warp/tests/test_fem.py +98 -14
  139. warp/tests/test_linear_solvers.py +0 -11
  140. warp/tests/test_mat.py +577 -156
  141. warp/tests/test_mat_scalar_ops.py +4 -4
  142. warp/tests/test_overwrite.py +0 -60
  143. warp/tests/test_quat.py +356 -151
  144. warp/tests/test_rand.py +44 -37
  145. warp/tests/test_sparse.py +47 -6
  146. warp/tests/test_spatial.py +75 -0
  147. warp/tests/test_static.py +1 -1
  148. warp/tests/test_utils.py +84 -4
  149. warp/tests/test_vec.py +336 -178
  150. warp/tests/tile/__init__.py +0 -0
  151. warp/tests/{test_tile.py → tile/test_tile.py} +136 -51
  152. warp/tests/{test_tile_load.py → tile/test_tile_load.py} +98 -1
  153. warp/tests/{test_tile_mathdx.py → tile/test_tile_mathdx.py} +9 -6
  154. warp/tests/{test_tile_mlp.py → tile/test_tile_mlp.py} +25 -14
  155. warp/tests/{test_tile_reduce.py → tile/test_tile_reduce.py} +60 -1
  156. warp/tests/{test_tile_view.py → tile/test_tile_view.py} +1 -1
  157. warp/tests/unittest_serial.py +1 -0
  158. warp/tests/unittest_suites.py +45 -62
  159. warp/tests/unittest_utils.py +2 -1
  160. warp/thirdparty/unittest_parallel.py +3 -1
  161. warp/types.py +175 -666
  162. warp/utils.py +137 -72
  163. {warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/METADATA +46 -12
  164. {warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/RECORD +184 -171
  165. {warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/WHEEL +1 -1
  166. {warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info/licenses}/LICENSE.md +0 -26
  167. warp/examples/optim/example_walker.py +0 -317
  168. warp/native/cutlass_gemm.cpp +0 -43
  169. warp/native/cutlass_gemm.cu +0 -382
  170. warp/tests/test_matmul.py +0 -511
  171. warp/tests/test_matmul_lite.py +0 -411
  172. warp/tests/test_vbd.py +0 -386
  173. warp/tests/unused_test_misc.py +0 -77
  174. /warp/tests/{test_async.py → cuda/test_async.py} +0 -0
  175. /warp/tests/{test_ipc.py → cuda/test_ipc.py} +0 -0
  176. /warp/tests/{test_multigpu.py → cuda/test_multigpu.py} +0 -0
  177. /warp/tests/{test_peer.py → cuda/test_peer.py} +0 -0
  178. /warp/tests/{test_pinned.py → cuda/test_pinned.py} +0 -0
  179. /warp/tests/{test_bvh.py → geometry/test_bvh.py} +0 -0
  180. /warp/tests/{test_hash_grid.py → geometry/test_hash_grid.py} +0 -0
  181. /warp/tests/{test_marching_cubes.py → geometry/test_marching_cubes.py} +0 -0
  182. /warp/tests/{test_mesh.py → geometry/test_mesh.py} +0 -0
  183. /warp/tests/{test_mesh_query_aabb.py → geometry/test_mesh_query_aabb.py} +0 -0
  184. /warp/tests/{test_volume_write.py → geometry/test_volume_write.py} +0 -0
  185. /warp/tests/{test_jax.py → interop/test_jax.py} +0 -0
  186. /warp/tests/{test_paddle.py → interop/test_paddle.py} +0 -0
  187. /warp/tests/{test_torch.py → interop/test_torch.py} +0 -0
  188. /warp/tests/{test_coloring.py → sim/test_coloring.py} +0 -0
  189. /warp/tests/{test_sim_grad_bounce_linear.py → sim/test_sim_grad_bounce_linear.py} +0 -0
  190. /warp/tests/{test_tile_shared_memory.py → tile/test_tile_shared_memory.py} +0 -0
  191. {warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/top_level.txt +0 -0
warp/builtins.py CHANGED
@@ -15,10 +15,10 @@

  import builtins
  import functools
- import tempfile
- from pathlib import Path
  from typing import Any, Callable, Mapping, Sequence

+ import warp.build
+ import warp.context
  from warp.codegen import Reference, Var, strip_reference
  from warp.types import *

@@ -41,7 +41,7 @@ def sametypes(arg_types: Mapping[str, Any]):
      return all(types_equal(arg_type_0, t) for t in arg_types_iter)


- def sametypes_create_value_func(default):
+ def sametypes_create_value_func(default: TypeVar):
      def fn(arg_types, arg_values):
          if arg_types is None:
              return default
@@ -399,7 +399,7 @@ add_builtin(
  )


- def scalar_infer_type(arg_types: Mapping[str, type]):
+ def scalar_infer_type(arg_types: Union[Mapping[str, type], Tuple[type, ...], None]):
      if arg_types is None:
          return Scalar

@@ -836,7 +836,7 @@ def vector_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, An

      if dtype is None:
          dtype = value_type
-     elif value_type != dtype:
+     elif not warp.types.scalars_equal(value_type, dtype):
          raise RuntimeError(
              f"the value used to fill this vector is expected to be of the type `{dtype.__name__}`"
          )
@@ -857,9 +857,9 @@ def vector_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, An

      if dtype is None:
          dtype = value_type
-     elif value_type != dtype:
+     elif not warp.types.scalars_equal(value_type, dtype):
          raise RuntimeError(
-             f"all values used to initialize this vector matrix are expected to be of the type `{dtype.__name__}`"
+             f"all values used to initialize this vector are expected to be of the type `{dtype.__name__}`"
          )

      if length is None:
@@ -940,7 +940,7 @@ def matrix_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, An

      if dtype is None:
          dtype = value_type
-     elif value_type != dtype:
+     elif not warp.types.scalars_equal(value_type, dtype):
          raise RuntimeError(
              f"the value used to fill this matrix is expected to be of the type `{dtype.__name__}`"
          )
@@ -950,6 +950,12 @@ def matrix_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, An
          raise RuntimeError("the `shape` argument must be specified when initializing a matrix by value")

      if all(type_is_vector(x) for x in variadic_arg_types):
+         warp.utils.warn(
+             "the built-in `wp.matrix()` won't support taking column vectors as input "
+             "in the future. Use `wp.matrix_from_rows()` or `wp.matrix_from_cols()` instead.",
+             DeprecationWarning,
+         )
+
          if shape[1] != variadic_arg_count:
              raise RuntimeError(
                  f"incompatible number of column vectors given ({variadic_arg_count}) "
@@ -973,7 +979,7 @@ def matrix_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, An

      if dtype is None:
          dtype = value_type
-     elif value_type != dtype:
+     elif not warp.types.scalars_equal(value_type, dtype):
          raise RuntimeError(
              f"all values used to initialize this matrix are expected to be of the type `{dtype.__name__}`"
          )
@@ -1030,6 +1036,86 @@ add_builtin(
  )


+ def matrix_from_vecs_create_value_func(cols: bool):
+     def fn(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+         if arg_types is None:
+             return matrix(shape=(Any, Any), dtype=Scalar)
+
+         variadic_arg_types = arg_types.get("args", ())
+         variadic_arg_count = len(variadic_arg_types)
+
+         if not all(type_is_vector(x) for x in variadic_arg_types):
+             raise RuntimeError("all arguments are expected to be vectors")
+
+         length = variadic_arg_types[0]._length_
+         if any(x._length_ != length for x in variadic_arg_types):
+             raise RuntimeError("all vectors are expected to have the same length")
+
+         dtype = variadic_arg_types[0]._wp_scalar_type_
+         if any(x._wp_scalar_type_ != dtype for x in variadic_arg_types):
+             raise RuntimeError("all vectors are expected to have the same dtype")
+
+         shape = (length, variadic_arg_count) if cols else (variadic_arg_count, length)
+         return matrix(shape=shape, dtype=dtype)
+
+     return fn
+
+
+ def matrix_from_vecs_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+     # We're in the codegen stage where we emit the code calling the built-in.
+     # Further validate the given argument values if needed and map them
+     # to the underlying C++ function's runtime and template params.
+
+     shape = return_type._shape_
+     dtype = return_type._wp_scalar_type_
+
+     variadic_args = args.get("args", ())
+
+     func_args = variadic_args
+
+     if shape in ((2, 2), (3, 3), (4, 4)):
+         # Template specializations exist for these shapes, don't pass them
+         # as template parameters.
+         template_args = (dtype,)
+     else:
+         template_args = (*shape, dtype)
+
+     return (func_args, template_args)
+
+
+ def matrix_from_vecs_initializer_list_func(args, return_type):
+     shape = return_type._shape_
+
+     return shape[0] != shape[1] or shape[0] > 4
+
+
+ add_builtin(
+     "matrix_from_cols",
+     input_types={"*args": vector(length=Any, dtype=Scalar)},
+     variadic=True,
+     value_func=matrix_from_vecs_create_value_func(cols=True),
+     dispatch_func=matrix_from_vecs_dispatch_func,
+     initializer_list_func=matrix_from_vecs_initializer_list_func,
+     native_func="matrix_from_cols",
+     doc="Construct a matrix from column vectors.",
+     group="Vector Math",
+     export=False,
+ )
+
+ add_builtin(
+     "matrix_from_rows",
+     input_types={"*args": vector(length=Any, dtype=Scalar)},
+     variadic=True,
+     value_func=matrix_from_vecs_create_value_func(cols=False),
+     dispatch_func=matrix_from_vecs_dispatch_func,
+     initializer_list_func=matrix_from_vecs_initializer_list_func,
+     native_func="matrix_from_rows",
+     doc="Construct a matrix from row vectors.",
+     group="Vector Math",
+     export=False,
+ )
+
+
  def identity_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
      if arg_types is None:
          return matrix(shape=(Any, Any), dtype=Scalar)
1084
1170
 
1085
1171
  if dtype is None:
1086
1172
  dtype = value_type
1087
- elif value_type != dtype:
1173
+ elif not warp.types.scalars_equal(value_type, dtype):
1088
1174
  raise RuntimeError(
1089
1175
  f"all values used to initialize this transformation matrix are expected to be of the type `{dtype.__name__}`"
1090
1176
  )
@@ -1141,6 +1227,21 @@ add_builtin(
      while the left and right basis vectors are returned in ``U`` and ``V``.""",
  )

+ add_builtin(
+     "svd2",
+     input_types={
+         "A": matrix(shape=(2, 2), dtype=Float),
+         "U": matrix(shape=(2, 2), dtype=Float),
+         "sigma": vector(length=2, dtype=Float),
+         "V": matrix(shape=(2, 2), dtype=Scalar),
+     },
+     value_type=None,
+     group="Vector Math",
+     export=False,
+     doc="""Compute the SVD of a 2x2 matrix ``A``. The singular values are returned in ``sigma``,
+     while the left and right basis vectors are returned in ``U`` and ``V``.""",
+ )
+
  add_builtin(
      "qr3",
      input_types={
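A sketch of calling the new 2x2 SVD from a kernel, assuming it follows the same in-place output convention as the existing wp.svd3() (kernel and array names are illustrative):

    import warp as wp

    @wp.kernel
    def svd_demo(mats: wp.array(dtype=wp.mat22), sigmas: wp.array(dtype=wp.vec2)):
        tid = wp.tid()
        U = wp.mat22()
        sigma = wp.vec2()
        V = wp.mat22()
        # U, sigma, and V are written in place
        wp.svd2(mats[tid], U, sigma, V)
        sigmas[tid] = sigma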
@@ -1204,7 +1305,7 @@ def quaternion_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str

      if dtype is None:
          dtype = value_type
-     elif value_type != dtype:
+     elif not warp.types.scalars_equal(value_type, dtype):
          raise RuntimeError(
              f"all values used to initialize this quaternion are expected to be of the type `{dtype.__name__}`"
          )
@@ -1244,7 +1345,8 @@ add_builtin(
  )
  add_builtin(
      "quaternion",
-     input_types={"x": Float, "y": Float, "z": Float, "w": Float},
+     input_types={"x": Float, "y": Float, "z": Float, "w": Float, "dtype": Scalar},
+     defaults={"dtype": None},
      value_func=quaternion_value_func,
      export_func=lambda input_types: {k: v for k, v in input_types.items() if k != "dtype"},
      dispatch_func=quaternion_dispatch_func,
@@ -1332,7 +1434,18 @@ add_builtin(
      input_types={"mat": matrix(shape=(3, 3), dtype=Float)},
      value_func=lambda arg_types, arg_values: quaternion(dtype=float_infer_type(arg_types)),
      group="Quaternion Math",
-     doc="Construct a quaternion from a 3x3 matrix.",
+     doc="""Construct a quaternion from a 3x3 matrix.
+
+     If the matrix is not a pure rotation, but for example includes scaling or skewing, the result is undefined.""",
+ )
+ add_builtin(
+     "quat_from_matrix",
+     input_types={"mat": matrix(shape=(4, 4), dtype=Float)},
+     value_func=lambda arg_types, arg_values: quaternion(dtype=float_infer_type(arg_types)),
+     group="Quaternion Math",
+     doc="""Construct a quaternion from a 4x4 matrix.
+
+     If the top-left 3x3 block of the matrix is not a pure rotation, but for example includes scaling or skewing, the result is undefined.""",
  )
  add_builtin(
      "quat_rpy",
@@ -1403,7 +1516,7 @@ def transformation_value_func(arg_types: Mapping[str, type], arg_values: Mapping
      dtype = arg_values.get("dtype", None)
      if dtype is None:
          dtype = value_type
-     elif value_type != dtype:
+     elif not warp.types.scalars_equal(value_type, dtype):
          raise RuntimeError(
              f"all values used to initialize this transformation matrix are expected to be of the type `{dtype.__name__}`"
          )
@@ -1570,7 +1683,7 @@ def spatial_vector_value_func(arg_types: Mapping[str, type], arg_values: Mapping

      if dtype is None:
          dtype = value_type
-     elif value_type != dtype:
+     elif not warp.types.scalars_equal(value_type, dtype):
          raise RuntimeError(
              f"all values used to initialize this spatial vector are expected to be of the type `{dtype.__name__}`"
          )
@@ -2375,7 +2488,7 @@ add_builtin(

      This function converts values computed using scalar kernel code to a tile representation for input into collective operations.

-     * If the input value is a scalar, then the resulting tile has ``shape=(1, block_dim)``
+     * If the input value is a scalar, then the resulting tile has ``shape=(block_dim,)``
      * If the input value is a vector, then the resulting tile has ``shape=(length(vector), block_dim)``

      :param x: A per-thread local value, e.g. scalar, vector, or matrix.
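A sketch of the corrected scalar shape behavior, assuming a regular wp.launch() with an explicit block_dim (names are ours):

    import warp as wp

    @wp.kernel
    def tile_demo(out: wp.array(dtype=float)):
        x = float(wp.tid())      # per-thread scalar
        t = wp.tile(x)           # tile with shape=(block_dim,), not (1, block_dim)
        s = wp.tile_sum(t)       # cooperative reduction to a shape=(1,) tile
        wp.tile_store(out, s)

    out = wp.zeros(1, dtype=float)
    wp.launch(tile_demo, dim=64, inputs=[out], block_dim=64)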
@@ -2669,11 +2782,9 @@ def tile_broadcast_value_func(arg_types, arg_values):
  def tile_broadcast_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
      tile = arg_values["a"]

-     template_args = []
-     template_args.append(return_type.shape[0])
-     template_args.append(return_type.shape[1])
-     template_args.append(return_type.strides[0])
-     template_args.append(return_type.strides[1])
+     assert len(return_type.shape) == len(return_type.strides)
+     assert 1 <= len(return_type.shape) <= 4
+     template_args = [*return_type.shape, *return_type.strides]

      return ((tile,), template_args)

@@ -2686,56 +2797,17 @@ add_builtin(
      variadic=False,
      doc="""Broadcast a tile.

-     This function will attempt to broadcast the input tile ``a`` to the destination shape (m, n).
-
+     Broadcasts the input tile ``a`` to the destination shape.
      Broadcasting follows NumPy broadcast rules.

      :param a: Tile to broadcast
      :param shape: The shape to broadcast to
-     :returns: Tile with broadcast ``shape=(m, n)``""",
+     :returns: Tile with broadcast shape""",
      group="Tile Primitives",
      export=False,
  )


- def tile_matmul_value_func(arg_types, arg_values):
-     # return generic type (for doc builds)
-     if arg_types is None:
-         return Tile(dtype=Any, shape=Any)
-
-     if len(arg_types) != 3:
-         raise TypeError(f"tile_matmul() takes exactly 3 positional arguments but {len(arg_types)} were given")
-
-     return None
-
-
- def tile_matmul_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
-     a = arg_values["a"]
-     b = arg_values["b"]
-     out = arg_values["out"]
-
-     # force the storage type of the input variables to shared memory
-     a.type.storage = "shared"
-     b.type.storage = "shared"
-     out.type.storage = "shared"
-
-     template_args = []
-     return ((a, b, out), template_args)
-
-
- add_builtin(
-     "tile_matmul_scalar",
-     input_types={"a": Tile, "b": Tile, "out": Tile},
-     value_func=tile_matmul_value_func,
-     dispatch_func=tile_matmul_dispatch_func,
-     variadic=True,
-     doc="Compute matrix product and accumulate out += a*b.",
-     group="Tile Primitives",
-     hidden=True,
-     export=False,
- )
-
-
  def tile_sum_value_func(arg_types, arg_values):
      # return generic type (for doc builds)
      if arg_types is None:
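A sketch of broadcasting with the shape argument (names are ours; assumes a tiled launch):

    import warp as wp

    @wp.kernel
    def broadcast_demo(out: wp.array2d(dtype=float)):
        a = wp.tile_ones(shape=(1, 8), dtype=float)
        b = wp.tile_broadcast(a, shape=(4, 8))  # NumPy-style (1, 8) -> (4, 8)
        wp.tile_store(out, b)

    out = wp.zeros((4, 8), dtype=float)
    wp.launch_tiled(broadcast_demo, dim=[1], inputs=[out], block_dim=32)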
@@ -3030,7 +3102,7 @@ def tile_binary_map_value_func(arg_types, arg_values):

      for i in range(len(a.shape)):
          if a.shape[i] != b.shape[i]:
-             raise ValueError(f"tile_map() shapes do not match on dimension {i}, got {a.shape[i]} and {b.shape[i]}")
+             raise ValueError(f"tile_map() shapes do not match on dimension {i}, got {a.shape} and {b.shape}")

      return TileBinaryMap(a, b)

@@ -3807,6 +3879,18 @@ _volume_supported_value_types = {
  }


+ def _is_volume_type_supported(dtype):
+     for typ in _volume_supported_value_types:
+         if types_equal(typ, dtype):
+             return True
+     return False
+
+
+ def _check_volume_type_is_supported(dtype):
+     if not _is_volume_type_supported(dtype):
+         raise RuntimeError(f"unsupported volume type `{type_repr(dtype)}`")
+
+
  def check_volume_value_grad_compatibility(dtype, grad_dtype):
      if type_is_vector(dtype):
          expected = matrix(shape=(type_length(dtype), 3), dtype=type_scalar_type(dtype))
@@ -3822,9 +3906,7 @@ def volume_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, An
          return Any

      dtype = arg_values["dtype"]
-
-     if dtype not in _volume_supported_value_types:
-         raise RuntimeError(f"unsupported volume type `{dtype.__name__}`")
+     _check_volume_type_is_supported(dtype)

      return dtype

@@ -3860,9 +3942,7 @@ def volume_sample_grad_value_func(arg_types: Mapping[str, type], arg_values: Map
          return Any

      dtype = arg_values["dtype"]
-
-     if dtype not in _volume_supported_value_types:
-         raise RuntimeError(f"unsupported volume type `{dtype.__name__}`")
+     _check_volume_type_is_supported(dtype)

      check_volume_value_grad_compatibility(dtype, arg_types["grad"])

@@ -3900,9 +3980,7 @@ def volume_lookup_value_func(arg_types: Mapping[str, type], arg_values: Mapping[
          return Any

      dtype = arg_values["dtype"]
-
-     if dtype not in _volume_supported_value_types:
-         raise RuntimeError(f"unsupported volume type `{dtype.__name__}`")
+     _check_volume_type_is_supported(dtype)

      return dtype

@@ -3939,9 +4017,7 @@ def volume_store_value_func(arg_types: Mapping[str, type], arg_values: Mapping[s
          return None

      dtype = arg_types["value"]
-
-     if dtype not in _volume_supported_value_types:
-         raise RuntimeError(f"unsupported volume type `{dtype.__name__}`")
+     _check_volume_type_is_supported(dtype)

      return None

@@ -4191,6 +4267,20 @@ add_builtin(
      group="Random",
      doc="Return a random integer between [low, high).",
  )
+ add_builtin(
+     "randu",
+     input_types={"state": uint32},
+     value_type=uint32,
+     group="Random",
+     doc="Return a random unsigned integer in the range [0, 2^32).",
+ )
+ add_builtin(
+     "randu",
+     input_types={"state": uint32, "low": uint32, "high": uint32},
+     value_type=uint32,
+     group="Random",
+     doc="Return a random unsigned integer between [low, high).",
+ )
  add_builtin(
      "randf",
      input_types={"state": uint32},
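A sketch of the new unsigned draws, assuming the same seeding convention as the existing wp.randi() (names are ours):

    import warp as wp

    @wp.kernel
    def randu_demo(seed: int, out: wp.array(dtype=wp.uint32)):
        tid = wp.tid()
        state = wp.rand_init(seed, tid)
        # bounded draw in [16, 32); wp.randu(state) alone covers [0, 2^32)
        out[tid] = wp.randu(state, wp.uint32(16), wp.uint32(32))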
@@ -4499,11 +4589,31 @@ add_builtin(
      export=False,
      group="Utility",
  )
+
+
+ def select_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+     warp.utils.warn(
+         "wp.select() is deprecated and will be removed in a future\n"
+         "version. Use wp.where(cond, value_if_true, value_if_false) instead.",
+         category=DeprecationWarning,
+     )
+
+     func_args = tuple(args.values())
+     template_args = ()
+
+     return (func_args, template_args)
+
+
  add_builtin(
      "select",
      input_types={"cond": builtins.bool, "value_if_false": Any, "value_if_true": Any},
      value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
-     doc="Select between two arguments, if ``cond`` is ``False`` then return ``value_if_false``, otherwise return ``value_if_true``",
+     dispatch_func=select_dispatch_func,
+     doc="""Select between two arguments, if ``cond`` is ``False`` then return ``value_if_false``, otherwise return ``value_if_true``.
+
+     .. deprecated:: 1.7
+         Use :func:`where` instead, which has the more intuitive argument order:
+         ``where(cond, value_if_true, value_if_false)``.""",
      group="Utility",
  )
  for t in int_types:
@@ -4511,14 +4621,47 @@ for t in int_types:
          "select",
          input_types={"cond": t, "value_if_false": Any, "value_if_true": Any},
          value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
-         doc="Select between two arguments, if ``cond`` is ``False`` then return ``value_if_false``, otherwise return ``value_if_true``",
+         dispatch_func=select_dispatch_func,
+         doc="""Select between two arguments, if ``cond`` is ``False`` then return ``value_if_false``, otherwise return ``value_if_true``.
+
+         .. deprecated:: 1.7
+             Use :func:`where` instead, which has the more intuitive argument order:
+             ``where(cond, value_if_true, value_if_false)``.""",
          group="Utility",
      )
  add_builtin(
      "select",
      input_types={"arr": array(dtype=Any), "value_if_false": Any, "value_if_true": Any},
      value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
-     doc="Select between two arguments, if ``arr`` is null then return ``value_if_false``, otherwise return ``value_if_true``",
+     dispatch_func=select_dispatch_func,
+     doc="""Select between two arguments, if ``arr`` is null then return ``value_if_false``, otherwise return ``value_if_true``.
+
+     .. deprecated:: 1.7
+         Use :func:`where` instead, which has the more intuitive argument order:
+         ``where(arr, value_if_true, value_if_false)``.""",
+     group="Utility",
+ )
+
+ add_builtin(
+     "where",
+     input_types={"cond": builtins.bool, "value_if_true": Any, "value_if_false": Any},
+     value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
+     doc="Select between two arguments, if ``cond`` is ``True`` then return ``value_if_true``, otherwise return ``value_if_false``.",
+     group="Utility",
+ )
+ for t in int_types:
+     add_builtin(
+         "where",
+         input_types={"cond": t, "value_if_true": Any, "value_if_false": Any},
+         value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
+         doc="Select between two arguments, if ``cond`` is ``True`` then return ``value_if_true``, otherwise return ``value_if_false``.",
+         group="Utility",
+     )
+ add_builtin(
+     "where",
+     input_types={"arr": array(dtype=Any), "value_if_true": Any, "value_if_false": Any},
+     value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
+     doc="Select between two arguments, if ``arr`` is not null then return ``value_if_true``, otherwise return ``value_if_false``.",
      group="Utility",
  )

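The migration from wp.select() is mechanical, but the argument order flips. A sketch (names are ours):

    import warp as wp

    @wp.kernel
    def relu(x: wp.array(dtype=float), out: wp.array(dtype=float)):
        tid = wp.tid()
        # wp.where(cond, value_if_true, value_if_false);
        # the deprecated wp.select() took (cond, value_if_false, value_if_true)
        out[tid] = wp.where(x[tid] > 0.0, x[tid], 0.0)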
@@ -5112,33 +5255,51 @@ add_builtin(
  )


+ # implements vector[index] = value
+ add_builtin(
+     "assign_inplace",
+     input_types={"a": vector(length=Any, dtype=Scalar), "i": int, "value": Scalar},
+     value_type=None,
+     hidden=True,
+     group="Utility",
+ )
+
+ # implements quaternion[index] = value
+ add_builtin(
+     "assign_inplace",
+     input_types={"a": quaternion(dtype=Scalar), "i": int, "value": Scalar},
+     value_type=None,
+     hidden=True,
+     group="Utility",
+ )
+
+
  def vector_assign_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
      vec_type = arg_types["a"]
      return vec_type


- # implements vector[index] = value
+ # implements vector[index] = value, performs a copy internally if wp.config.enable_vector_component_overwrites is True
  add_builtin(
-     "assign",
+     "assign_copy",
      input_types={"a": vector(length=Any, dtype=Scalar), "i": int, "value": Scalar},
      value_func=vector_assign_value_func,
      hidden=True,
      group="Utility",
  )

- # implements quaternion[index] = value
+ # implements quaternion[index] = value, performs a copy internally if wp.config.enable_vector_component_overwrites is True
  add_builtin(
-     "assign",
+     "assign_copy",
      input_types={"a": quaternion(dtype=Scalar), "i": int, "value": Scalar},
      value_func=vector_assign_value_func,
      hidden=True,
      group="Utility",
  )

-
  # implements vector[idx] += scalar
  add_builtin(
-     "augassign_add",
+     "add_inplace",
      input_types={"a": vector(length=Any, dtype=Scalar), "i": int, "value": Scalar},
      value_type=None,
      hidden=True,
@@ -5147,7 +5308,7 @@ add_builtin(

  # implements quaternion[idx] += scalar
  add_builtin(
-     "augassign_add",
+     "add_inplace",
      input_types={"a": quaternion(dtype=Scalar), "i": int, "value": Scalar},
      value_type=None,
      hidden=True,
@@ -5156,7 +5317,7 @@ add_builtin(

  # implements vector[idx] -= scalar
  add_builtin(
-     "augassign_sub",
+     "sub_inplace",
      input_types={"a": vector(length=Any, dtype=Scalar), "i": int, "value": Scalar},
      value_type=None,
      hidden=True,
@@ -5165,7 +5326,7 @@ add_builtin(

  # implements quaternion[idx] -= scalar
  add_builtin(
-     "augassign_sub",
+     "sub_inplace",
      input_types={"a": quaternion(dtype=Scalar), "i": int, "value": Scalar},
      value_type=None,
      hidden=True,
@@ -5209,11 +5370,6 @@ add_builtin(
  )


- def matrix_assign_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
-     mat_type = arg_types["a"]
-     return mat_type
-
-
  def matrix_vector_sametype(arg_types: Mapping[str, Any]):
      mat_size = arg_types["a"]._shape_[0]
      vec_size = arg_types["value"]._length_
@@ -5224,7 +5380,33 @@ def matrix_vector_sametype(arg_types: Mapping[str, Any]):

  # implements matrix[i,j] = scalar
  add_builtin(
-     "assign",
+     "assign_inplace",
+     input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "j": int, "value": Scalar},
+     value_type=None,
+     hidden=True,
+     group="Utility",
+ )
+
+
+ # implements matrix[i] = vector
+ add_builtin(
+     "assign_inplace",
+     input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "value": vector(length=Any, dtype=Scalar)},
+     constraint=matrix_vector_sametype,
+     value_type=None,
+     hidden=True,
+     group="Utility",
+ )
+
+
+ def matrix_assign_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+     mat_type = arg_types["a"]
+     return mat_type
+
+
+ # implements matrix[i,j] = scalar
+ add_builtin(
+     "assign_copy",
      input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "j": int, "value": Scalar},
      value_func=matrix_assign_value_func,
      hidden=True,
@@ -5234,7 +5416,7 @@ add_builtin(

  # implements matrix[i] = vector
  add_builtin(
-     "assign",
+     "assign_copy",
      input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "value": vector(length=Any, dtype=Scalar)},
      constraint=matrix_vector_sametype,
      value_func=matrix_assign_value_func,
@@ -5245,7 +5427,7 @@ add_builtin(

  # implements matrix[i,j] += scalar
  add_builtin(
-     "augassign_add",
+     "add_inplace",
      input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "j": int, "value": Scalar},
      value_type=None,
      hidden=True,
@@ -5253,9 +5435,20 @@ add_builtin(
  )


+ # implements matrix[i] += vector
+ add_builtin(
+     "add_inplace",
+     input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "value": vector(length=Any, dtype=Scalar)},
+     constraint=matrix_vector_sametype,
+     value_type=None,
+     hidden=True,
+     group="Utility",
+ )
+
+
  # implements matrix[i,j] -= scalar
  add_builtin(
-     "augassign_sub",
+     "sub_inplace",
      input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "j": int, "value": Scalar},
      value_type=None,
      hidden=True,
@@ -5263,6 +5456,16 @@ add_builtin(
  )


+ # implements matrix[i] -= vector
+ add_builtin(
+     "sub_inplace",
+     input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "value": vector(length=Any, dtype=Scalar)},
+     value_type=None,
+     hidden=True,
+     group="Utility",
+ )
+
+
  for t in scalar_types + vector_types + (bool,):
      if "vec" in t.__name__ or "mat" in t.__name__:
          continue
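These hidden builtins back the kernel-level indexing syntax. A sketch of what they enable, including the new row-vector forms (names are ours):

    import warp as wp

    @wp.kernel
    def row_updates(out: wp.array(dtype=wp.mat33)):
        m = wp.mat33(0.0)
        m[0] = wp.vec3(1.0, 2.0, 3.0)    # matrix[i] = vector
        m[1] += wp.vec3(0.5, 0.5, 0.5)   # matrix[i] += vector, new in 1.7
        m[2, 2] -= 1.0                   # matrix[i,j] -= scalar
        out[0] = m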
@@ -5410,7 +5613,27 @@ add_builtin(
  )
  add_builtin(
      "expect_near",
-     input_types={"a": vec3, "b": vec3, "tolerance": float},
+     input_types={"a": vector(length=Any, dtype=Float), "b": vector(length=Any, dtype=Float), "tolerance": Float},
+     defaults={"tolerance": 1.0e-6},
+     value_type=None,
+     doc="Prints an error to stdout if any element of ``a`` and ``b`` are not closer than tolerance in magnitude",
+     group="Utility",
+ )
+ add_builtin(
+     "expect_near",
+     input_types={"a": quaternion(dtype=Float), "b": quaternion(dtype=Float), "tolerance": Float},
+     defaults={"tolerance": 1.0e-6},
+     value_type=None,
+     doc="Prints an error to stdout if any element of ``a`` and ``b`` are not closer than tolerance in magnitude",
+     group="Utility",
+ )
+ add_builtin(
+     "expect_near",
+     input_types={
+         "a": matrix(shape=(Any, Any), dtype=Float),
+         "b": matrix(shape=(Any, Any), dtype=Float),
+         "tolerance": Float,
+     },
      defaults={"tolerance": 1.0e-6},
      value_type=None,
      doc="Prints an error to stdout if any element of ``a`` and ``b`` are not closer than tolerance in magnitude",
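expect_near() now accepts any Float vector, quaternion, or matrix instead of only vec3. A sketch (names are ours):

    import warp as wp

    @wp.kernel
    def compare(a: wp.array(dtype=wp.mat33), b: wp.array(dtype=wp.mat33)):
        tid = wp.tid()
        wp.expect_near(a[tid], b[tid], 1.0e-4)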
@@ -5989,7 +6212,7 @@ add_builtin(
  ##
  ## Matmul
  ##
- def tile_matmul_generic_value_func(arg_types, arg_values):
+ def tile_matmul_value_func(arg_types, arg_values):
      # return generic type (for doc builds)
      if arg_types is None:
          return Tile(dtype=Any, shape=Any)
@@ -6015,7 +6238,7 @@ def tile_matmul_generic_value_func(arg_types, arg_values):
      return None


- def tile_matmul_generic_lto_dispatch_func(
+ def tile_matmul_lto_dispatch_func(
      arg_types: Mapping[str, type],
      return_type: Any,
      return_values: List[Var],
@@ -6054,142 +6277,82 @@ def tile_matmul_generic_lto_dispatch_func(
      out.type.storage = "shared"
      template_args = [accumulate]

-     # Maps Python/Warp types to C++ types and enums
-     def cublasdx_type_map(dtype):
-         if dtype == float16:
-             return ("wp::float16", 3, 0)
-         if dtype == float32:
-             return ("wp::float32", 5, 0)
-         if dtype == float64:
-             return ("wp::float64", 6, 0)
-         if dtype == vec2h:
-             return ("wp::vec2h", 3, 1)
-         if dtype == vec2f:
-             return ("wp::vec2f", 5, 1)
-         if dtype == vec2d:
-             return ("wp::vec2d", 6, 1)
-         raise TypeError("Unsupported input type in tile_matmul")
-
-     def cublasdx_arrangement_map(layout):
-         if layout == "colmajor":
-             return 0  # CUBLASDX_ARRANGEMENT_COL_MAJOR
-         if layout == "rowmajor":
-             return 1  # CUBLASDX_ARRANGEMENT_ROW_MAJOR
-         raise ValueError("Unsupported layout in tile_matmul")
-
-     # generate the LTO
      M, K = a.type.shape[0], a.type.shape[1]
      _, N = b.type.shape[0], b.type.shape[1]
      num_threads = options["block_dim"]
      arch = options["output_arch"]

-     def make_function(M, N, K, adtype, bdtype, cdtype, alayout, blayout, clayout):
-         (a_dtype, a_prec, a_type) = cublasdx_type_map(adtype)
-         (b_dtype, b_prec, b_type) = cublasdx_type_map(bdtype)
-         (c_dtype, c_prec, c_type) = cublasdx_type_map(cdtype)
-         a_arrangement = cublasdx_arrangement_map(alayout)
-         b_arrangement = cublasdx_arrangement_map(blayout)
-         c_arrangement = cublasdx_arrangement_map(clayout)
-
-         if a_type != b_type or a_type != c_type:
-             raise TypeError("time_matmul(A, B, C) requires all inputs to be real or complex")
-
-         element_type = a_type
-
-         lto_symbol = f"dot_{M}_{N}_{K}_{arch}_{num_threads}_{a_arrangement}_{b_arrangement}_{c_arrangement}_{a_prec}_{b_prec}_{c_prec}_{element_type}"
+     if arch is None or not warp.context.runtime.core.is_mathdx_enabled():
+         # CPU/no-MathDx dispatch
+         return ((0, 0, 0, a, b, out), template_args, [], 0)
+     else:

-         # early out if LTO for this combination already exists for this module
-         if lto_symbol in builder.ltoirs:
-             return lto_symbol, builder.ltoirs[lto_symbol]
+         def tile_flip_layout(layout):
+             if layout == "rowmajor":
+                 return "colmajor"
+             elif layout == "colmajor":
+                 return "rowmajor"

-         # otherwise compile LTO
-         lto_code = tempfile.NamedTemporaryFile(prefix="warp", delete=False)
-         result = warp.context.runtime.core.cuda_compile_dot(
-             lto_code.name.encode("utf-8"),
-             lto_symbol.encode("utf-8"),
-             0,
-             None,
-             None,
+         # generate the LTOs
+         # C += A * B
+         (fun_forward, lto_forward) = warp.build.build_lto_dot(
+             M,
+             N,
+             K,
+             a.type.dtype,
+             b.type.dtype,
+             out.type.dtype,
+             a.type.layout,
+             b.type.layout,
+             out.type.layout,
              arch,
+             num_threads,
+             builder,
+         )
+         # adjA += adjC * B^T - Transpose ~= flipped layout
+         (fun_backward_A, lto_backward_A) = warp.build.build_lto_dot(
              M,
+             K,
              N,
+             out.type.dtype,
+             b.type.dtype,
+             a.type.dtype,
+             out.type.layout,
+             tile_flip_layout(b.type.layout),
+             a.type.layout,
+             arch,
+             num_threads,
+             builder,
+         )
+         # adjB += A^T * adjC - Transpose ~= flipped layout
+         (fun_backward_B, lto_backward_B) = warp.build.build_lto_dot(
              K,
-             a_prec,
-             b_prec,
-             c_prec,
-             element_type,
-             a_arrangement,
-             b_arrangement,
-             c_arrangement,
+             N,
+             M,
+             a.type.dtype,
+             out.type.dtype,
+             b.type.dtype,
+             tile_flip_layout(a.type.layout),
+             out.type.layout,
+             b.type.layout,
+             arch,
              num_threads,
+             builder,
          )
-         lto_code_path = Path(lto_code.name)
-         if not result:
-             lto_code.close()
-             if lto_code_path.exists():
-                 lto_code_path.unlink()
-             raise RuntimeError("Failed to compile tile_matmul")
-         else:
-             with open(lto_code.name, "rb") as f:
-                 lto_code_data = f.read()
-             lto_code.close()
-             lto_code_path.unlink()
-
-             builder.ltoirs[lto_symbol] = lto_code_data
-             builder.ltoirs_decl[lto_symbol] = (
-                 f"void {lto_symbol}({c_dtype}, {a_dtype}*, {b_dtype}*, {c_dtype}, {c_dtype}*);"
-             )
-
-         return lto_symbol, lto_code_data

-     def tile_flip_layout(layout):
-         if layout == "rowmajor":
-             return "colmajor"
-         elif layout == "colmajor":
-             return "rowmajor"
-
-     # C += A * B
-     (fun_forward, lto_forward) = make_function(
-         M, N, K, a.type.dtype, b.type.dtype, out.type.dtype, a.type.layout, b.type.layout, out.type.layout
-     )
-     # adjA += adjC * B^T - Transpose ~= flipped layout
-     (fun_backward_A, lto_backward_A) = make_function(
-         M,
-         K,
-         N,
-         out.type.dtype,
-         b.type.dtype,
-         a.type.dtype,
-         out.type.layout,
-         tile_flip_layout(b.type.layout),
-         a.type.layout,
-     )
-     # adjB += A^T * adjC - Transpose ~= flipped layout
-     (fun_backward_B, lto_backward_B) = make_function(
-         K,
-         N,
-         M,
-         a.type.dtype,
-         out.type.dtype,
-         b.type.dtype,
-         tile_flip_layout(a.type.layout),
-         out.type.layout,
-         b.type.layout,
-     )
-
-     return (
-         (
-             Var(fun_forward, str, False, True, False),
-             Var(fun_backward_A, str, False, True, False),
-             Var(fun_backward_B, str, False, True, False),
-             a,
-             b,
-             out,
-         ),
-         template_args,
-         [lto_forward, lto_backward_A, lto_backward_B],
-         0,
-     )
+         return (
+             (
+                 Var(fun_forward, str, False, True, False),
+                 Var(fun_backward_A, str, False, True, False),
+                 Var(fun_backward_B, str, False, True, False),
+                 a,
+                 b,
+                 out,
+             ),
+             template_args,
+             [lto_forward, lto_backward_A, lto_backward_B],
+             0,
+         )


  add_builtin(
@@ -6199,8 +6362,8 @@ add_builtin(
      "b": Tile(dtype=Any, shape=Any),
      "out": Tile(dtype=Any, shape=Any),
  },
- value_func=tile_matmul_generic_value_func,
- lto_dispatch_func=tile_matmul_generic_lto_dispatch_func,
+ value_func=tile_matmul_value_func,
+ lto_dispatch_func=tile_matmul_lto_dispatch_func,
  variadic=False,
  doc="""Computes the matrix product and accumulates ``out += a*b``.

@@ -6208,7 +6371,7 @@ add_builtin(
  * fp16, fp32, fp64 (real)
  * vec2h, vec2f, vec2d (complex)

- All input and output tiles must have the same datatype. Tile data will be automatically be migrated
+ All input and output tiles must have the same datatype. Tile data will automatically be migrated
  to shared memory if necessary and will use TensorCore operations when available.

  :param a: A tile with ``shape=(M, K)``
@@ -6222,8 +6385,8 @@ add_builtin(
  add_builtin(
      "tile_matmul",
      input_types={"a": Tile(dtype=Any, shape=Any), "b": Tile(dtype=Any, shape=Any)},
-     value_func=tile_matmul_generic_value_func,
-     lto_dispatch_func=tile_matmul_generic_lto_dispatch_func,
+     value_func=tile_matmul_value_func,
+     lto_dispatch_func=tile_matmul_lto_dispatch_func,
      variadic=False,
      doc="""Computes the matrix product ``out = a*b``.

@@ -6231,7 +6394,7 @@ add_builtin(
      * fp16, fp32, fp64 (real)
      * vec2h, vec2f, vec2d (complex)

-     Both input tiles must have the same datatype. Tile data will be automatically be migrated
+     Both input tiles must have the same datatype. Tile data will automatically be migrated
      to shared memory if necessary and will use TensorCore operations when available.

      :param a: A tile with ``shape=(M, K)``
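A blocked GEMM sketch in the style of the Warp tile documentation (tile sizes and names are ours):

    import warp as wp

    TILE_M, TILE_N, TILE_K = 32, 32, 16

    @wp.kernel
    def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
        i, j = wp.tid()
        acc = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=float)
        count = int(A.shape[1] / TILE_K)
        for k in range(count):
            a = wp.tile_load(A, shape=(TILE_M, TILE_K), offset=(i * TILE_M, k * TILE_K))
            b = wp.tile_load(B, shape=(TILE_K, TILE_N), offset=(k * TILE_K, j * TILE_N))
            wp.tile_matmul(a, b, acc)   # acc += a*b
        wp.tile_store(C, acc, offset=(i * TILE_M, j * TILE_N))

    # launched with one block per output tile, e.g.:
    # wp.launch_tiled(tile_gemm, dim=[M // TILE_M, N // TILE_N], inputs=[A, B, C], block_dim=64)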
@@ -6303,59 +6466,29 @@ def tile_fft_generic_lto_dispatch_func(
      num_threads = options["block_dim"]
      arch = options["output_arch"]
      ept = size // num_threads
-     lto_symbol = f"fft_{size}_{ept}_{arch}_{direction}_{precision}"
-
-     # early out if LTO for this combination already exists for this module
-     if lto_symbol in builder.ltoirs:
-         return lto_symbol, builder.ltoirs[lto_symbol]
-
-     # otherwise compile LTO
-     lto_code = tempfile.NamedTemporaryFile(prefix="warp", delete=False)
-     shared_memory_size = ctypes.c_int(0)
-
-     result = warp.context.runtime.core.cuda_compile_fft(
-         lto_code.name.encode("utf-8"),
-         lto_symbol.encode("utf-8"),
-         0,
-         None,
-         None,
-         arch,
-         size,
-         ept,
-         dir,
-         precision,
-         ctypes.byref(shared_memory_size),
-     )
-     lto_code_path = Path(lto_code.name)
-     if not result:
-         lto_code.close()
-         if lto_code_path.exists():
-             lto_code_path.unlink()
-         raise RuntimeError("Failed to compile tile_fft")
-
-     with open(lto_code.name, "rb") as f:
-         lto_code_data = f.read()
-
-     lto_code.close()
-     lto_code_path.unlink()
-
-     builder.ltoirs[lto_symbol] = lto_code_data
-
-     shared_memory_bytes = Tile.round_up(shared_memory_size.value)
-
-     return (
-         (
-             Var(lto_symbol, str, False, True, False),
-             Var(dtype, str, False, True, False),
-             Var(str(shared_memory_bytes), str, False, True, False),
-             Var(str(batch), str, False, True, False),
-             Var(str(ept), str, False, True, False),
-             inout,
-         ),
-         [],
-         [lto_code_data],
-         shared_memory_bytes,
-     )
+
+     if arch is None or not warp.context.runtime.core.is_mathdx_enabled():
+         # CPU/no-MathDx dispatch
+         return ([], [], [], 0)
+     else:
+         # generate the LTO
+         lto_symbol, lto_code_data, shared_memory_bytes = warp.build.build_lto_fft(
+             arch, size, ept, direction, dir, precision, builder
+         )
+
+         return (
+             (
+                 Var(lto_symbol, str, False, True, False),
+                 Var(dtype, str, False, True, False),
+                 Var(str(shared_memory_bytes), str, False, True, False),
+                 Var(str(batch), str, False, True, False),
+                 Var(str(ept), str, False, True, False),
+                 inout,
+             ),
+             [],
+             [lto_code_data],
+             shared_memory_bytes,
+         )


  add_builtin(
@@ -6417,7 +6550,7 @@ def tile_cholesky_generic_value_func(arg_types, arg_values):
          raise TypeError(f"tile_cholesky() argument must be a tile, got {a!r}")

      if len(a.shape) != 2:
-         raise ValueError("tile_cholesky() argumust must be a 2D tile")
+         raise ValueError("tile_cholesky() argument must be a 2D tile")

      if a.shape[0] != a.shape[1]:
          raise ValueError("tile_cholesky() argument must be square")
@@ -6458,57 +6591,36 @@ def tile_cholesky_generic_lto_dispatch_func(
      if out.type.shape[0] != M or out.type.shape[1] != M:
          raise ValueError("tile_cholesky() output tile must be square")

-     num_threads = options["block_dim"]
-     arch = options["output_arch"]
-     lto_symbol = f"potrf_{M}_{N}_{arch}_{precision_enum}"
-
-     # early out if LTO for this combination already exists for this module
-     if lto_symbol in builder.ltoirs:
-         return lto_symbol, builder.ltoirs[lto_symbol]
-
-     # otherwise compile LTO
-     lto_code = tempfile.NamedTemporaryFile(prefix="warp", delete=False)
-     universal_fatbin_code = tempfile.NamedTemporaryFile(prefix="warp", delete=False)
+     solver = "potrf"
+     solver_enum = cusolver_function_map[solver]

-     # cuSOLVERDx only support col-major input/outputs,
+     # cuSOLVERDx only supports col-major input/outputs,
      # so we use upper to mimic a row-major input
-     result = warp.context.runtime.core.cuda_compile_solver(
-         universal_fatbin_code.name.encode("utf-8"),
-         lto_code.name.encode("utf-8"),
-         lto_symbol.encode("utf-8"),
-         0,
-         None,
-         None,
-         arch,
-         M,
-         N,
-         cusolver_function_map["potrf"],
-         precision_enum,
-         cusolver_fill_mode_map["upper"],
-         num_threads,
-     )
+     fill_mode = cusolver_fill_mode_map["upper"]

-     if not result:
-         for f in [lto_code, universal_fatbin_code]:
-             f.close()
-             if Path(f.name).exists():
-                 Path(f.name).unlink()
-         raise RuntimeError("Failed to compile tile_cholesky")
+     arch = options["output_arch"]
+     num_threads = options["block_dim"]
+     parameter_list = f"({dtype}*, unsigned)"

+     if arch is None or not warp.context.runtime.core.is_mathdx_enabled():
+         # CPU/no-MathDx dispatch
+         return ((0, a, out), [], [], 0)
      else:
-         with open(lto_code.name, "rb") as f:
-             lto_code_data = f.read()
-         with open(universal_fatbin_code.name, "rb") as f:
-             universal_fatbin_code_data = f.read()
-         for f in [lto_code, universal_fatbin_code]:
-             f.close()
-             Path(f.name).unlink()
-
-         builder.ltoirs[lto_symbol] = lto_code_data
-         builder.ltoirs_decl[lto_symbol] = f"void {lto_symbol}({dtype}*, unsigned);"
-         builder.fatbins["cholesky"] = universal_fatbin_code_data
+         # generate the LTO
+         lto_symbol, lto_code_data = warp.build.build_lto_solver(
+             M,
+             N,
+             solver,
+             solver_enum,
+             fill_mode,
+             arch,
+             precision_enum,
+             num_threads,
+             parameter_list,
+             builder,
+         )

-         return ((Var(lto_symbol, str, False, True, False), a, out), [], [lto_code_data], 0)
+         return ((Var(lto_symbol, str, False, True, False), a, out), [], [lto_code_data], 0)


  add_builtin(
@@ -6602,57 +6714,36 @@ def tile_cholesky_solve_generic_lto_dispatch_func(
          f"got {y.type.shape[0]} elements in output and {M} rows in 'L'"
      )

-     num_threads = options["block_dim"]
-     arch = options["output_arch"]
-     lto_symbol = f"potrs_{M}_{N}_{arch}_{precision_enum}"
-
-     # early out if LTO for this combination already exists for this module
-     if lto_symbol in builder.ltoirs:
-         return lto_symbol, builder.ltoirs[lto_symbol]
-
-     # otherwise compile LTO
-     lto_code = tempfile.NamedTemporaryFile(prefix="warp", delete=False)
-     universal_fatbin_code = tempfile.NamedTemporaryFile(prefix="warp", delete=False)
+     solver = "potrs"
+     solver_enum = cusolver_function_map[solver]

-     # cuSOLVERDx only support col-major input/outputs,
+     # cuSOLVERDx only supports col-major input/outputs,
      # so we use upper to mimic a row-major input
-     result = warp.context.runtime.core.cuda_compile_solver(
-         universal_fatbin_code.name.encode("utf-8"),
-         lto_code.name.encode("utf-8"),
-         lto_symbol.encode("utf-8"),
-         0,
-         None,
-         None,
-         arch,
-         M,
-         N,
-         cusolver_function_map["potrs"],
-         precision_enum,
-         cusolver_fill_mode_map["upper"],
-         num_threads,
-     )
+     fill_mode = cusolver_fill_mode_map["upper"]

-     if not result:
-         for f in [lto_code, universal_fatbin_code]:
-             f.close()
-             if Path(f.name).exists():
-                 Path(f.name).unlink()
-         raise RuntimeError("Failed to compile tile_cholesky_solve")
+     arch = options["output_arch"]
+     num_threads = options["block_dim"]
+     parameter_list = f"({dtype}*, {dtype}*)"

+     if arch is None or not warp.context.runtime.core.is_mathdx_enabled():
+         # CPU/no-MathDx dispatch
+         return ((0, L, x, y), [], [], 0)
      else:
-         with open(lto_code.name, "rb") as f:
-             lto_code_data = f.read()
-         with open(universal_fatbin_code.name, "rb") as f:
-             universal_fatbin_code_data = f.read()
-         for f in [lto_code, universal_fatbin_code]:
-             f.close()
-             Path(f.name).unlink()
-
-         builder.ltoirs[lto_symbol] = lto_code_data
-         builder.ltoirs_decl[lto_symbol] = f"void {lto_symbol}({dtype}*, {dtype}*);"
-         builder.fatbins["cholesky"] = universal_fatbin_code_data
-
-         return ((Var(lto_symbol, str, False, True, False), L, x, y), [], [lto_code_data], 0)
+         # generate the LTO
+         lto_symbol, lto_code_data = warp.build.build_lto_solver(
+             M,
+             N,
+             solver,
+             solver_enum,
+             fill_mode,
+             arch,
+             precision_enum,
+             num_threads,
+             parameter_list,
+             builder,
+         )
+
+         return ((Var(lto_symbol, str, False, True, False), L, x, y), [], [lto_code_data], 0)


  add_builtin(