PyPI - warp-lang - Versions diffs - 1.6.1__py3-none-win_amd64.whl → 1.7.0__py3-none-win_amd64.whl - Mend

warp-lang 1.6.1__py3-none-win_amd64.whl → 1.7.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (401) hide show

warp/__init__.py +21 -7
warp/autograd.py +14 -6
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +424 -6
warp/build_dll.py +20 -20
warp/builtins.py +467 -368
warp/codegen.py +193 -125
warp/config.py +56 -12
warp/constants.py +14 -6
warp/context.py +524 -277
warp/dlpack.py +22 -12
warp/examples/__init__.py +14 -6
warp/examples/assets/nonuniform.usd +0 -0
warp/examples/assets/nvidia_logo.png +0 -0
warp/examples/benchmarks/benchmark_api.py +14 -6
warp/examples/benchmarks/benchmark_cloth.py +14 -6
warp/examples/benchmarks/benchmark_cloth_cupy.py +14 -6
warp/examples/benchmarks/benchmark_cloth_jax.py +14 -6
warp/examples/benchmarks/benchmark_cloth_numba.py +15 -0
warp/examples/benchmarks/benchmark_cloth_numpy.py +14 -6
warp/examples/benchmarks/benchmark_cloth_paddle.py +14 -6
warp/examples/benchmarks/benchmark_cloth_pytorch.py +14 -6
warp/examples/benchmarks/benchmark_cloth_taichi.py +14 -6
warp/examples/benchmarks/benchmark_cloth_warp.py +14 -6
warp/examples/benchmarks/benchmark_gemm.py +82 -48
warp/examples/benchmarks/benchmark_interop_paddle.py +14 -6
warp/examples/benchmarks/benchmark_interop_torch.py +14 -6
warp/examples/benchmarks/benchmark_launches.py +14 -6
warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
warp/examples/browse.py +14 -6
warp/examples/core/example_cupy.py +14 -6
warp/examples/core/example_dem.py +14 -6
warp/examples/core/example_fluid.py +14 -6
warp/examples/core/example_graph_capture.py +14 -6
warp/examples/core/example_marching_cubes.py +14 -6
warp/examples/core/example_mesh.py +14 -6
warp/examples/core/example_mesh_intersect.py +14 -6
warp/examples/core/example_nvdb.py +14 -6
warp/examples/core/example_raycast.py +14 -6
warp/examples/core/example_raymarch.py +14 -6
warp/examples/core/example_render_opengl.py +14 -6
warp/examples/core/example_sample_mesh.py +300 -0
warp/examples/core/example_sph.py +14 -6
warp/examples/core/example_torch.py +14 -6
warp/examples/core/example_wave.py +14 -6
warp/examples/fem/example_adaptive_grid.py +14 -6
warp/examples/fem/example_apic_fluid.py +15 -7
warp/examples/fem/example_burgers.py +16 -8
warp/examples/fem/example_convection_diffusion.py +14 -6
warp/examples/fem/example_convection_diffusion_dg.py +14 -6
warp/examples/fem/example_deformed_geometry.py +15 -7
warp/examples/fem/example_diffusion.py +14 -6
warp/examples/fem/example_diffusion_3d.py +14 -6
warp/examples/fem/example_diffusion_mgpu.py +14 -6
warp/examples/fem/example_distortion_energy.py +15 -7
warp/examples/fem/example_magnetostatics.py +20 -12
warp/examples/fem/example_mixed_elasticity.py +14 -6
warp/examples/fem/example_navier_stokes.py +14 -6
warp/examples/fem/example_nonconforming_contact.py +14 -6
warp/examples/fem/example_stokes.py +14 -6
warp/examples/fem/example_stokes_transfer.py +14 -6
warp/examples/fem/example_streamlines.py +14 -6
warp/examples/fem/utils.py +24 -3
warp/examples/interop/example_jax_callable.py +116 -0
warp/examples/interop/example_jax_ffi_callback.py +132 -0
warp/examples/interop/example_jax_kernel.py +205 -0
warp/examples/optim/example_bounce.py +14 -6
warp/examples/optim/example_cloth_throw.py +14 -6
warp/examples/optim/example_diffray.py +14 -6
warp/examples/optim/example_drone.py +14 -6
warp/examples/optim/example_fluid_checkpoint.py +497 -0
warp/examples/optim/example_inverse_kinematics.py +14 -6
warp/examples/optim/example_inverse_kinematics_torch.py +14 -6
warp/examples/optim/example_softbody_properties.py +14 -6
warp/examples/optim/example_spring_cage.py +14 -6
warp/examples/optim/example_trajectory.py +14 -6
warp/examples/sim/example_cartpole.py +14 -6
warp/examples/sim/example_cloth.py +14 -6
warp/examples/sim/example_cloth_self_contact.py +14 -6
warp/examples/sim/example_granular.py +14 -6
warp/examples/sim/example_granular_collision_sdf.py +14 -6
warp/examples/sim/example_jacobian_ik.py +14 -6
warp/examples/sim/example_particle_chain.py +14 -6
warp/examples/sim/example_quadruped.py +14 -6
warp/examples/sim/example_rigid_chain.py +14 -6
warp/examples/sim/example_rigid_contact.py +14 -6
warp/examples/sim/example_rigid_force.py +14 -6
warp/examples/sim/example_rigid_gyroscopic.py +14 -6
warp/examples/sim/example_rigid_soft_contact.py +14 -6
warp/examples/sim/example_soft_body.py +14 -6
warp/examples/tile/example_tile_cholesky.py +14 -6
warp/examples/tile/example_tile_convolution.py +14 -6
warp/examples/tile/example_tile_fft.py +14 -6
warp/examples/tile/example_tile_filtering.py +14 -6
warp/examples/tile/example_tile_matmul.py +16 -10
warp/examples/tile/example_tile_mlp.py +14 -6
warp/examples/tile/example_tile_nbody.py +14 -6
warp/examples/tile/example_tile_walker.py +14 -6
warp/fabric.py +15 -0
warp/fem/__init__.py +26 -1
warp/fem/adaptivity.py +19 -4
warp/fem/cache.py +15 -0
warp/fem/dirichlet.py +15 -0
warp/fem/domain.py +15 -0
warp/fem/field/__init__.py +15 -0
warp/fem/field/field.py +15 -0
warp/fem/field/nodal_field.py +37 -68
warp/fem/field/restriction.py +15 -0
warp/fem/field/virtual.py +77 -23
warp/fem/geometry/__init__.py +15 -0
warp/fem/geometry/adaptive_nanogrid.py +24 -10
warp/fem/geometry/closest_point.py +16 -1
warp/fem/geometry/deformed_geometry.py +20 -2
warp/fem/geometry/element.py +15 -0
warp/fem/geometry/geometry.py +20 -0
warp/fem/geometry/grid_2d.py +27 -12
warp/fem/geometry/grid_3d.py +27 -15
warp/fem/geometry/hexmesh.py +20 -7
warp/fem/geometry/nanogrid.py +24 -11
warp/fem/geometry/partition.py +15 -0
warp/fem/geometry/quadmesh.py +28 -13
warp/fem/geometry/tetmesh.py +18 -4
warp/fem/geometry/trimesh.py +18 -8
warp/fem/integrate.py +277 -93
warp/fem/linalg.py +20 -5
warp/fem/operator.py +15 -0
warp/fem/polynomial.py +15 -0
warp/fem/quadrature/__init__.py +15 -0
warp/fem/quadrature/pic_quadrature.py +52 -22
warp/fem/quadrature/quadrature.py +209 -25
warp/fem/space/__init__.py +16 -1
warp/fem/space/basis_function_space.py +19 -2
warp/fem/space/basis_space.py +40 -18
warp/fem/space/dof_mapper.py +15 -0
warp/fem/space/function_space.py +15 -0
warp/fem/space/grid_2d_function_space.py +15 -0
warp/fem/space/grid_3d_function_space.py +15 -0
warp/fem/space/hexmesh_function_space.py +17 -2
warp/fem/space/nanogrid_function_space.py +15 -0
warp/fem/space/partition.py +21 -2
warp/fem/space/quadmesh_function_space.py +23 -8
warp/fem/space/restriction.py +15 -0
warp/fem/space/shape/__init__.py +15 -0
warp/fem/space/shape/cube_shape_function.py +38 -23
warp/fem/space/shape/shape_function.py +15 -0
warp/fem/space/shape/square_shape_function.py +27 -12
warp/fem/space/shape/tet_shape_function.py +15 -0
warp/fem/space/shape/triangle_shape_function.py +16 -1
warp/fem/space/tetmesh_function_space.py +18 -3
warp/fem/space/topology.py +15 -0
warp/fem/space/trimesh_function_space.py +17 -2
warp/fem/types.py +15 -0
warp/fem/utils.py +27 -6
warp/jax.py +28 -7
warp/jax_experimental/__init__.py +16 -0
warp/{jax_experimental.py → jax_experimental/custom_call.py} +28 -33
warp/jax_experimental/ffi.py +698 -0
warp/jax_experimental/xla_ffi.py +602 -0
warp/math.py +103 -6
warp/native/array.h +28 -6
warp/native/builtin.h +44 -9
warp/native/bvh.cpp +18 -7
warp/native/bvh.cu +57 -20
warp/native/bvh.h +17 -7
warp/native/clang/clang.cpp +45 -9
warp/native/coloring.cpp +15 -6
warp/native/crt.cpp +15 -6
warp/native/crt.h +15 -6
warp/native/cuda_crt.h +15 -6
warp/native/cuda_util.cpp +29 -6
warp/native/cuda_util.h +17 -6
warp/native/error.cpp +15 -6
warp/native/error.h +15 -6
warp/native/exports.h +85 -63
warp/native/fabric.h +15 -6
warp/native/hashgrid.cpp +15 -6
warp/native/hashgrid.cu +15 -6
warp/native/hashgrid.h +15 -6
warp/native/initializer_array.h +15 -6
warp/native/intersect.h +41 -32
warp/native/intersect_adj.h +48 -39
warp/native/intersect_tri.h +17 -0
warp/native/marching.cpp +16 -0
warp/native/marching.cu +16 -7
warp/native/marching.h +17 -0
warp/native/mat.h +528 -15
warp/native/mathdx.cpp +15 -6
warp/native/matnn.h +15 -6
warp/native/mesh.cpp +15 -6
warp/native/mesh.cu +15 -6
warp/native/mesh.h +25 -16
warp/native/noise.h +15 -6
warp/native/quat.h +114 -17
warp/native/rand.h +21 -6
warp/native/range.h +15 -6
warp/native/reduce.cpp +15 -6
warp/native/reduce.cu +15 -6
warp/native/runlength_encode.cpp +15 -6
warp/native/runlength_encode.cu +15 -6
warp/native/scan.cpp +15 -6
warp/native/scan.cu +15 -6
warp/native/scan.h +15 -6
warp/native/solid_angle.h +17 -0
warp/native/sort.cpp +137 -65
warp/native/sort.cu +167 -21
warp/native/sort.h +23 -7
warp/native/sparse.cpp +58 -28
warp/native/sparse.cu +67 -23
warp/native/spatial.h +15 -6
warp/native/svd.h +131 -6
warp/native/temp_buffer.h +15 -6
warp/native/tile.h +316 -111
warp/native/tile_reduce.h +61 -9
warp/native/vec.h +83 -13
warp/native/volume.cpp +100 -119
warp/native/volume.cu +15 -6
warp/native/volume.h +15 -6
warp/native/volume_builder.cu +40 -16
warp/native/volume_builder.h +21 -6
warp/native/volume_impl.h +15 -6
warp/native/warp.cpp +20 -12
warp/native/warp.cu +114 -16
warp/native/warp.h +34 -16
warp/optim/__init__.py +14 -6
warp/optim/adam.py +14 -6
warp/optim/linear.py +25 -10
warp/optim/sgd.py +14 -6
warp/paddle.py +14 -6
warp/render/__init__.py +14 -6
warp/render/render_opengl.py +14 -6
warp/render/render_usd.py +14 -6
warp/render/utils.py +14 -6
warp/sim/__init__.py +14 -7
warp/sim/articulation.py +18 -10
warp/sim/collide.py +35 -16
warp/sim/graph_coloring.py +14 -6
warp/sim/import_mjcf.py +463 -162
warp/sim/import_snu.py +14 -7
warp/sim/import_urdf.py +46 -18
warp/sim/import_usd.py +14 -7
warp/sim/inertia.py +14 -6
warp/sim/integrator.py +14 -6
warp/sim/integrator_euler.py +19 -11
warp/sim/integrator_featherstone.py +17 -16
warp/sim/integrator_vbd.py +222 -8
warp/sim/integrator_xpbd.py +19 -11
warp/sim/model.py +56 -19
warp/sim/particles.py +14 -6
warp/sim/render.py +14 -6
warp/sim/utils.py +17 -2
warp/sparse.py +657 -555
warp/stubs.py +231 -19
warp/tape.py +14 -6
warp/tests/aux_test_class_kernel.py +14 -6
warp/tests/aux_test_compile_consts_dummy.py +14 -6
warp/tests/aux_test_conditional_unequal_types_kernels.py +14 -6
warp/tests/aux_test_dependent.py +14 -6
warp/tests/aux_test_grad_customs.py +14 -6
warp/tests/aux_test_instancing_gc.py +14 -6
warp/tests/aux_test_module_unload.py +14 -6
warp/tests/aux_test_name_clash1.py +14 -6
warp/tests/aux_test_name_clash2.py +14 -6
warp/tests/aux_test_unresolved_func.py +14 -6
warp/tests/aux_test_unresolved_symbol.py +14 -6
warp/tests/cuda/__init__.py +0 -0
warp/tests/{test_async.py → cuda/test_async.py} +14 -6
warp/tests/{test_ipc.py → cuda/test_ipc.py} +14 -6
warp/tests/{test_mempool.py → cuda/test_mempool.py} +53 -6
warp/tests/{test_multigpu.py → cuda/test_multigpu.py} +14 -6
warp/tests/{test_peer.py → cuda/test_peer.py} +14 -6
warp/tests/{test_pinned.py → cuda/test_pinned.py} +14 -6
warp/tests/{test_streams.py → cuda/test_streams.py} +85 -6
warp/tests/geometry/__init__.py +0 -0
warp/tests/{test_bvh.py → geometry/test_bvh.py} +14 -6
warp/tests/{test_hash_grid.py → geometry/test_hash_grid.py} +14 -6
warp/tests/{test_marching_cubes.py → geometry/test_marching_cubes.py} +14 -6
warp/tests/{test_mesh.py → geometry/test_mesh.py} +14 -6
warp/tests/{test_mesh_query_aabb.py → geometry/test_mesh_query_aabb.py} +14 -6
warp/tests/{test_mesh_query_point.py → geometry/test_mesh_query_point.py} +80 -69
warp/tests/{test_mesh_query_ray.py → geometry/test_mesh_query_ray.py} +15 -7
warp/tests/{test_volume.py → geometry/test_volume.py} +55 -12
warp/tests/{test_volume_write.py → geometry/test_volume_write.py} +14 -6
warp/tests/interop/__init__.py +0 -0
warp/tests/{test_dlpack.py → interop/test_dlpack.py} +42 -11
warp/tests/{test_jax.py → interop/test_jax.py} +14 -6
warp/tests/{test_paddle.py → interop/test_paddle.py} +14 -6
warp/tests/{test_torch.py → interop/test_torch.py} +14 -6
warp/tests/run_coverage_serial.py +14 -6
warp/tests/sim/__init__.py +0 -0
warp/tests/{disabled_kinematics.py → sim/disabled_kinematics.py} +23 -16
warp/tests/{flaky_test_sim_grad.py → sim/flaky_test_sim_grad.py} +14 -6
warp/tests/{test_collision.py → sim/test_collision.py} +16 -8
warp/tests/{test_coloring.py → sim/test_coloring.py} +14 -7
warp/tests/{test_model.py → sim/test_model.py} +55 -7
warp/tests/{test_sim_grad_bounce_linear.py → sim/test_sim_grad_bounce_linear.py} +14 -6
warp/tests/{test_sim_kinematics.py → sim/test_sim_kinematics.py} +16 -7
warp/tests/sim/test_vbd.py +597 -0
warp/tests/test_adam.py +14 -6
warp/tests/test_arithmetic.py +14 -6
warp/tests/test_array.py +14 -6
warp/tests/test_array_reduce.py +14 -6
warp/tests/test_assert.py +14 -6
warp/tests/test_atomic.py +14 -6
warp/tests/test_bool.py +15 -7
warp/tests/test_builtins_resolution.py +14 -6
warp/tests/test_closest_point_edge_edge.py +14 -6
warp/tests/test_codegen.py +14 -6
warp/tests/test_codegen_instancing.py +14 -6
warp/tests/test_compile_consts.py +14 -6
warp/tests/test_conditional.py +14 -6
warp/tests/test_context.py +14 -6
warp/tests/test_copy.py +14 -6
warp/tests/test_ctypes.py +14 -6
warp/tests/test_dense.py +14 -6
warp/tests/test_devices.py +14 -6
warp/tests/test_examples.py +42 -42
warp/tests/test_fabricarray.py +14 -6
warp/tests/test_fast_math.py +14 -6
warp/tests/test_fem.py +37 -10
warp/tests/test_fp16.py +14 -6
warp/tests/test_func.py +14 -6
warp/tests/test_future_annotations.py +14 -6
warp/tests/test_generics.py +14 -6
warp/tests/test_grad.py +14 -6
warp/tests/test_grad_customs.py +14 -6
warp/tests/test_grad_debug.py +14 -6
warp/tests/test_implicit_init.py +14 -6
warp/tests/test_import.py +14 -6
warp/tests/test_indexedarray.py +14 -6
warp/tests/test_intersect.py +14 -6
warp/tests/test_iter.py +14 -6
warp/tests/test_large.py +14 -6
warp/tests/test_launch.py +14 -6
warp/tests/test_lerp.py +14 -6
warp/tests/test_linear_solvers.py +15 -11
warp/tests/test_lvalue.py +14 -6
warp/tests/test_mat.py +247 -85
warp/tests/test_mat_lite.py +14 -6
warp/tests/test_mat_scalar_ops.py +18 -10
warp/tests/test_math.py +14 -6
warp/tests/test_mlp.py +14 -6
warp/tests/test_module_hashing.py +14 -6
warp/tests/test_modules_lite.py +14 -6
warp/tests/test_noise.py +14 -6
warp/tests/test_operators.py +14 -6
warp/tests/test_options.py +14 -6
warp/tests/test_overwrite.py +15 -60
warp/tests/test_print.py +14 -6
warp/tests/test_quat.py +81 -52
warp/tests/test_rand.py +58 -43
warp/tests/test_reload.py +14 -6
warp/tests/test_rounding.py +14 -6
warp/tests/test_runlength_encode.py +14 -6
warp/tests/test_scalar_ops.py +14 -6
warp/tests/test_smoothstep.py +14 -6
warp/tests/test_snippet.py +15 -0
warp/tests/test_sparse.py +61 -12
warp/tests/test_spatial.py +89 -6
warp/tests/test_special_values.py +14 -6
warp/tests/test_static.py +15 -7
warp/tests/test_struct.py +14 -6
warp/tests/test_tape.py +14 -6
warp/tests/test_transient_module.py +14 -6
warp/tests/test_triangle_closest_point.py +14 -6
warp/tests/test_types.py +14 -6
warp/tests/test_utils.py +98 -10
warp/tests/test_vec.py +60 -40
warp/tests/test_vec_lite.py +14 -6
warp/tests/test_vec_scalar_ops.py +14 -6
warp/tests/test_verify_fp.py +14 -6
warp/tests/tile/__init__.py +0 -0
warp/tests/{test_tile.py → tile/test_tile.py} +150 -57
warp/tests/{test_tile_load.py → tile/test_tile_load.py} +15 -7
warp/tests/{test_tile_mathdx.py → tile/test_tile_mathdx.py} +23 -12
warp/tests/{test_tile_mlp.py → tile/test_tile_mlp.py} +39 -20
warp/tests/{test_tile_reduce.py → tile/test_tile_reduce.py} +74 -7
warp/tests/{test_tile_shared_memory.py → tile/test_tile_shared_memory.py} +14 -6
warp/tests/{test_tile_view.py → tile/test_tile_view.py} +15 -7
warp/tests/unittest_serial.py +15 -6
warp/tests/unittest_suites.py +59 -65
warp/tests/unittest_utils.py +16 -7
warp/tests/walkthrough_debug.py +14 -6
warp/thirdparty/unittest_parallel.py +15 -8
warp/torch.py +14 -6
warp/types.py +124 -664
warp/utils.py +151 -78
{warp_lang-1.6.1.dist-info → warp_lang-1.7.0.dist-info}/METADATA +39 -12
warp_lang-1.7.0.dist-info/RECORD +429 -0
{warp_lang-1.6.1.dist-info → warp_lang-1.7.0.dist-info}/WHEEL +1 -1
warp_lang-1.7.0.dist-info/licenses/LICENSE.md +202 -0
warp/examples/optim/example_walker.py +0 -309
warp/native/cutlass_gemm.cpp +0 -34
warp/native/cutlass_gemm.cu +0 -373
warp/tests/test_matmul.py +0 -503
warp/tests/test_matmul_lite.py +0 -403
warp/tests/test_vbd.py +0 -378
warp/tests/unused_test_misc.py +0 -69
warp_lang-1.6.1.dist-info/LICENSE.md +0 -126
warp_lang-1.6.1.dist-info/RECORD +0 -419
{warp_lang-1.6.1.dist-info → warp_lang-1.7.0.dist-info}/top_level.txt +0 -0

warp/builtins.py CHANGED Viewed

@@ -1,15 +1,24 @@
-# Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
-# NVIDIA CORPORATION and its licensors retain all intellectual property
-# and proprietary rights in and to this software, related documentation
-# and any modifications thereto.  Any use, reproduction, disclosure or
-# distribution of this software and related documentation without an express
-# license agreement from NVIDIA CORPORATION is strictly prohibited.
+# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import builtins
 import functools
-import tempfile
-from pathlib import Path
 from typing import Any, Callable, Mapping, Sequence
+import warp.build
+import warp.context
 from warp.codegen import Reference, Var, strip_reference
 from warp.types import *
@@ -32,7 +41,7 @@ def sametypes(arg_types: Mapping[str, Any]):
     return all(types_equal(arg_type_0, t) for t in arg_types_iter)
-def sametypes_create_value_func(default):
+def sametypes_create_value_func(default: TypeVar):
     def fn(arg_types, arg_values):
         if arg_types is None:
             return default
@@ -390,7 +399,7 @@ add_builtin(
 )
-def scalar_infer_type(arg_types: Mapping[str, type]):
+def scalar_infer_type(arg_types: Union[Mapping[str, type], Tuple[type, ...], None]):
     if arg_types is None:
         return Scalar
@@ -941,6 +950,12 @@ def matrix_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, An
             raise RuntimeError("the `shape` argument must be specified when initializing a matrix by value")
         if all(type_is_vector(x) for x in variadic_arg_types):
+            warp.utils.warn(
+                "the built-in `wp.matrix()` won't support taking column vectors as input "
+                "in the future. Use `wp.matrix_from_rows()` or `wp.matrix_from_cols()` instead.",
+                DeprecationWarning,
+            )
             if shape[1] != variadic_arg_count:
                 raise RuntimeError(
                     f"incompatible number of column vectors given ({variadic_arg_count}) "
@@ -1021,6 +1036,86 @@ add_builtin(
 )
+def matrix_from_vecs_create_value_func(cols: bool):
+    def fn(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+        if arg_types is None:
+            return matrix(shape=(Any, Any), dtype=Scalar)
+        variadic_arg_types = arg_types.get("args", ())
+        variadic_arg_count = len(variadic_arg_types)
+        if not all(type_is_vector(x) for x in variadic_arg_types):
+            raise RuntimeError("all arguments are expected to be vectors")
+        length = variadic_arg_types[0]._length_
+        if any(x._length_ != length for x in variadic_arg_types):
+            raise RuntimeError("all vectors are expected to have the same length")
+        dtype = variadic_arg_types[0]._wp_scalar_type_
+        if any(x._wp_scalar_type_ != dtype for x in variadic_arg_types):
+            raise RuntimeError("all vectors are expected to have the same dtype")
+        shape = (length, variadic_arg_count) if cols else (variadic_arg_count, length)
+        return matrix(shape=shape, dtype=dtype)
+    return fn
+def matrix_from_vecs_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    # We're in the codegen stage where we emit the code calling the built-in.
+    # Further validate the given argument values if needed and map them
+    # to the underlying C++ function's runtime and template params.
+    shape = return_type._shape_
+    dtype = return_type._wp_scalar_type_
+    variadic_args = args.get("args", ())
+    func_args = variadic_args
+    if shape in ((2, 2), (3, 3), (4, 4)):
+        # Template specializations exist for these shapes, don't pass them
+        # as template parameters.
+        template_args = (dtype,)
+    else:
+        template_args = (*shape, dtype)
+    return (func_args, template_args)
+def matrix_from_vecs_initializer_list_func(args, return_type):
+    shape = return_type._shape_
+    return shape[0] != shape[1] or shape[0] > 4
+add_builtin(
+    "matrix_from_cols",
+    input_types={"*args": vector(length=Any, dtype=Scalar)},
+    variadic=True,
+    value_func=matrix_from_vecs_create_value_func(cols=True),
+    dispatch_func=matrix_from_vecs_dispatch_func,
+    initializer_list_func=matrix_from_vecs_initializer_list_func,
+    native_func="matrix_from_cols",
+    doc="Construct a matrix from column vectors.",
+    group="Vector Math",
+    export=False,
+)
+add_builtin(
+    "matrix_from_rows",
+    input_types={"*args": vector(length=Any, dtype=Scalar)},
+    variadic=True,
+    value_func=matrix_from_vecs_create_value_func(cols=False),
+    dispatch_func=matrix_from_vecs_dispatch_func,
+    initializer_list_func=matrix_from_vecs_initializer_list_func,
+    native_func="matrix_from_rows",
+    doc="Construct a matrix from row vectors.",
+    group="Vector Math",
+    export=False,
+)
 def identity_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
     if arg_types is None:
         return matrix(shape=(Any, Any), dtype=Scalar)
@@ -1132,6 +1227,21 @@ add_builtin(
     while the left and right basis vectors are returned in ``U`` and ``V``.""",
 )
+add_builtin(
+    "svd2",
+    input_types={
+        "A": matrix(shape=(2, 2), dtype=Float),
+        "U": matrix(shape=(2, 2), dtype=Float),
+        "sigma": vector(length=2, dtype=Float),
+        "V": matrix(shape=(2, 2), dtype=Scalar),
+    },
+    value_type=None,
+    group="Vector Math",
+    export=False,
+    doc="""Compute the SVD of a 2x2 matrix ``A``. The singular values are returned in ``sigma``,
+    while the left and right basis vectors are returned in ``U`` and ``V``.""",
+)
 add_builtin(
     "qr3",
     input_types={
@@ -1323,7 +1433,18 @@ add_builtin(
     input_types={"mat": matrix(shape=(3, 3), dtype=Float)},
     value_func=lambda arg_types, arg_values: quaternion(dtype=float_infer_type(arg_types)),
     group="Quaternion Math",
-    doc="Construct a quaternion from a 3x3 matrix.",
+    doc="""Construct a quaternion from a 3x3 matrix.
+    If the matrix is not a pure rotation, but for example includes scaling or skewing, the result is undefined.""",
+)
+add_builtin(
+    "quat_from_matrix",
+    input_types={"mat": matrix(shape=(4, 4), dtype=Float)},
+    value_func=lambda arg_types, arg_values: quaternion(dtype=float_infer_type(arg_types)),
+    group="Quaternion Math",
+    doc="""Construct a quaternion from a 4x4 matrix.
+    If the top-left 3x3 block of the matrix is not a pure rotation, but for example includes scaling or skewing, the result is undefined.""",
 )
 add_builtin(
     "quat_rpy",
@@ -2366,7 +2487,7 @@ add_builtin(
     This function converts values computed using scalar kernel code to a tile representation for input into collective operations.
-    * If the input value is a scalar, then the resulting tile has ``shape=(1, block_dim)``
+    * If the input value is a scalar, then the resulting tile has ``shape=(block_dim,)``
     * If the input value is a vector, then the resulting tile has ``shape=(length(vector), block_dim)``
     :param x: A per-thread local value, e.g. scalar, vector, or matrix.
@@ -2660,11 +2781,9 @@ def tile_broadcast_value_func(arg_types, arg_values):
 def tile_broadcast_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
     tile = arg_values["a"]
-    template_args = []
-    template_args.append(return_type.shape[0])
-    template_args.append(return_type.shape[1])
-    template_args.append(return_type.strides[0])
-    template_args.append(return_type.strides[1])
+    assert len(return_type.shape) == len(return_type.strides)
+    assert 1 <= len(return_type.shape) <= 4
+    template_args = [*return_type.shape, *return_type.strides]
     return ((tile,), template_args)
@@ -2677,56 +2796,17 @@ add_builtin(
     variadic=False,
     doc="""Broadcast a tile.
-    This function will attempt to broadcast the input tile ``a`` to the destination shape (m, n).
+    Broadcasts the input tile ``a`` to the destination shape.
     Broadcasting follows NumPy broadcast rules.
     :param a: Tile to broadcast
     :param shape: The shape to broadcast to
-    :returns: Tile with broadcast ``shape=(m, n)``""",
+    :returns: Tile with broadcast shape""",
     group="Tile Primitives",
     export=False,
 )
-def tile_matmul_value_func(arg_types, arg_values):
-    # return generic type (for doc builds)
-    if arg_types is None:
-        return Tile(dtype=Any, shape=Any)
-    if len(arg_types) != 3:
-        raise TypeError(f"tile_matmul() takes exactly 3 positional arguments but {len(arg_types)} were given")
-    return None
-def tile_matmul_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):
-    a = arg_values["a"]
-    b = arg_values["b"]
-    out = arg_values["out"]
-    # force the storage type of the input variables to shared memory
-    a.type.storage = "shared"
-    b.type.storage = "shared"
-    out.type.storage = "shared"
-    template_args = []
-    return ((a, b, out), template_args)
-add_builtin(
-    "tile_matmul_scalar",
-    input_types={"a": Tile, "b": Tile, "out": Tile},
-    value_func=tile_matmul_value_func,
-    dispatch_func=tile_matmul_dispatch_func,
-    variadic=True,
-    doc="Compute matrix product and accumulate out += a*b.",
-    group="Tile Primitives",
-    hidden=True,
-    export=False,
-)
 def tile_sum_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
@@ -3021,7 +3101,7 @@ def tile_binary_map_value_func(arg_types, arg_values):
     for i in range(len(a.shape)):
         if a.shape[i] != b.shape[i]:
-            raise ValueError(f"tile_map() shapes do not match on dimension {i}, got {a.shape[i]} and {b.shape[i]}")
+            raise ValueError(f"tile_map() shapes do not match on dimension {i}, got {a.shape} and {b.shape}")
     return TileBinaryMap(a, b)
@@ -3798,6 +3878,18 @@ _volume_supported_value_types = {
 }
+def _is_volume_type_supported(dtype):
+    for typ in _volume_supported_value_types:
+        if types_equal(typ, dtype):
+            return True
+    return False
+def _check_volume_type_is_supported(dtype):
+    if not _is_volume_type_supported(dtype):
+        raise RuntimeError(f"unsupported volume type `{type_repr(dtype)}`")
 def check_volume_value_grad_compatibility(dtype, grad_dtype):
     if type_is_vector(dtype):
         expected = matrix(shape=(type_length(dtype), 3), dtype=type_scalar_type(dtype))
@@ -3813,9 +3905,7 @@ def volume_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, An
         return Any
     dtype = arg_values["dtype"]
-    if dtype not in _volume_supported_value_types:
-        raise RuntimeError(f"unsupported volume type `{dtype.__name__}`")
+    _check_volume_type_is_supported(dtype)
     return dtype
@@ -3851,9 +3941,7 @@ def volume_sample_grad_value_func(arg_types: Mapping[str, type], arg_values: Map
         return Any
     dtype = arg_values["dtype"]
-    if dtype not in _volume_supported_value_types:
-        raise RuntimeError(f"unsupported volume type `{dtype.__name__}`")
+    _check_volume_type_is_supported(dtype)
     check_volume_value_grad_compatibility(dtype, arg_types["grad"])
@@ -3891,9 +3979,7 @@ def volume_lookup_value_func(arg_types: Mapping[str, type], arg_values: Mapping[
         return Any
     dtype = arg_values["dtype"]
-    if dtype not in _volume_supported_value_types:
-        raise RuntimeError(f"unsupported volume type `{dtype.__name__}`")
+    _check_volume_type_is_supported(dtype)
     return dtype
@@ -3930,9 +4016,7 @@ def volume_store_value_func(arg_types: Mapping[str, type], arg_values: Mapping[s
         return None
     dtype = arg_types["value"]
-    if dtype not in _volume_supported_value_types:
-        raise RuntimeError(f"unsupported volume type `{dtype.__name__}`")
+    _check_volume_type_is_supported(dtype)
     return None
@@ -4182,6 +4266,20 @@ add_builtin(
     group="Random",
     doc="Return a random integer between [low, high).",
 )
+add_builtin(
+    "randu",
+    input_types={"state": uint32},
+    value_type=uint32,
+    group="Random",
+    doc="Return a random unsigned integer in the range [0, 2^32).",
+)
+add_builtin(
+    "randu",
+    input_types={"state": uint32, "low": uint32, "high": uint32},
+    value_type=uint32,
+    group="Random",
+    doc="Return a random unsigned integer between [low, high).",
+)
 add_builtin(
     "randf",
     input_types={"state": uint32},
@@ -4490,11 +4588,31 @@ add_builtin(
     export=False,
     group="Utility",
 )
+def select_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    warp.utils.warn(
+        "wp.select() is deprecated and will be removed in a future\n"
+        "version. Use wp.where(cond, value_if_true, value_if_false) instead.",
+        category=DeprecationWarning,
+    )
+    func_args = tuple(args.values())
+    template_args = ()
+    return (func_args, template_args)
 add_builtin(
     "select",
     input_types={"cond": builtins.bool, "value_if_false": Any, "value_if_true": Any},
     value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
-    doc="Select between two arguments, if ``cond`` is ``False`` then return ``value_if_false``, otherwise return ``value_if_true``",
+    dispatch_func=select_dispatch_func,
+    doc="""Select between two arguments, if ``cond`` is ``False`` then return ``value_if_false``, otherwise return ``value_if_true``.
+    .. deprecated:: 1.7
+         Use :func:`where` instead, which has the more intuitive argument order:
+         ``where(cond, value_if_true, value_if_false)``.""",
     group="Utility",
 )
 for t in int_types:
@@ -4502,14 +4620,47 @@ for t in int_types:
         "select",
         input_types={"cond": t, "value_if_false": Any, "value_if_true": Any},
         value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
-        doc="Select between two arguments, if ``cond`` is ``False`` then return ``value_if_false``, otherwise return ``value_if_true``",
+        dispatch_func=select_dispatch_func,
+        doc="""Select between two arguments, if ``cond`` is ``False`` then return ``value_if_false``, otherwise return ``value_if_true``.
+    .. deprecated:: 1.7
+         Use :func:`where` instead, which has the more intuitive argument order:
+         ``where(cond, value_if_true, value_if_false)``.""",
         group="Utility",
     )
 add_builtin(
     "select",
     input_types={"arr": array(dtype=Any), "value_if_false": Any, "value_if_true": Any},
     value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
-    doc="Select between two arguments, if ``arr`` is null then return ``value_if_false``, otherwise return ``value_if_true``",
+    dispatch_func=select_dispatch_func,
+    doc="""Select between two arguments, if ``arr`` is null then return ``value_if_false``, otherwise return ``value_if_true``.
+    .. deprecated:: 1.7
+         Use :func:`where` instead, which has the more intuitive argument order:
+         ``where(arr, value_if_true, value_if_false)``.""",
+    group="Utility",
+)
+add_builtin(
+    "where",
+    input_types={"cond": builtins.bool, "value_if_true": Any, "value_if_false": Any},
+    value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
+    doc="Select between two arguments, if ``cond`` is ``True`` then return ``value_if_true``, otherwise return ``value_if_false``.",
+    group="Utility",
+)
+for t in int_types:
+    add_builtin(
+        "where",
+        input_types={"cond": t, "value_if_true": Any, "value_if_false": Any},
+        value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
+        doc="Select between two arguments, if ``cond`` is ``True`` then return ``value_if_true``, otherwise return ``value_if_false``.",
+        group="Utility",
+    )
+add_builtin(
+    "where",
+    input_types={"arr": array(dtype=Any), "value_if_true": Any, "value_if_false": Any},
+    value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
+    doc="Select between two arguments, if ``arr`` is not null then return ``value_if_true``, otherwise return ``value_if_false``.",
     group="Utility",
 )
@@ -5103,33 +5254,51 @@ add_builtin(
 )
+# implements vector[index] = value
+add_builtin(
+    "assign_inplace",
+    input_types={"a": vector(length=Any, dtype=Scalar), "i": int, "value": Scalar},
+    value_type=None,
+    hidden=True,
+    group="Utility",
+)
+# implements quaternion[index] = value
+add_builtin(
+    "assign_inplace",
+    input_types={"a": quaternion(dtype=Scalar), "i": int, "value": Scalar},
+    value_type=None,
+    hidden=True,
+    group="Utility",
+)
 def vector_assign_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
     vec_type = arg_types["a"]
     return vec_type
-# implements vector[index] = value
+# implements vector[index] = value, performs a copy internally if wp.config.enable_vector_component_overwrites is True
 add_builtin(
-    "assign",
+    "assign_copy",
     input_types={"a": vector(length=Any, dtype=Scalar), "i": int, "value": Scalar},
     value_func=vector_assign_value_func,
     hidden=True,
     group="Utility",
 )
-# implements quaternion[index] = value
+# implements quaternion[index] = value, performs a copy internally if wp.config.enable_vector_component_overwrites is True
 add_builtin(
-    "assign",
+    "assign_copy",
     input_types={"a": quaternion(dtype=Scalar), "i": int, "value": Scalar},
     value_func=vector_assign_value_func,
     hidden=True,
     group="Utility",
 )
 # implements vector[idx] += scalar
 add_builtin(
-    "augassign_add",
+    "add_inplace",
     input_types={"a": vector(length=Any, dtype=Scalar), "i": int, "value": Scalar},
     value_type=None,
     hidden=True,
@@ -5138,7 +5307,7 @@ add_builtin(
 # implements quaternion[idx] += scalar
 add_builtin(
-    "augassign_add",
+    "add_inplace",
     input_types={"a": quaternion(dtype=Scalar), "i": int, "value": Scalar},
     value_type=None,
     hidden=True,
@@ -5147,7 +5316,7 @@ add_builtin(
 # implements vector[idx] -= scalar
 add_builtin(
-    "augassign_sub",
+    "sub_inplace",
     input_types={"a": vector(length=Any, dtype=Scalar), "i": int, "value": Scalar},
     value_type=None,
     hidden=True,
@@ -5156,7 +5325,7 @@ add_builtin(
 # implements quaternion[idx] -= scalar
 add_builtin(
-    "augassign_sub",
+    "sub_inplace",
     input_types={"a": quaternion(dtype=Scalar), "i": int, "value": Scalar},
     value_type=None,
     hidden=True,
@@ -5200,11 +5369,6 @@ add_builtin(
 )
-def matrix_assign_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
-    mat_type = arg_types["a"]
-    return mat_type
 def matrix_vector_sametype(arg_types: Mapping[str, Any]):
     mat_size = arg_types["a"]._shape_[0]
     vec_size = arg_types["value"]._length_
@@ -5215,7 +5379,33 @@ def matrix_vector_sametype(arg_types: Mapping[str, Any]):
 # implements matrix[i,j] = scalar
 add_builtin(
-    "assign",
+    "assign_inplace",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "j": int, "value": Scalar},
+    value_type=None,
+    hidden=True,
+    group="Utility",
+)
+# implements matrix[i] = vector
+add_builtin(
+    "assign_inplace",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "value": vector(length=Any, dtype=Scalar)},
+    constraint=matrix_vector_sametype,
+    value_type=None,
+    hidden=True,
+    group="Utility",
+)
+def matrix_assign_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
+    mat_type = arg_types["a"]
+    return mat_type
+# implements matrix[i,j] = scalar
+add_builtin(
+    "assign_copy",
     input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "j": int, "value": Scalar},
     value_func=matrix_assign_value_func,
     hidden=True,
@@ -5225,7 +5415,7 @@ add_builtin(
 # implements matrix[i] = vector
 add_builtin(
-    "assign",
+    "assign_copy",
     input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "value": vector(length=Any, dtype=Scalar)},
     constraint=matrix_vector_sametype,
     value_func=matrix_assign_value_func,
@@ -5236,7 +5426,7 @@ add_builtin(
 # implements matrix[i,j] += scalar
 add_builtin(
-    "augassign_add",
+    "add_inplace",
     input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "j": int, "value": Scalar},
     value_type=None,
     hidden=True,
@@ -5244,9 +5434,20 @@ add_builtin(
 )
+# implements matrix[i] += vector
+add_builtin(
+    "add_inplace",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "value": vector(length=Any, dtype=Scalar)},
+    constraint=matrix_vector_sametype,
+    value_type=None,
+    hidden=True,
+    group="Utility",
+)
 # implements matrix[i,j] -= scalar
 add_builtin(
-    "augassign_sub",
+    "sub_inplace",
     input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "j": int, "value": Scalar},
     value_type=None,
     hidden=True,
@@ -5254,6 +5455,16 @@ add_builtin(
 )
+# implements matrix[i] -= vector
+add_builtin(
+    "sub_inplace",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": int, "value": vector(length=Any, dtype=Scalar)},
+    value_type=None,
+    hidden=True,
+    group="Utility",
+)
 for t in scalar_types + vector_types + (bool,):
     if "vec" in t.__name__ or "mat" in t.__name__:
         continue
@@ -5401,7 +5612,27 @@ add_builtin(
 )
 add_builtin(
     "expect_near",
-    input_types={"a": vec3, "b": vec3, "tolerance": float},
+    input_types={"a": vector(length=Any, dtype=Float), "b": vector(length=Any, dtype=Float), "tolerance": Float},
+    defaults={"tolerance": 1.0e-6},
+    value_type=None,
+    doc="Prints an error to stdout if any element of ``a`` and ``b`` are not closer than tolerance in magnitude",
+    group="Utility",
+)
+add_builtin(
+    "expect_near",
+    input_types={"a": quaternion(dtype=Float), "b": quaternion(dtype=Float), "tolerance": Float},
+    defaults={"tolerance": 1.0e-6},
+    value_type=None,
+    doc="Prints an error to stdout if any element of ``a`` and ``b`` are not closer than tolerance in magnitude",
+    group="Utility",
+)
+add_builtin(
+    "expect_near",
+    input_types={
+        "a": matrix(shape=(Any, Any), dtype=Float),
+        "b": matrix(shape=(Any, Any), dtype=Float),
+        "tolerance": Float,
+    },
     defaults={"tolerance": 1.0e-6},
     value_type=None,
     doc="Prints an error to stdout if any element of ``a`` and ``b`` are not closer than tolerance in magnitude",
@@ -5980,7 +6211,7 @@ add_builtin(
 ##
 ## Matmul
 ##
-def tile_matmul_generic_value_func(arg_types, arg_values):
+def tile_matmul_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
         return Tile(dtype=Any, shape=Any)
@@ -6006,7 +6237,7 @@ def tile_matmul_generic_value_func(arg_types, arg_values):
     return None
-def tile_matmul_generic_lto_dispatch_func(
+def tile_matmul_lto_dispatch_func(
     arg_types: Mapping[str, type],
     return_type: Any,
     return_values: List[Var],
@@ -6045,142 +6276,82 @@ def tile_matmul_generic_lto_dispatch_func(
     out.type.storage = "shared"
     template_args = [accumulate]
-    # Maps Python/Warp types to C++ types and enums
-    def cublasdx_type_map(dtype):
-        if dtype == float16:
-            return ("wp::float16", 3, 0)
-        if dtype == float32:
-            return ("wp::float32", 5, 0)
-        if dtype == float64:
-            return ("wp::float64", 6, 0)
-        if dtype == vec2h:
-            return ("wp::vec2h", 3, 1)
-        if dtype == vec2f:
-            return ("wp::vec2f", 5, 1)
-        if dtype == vec2d:
-            return ("wp::vec2d", 6, 1)
-        raise TypeError("Unsupported input type in tile_matmul")
-    def cublasdx_arrangement_map(layout):
-        if layout == "colmajor":
-            return 0  # CUBLASDX_ARRANGEMENT_COL_MAJOR
-        if layout == "rowmajor":
-            return 1  # CUBLASDX_ARRANGEMENT_ROW_MAJOR
-        raise ValueError("Unsupported layout in tile_matmul")
-    # generate the LTO
     M, K = a.type.shape[0], a.type.shape[1]
     _, N = b.type.shape[0], b.type.shape[1]
     num_threads = options["block_dim"]
     arch = options["output_arch"]
-    def make_function(M, N, K, adtype, bdtype, cdtype, alayout, blayout, clayout):
-        (a_dtype, a_prec, a_type) = cublasdx_type_map(adtype)
-        (b_dtype, b_prec, b_type) = cublasdx_type_map(bdtype)
-        (c_dtype, c_prec, c_type) = cublasdx_type_map(cdtype)
-        a_arrangement = cublasdx_arrangement_map(alayout)
-        b_arrangement = cublasdx_arrangement_map(blayout)
-        c_arrangement = cublasdx_arrangement_map(clayout)
-        if a_type != b_type or a_type != c_type:
-            raise TypeError("time_matmul(A, B, C) requires all inputs to be real or complex")
-        element_type = a_type
-        lto_symbol = f"dot_{M}_{N}_{K}_{arch}_{num_threads}_{a_arrangement}_{b_arrangement}_{c_arrangement}_{a_prec}_{b_prec}_{c_prec}_{element_type}"
+    if arch is None or not warp.context.runtime.core.is_mathdx_enabled():
+        # CPU/no-MathDx dispatch
+        return ((0, 0, 0, a, b, out), template_args, [], 0)
+    else:
-        # early out if LTO for this combination already exists for this module
-        if lto_symbol in builder.ltoirs:
-            return lto_symbol, builder.ltoirs[lto_symbol]
+        def tile_flip_layout(layout):
+            if layout == "rowmajor":
+                return "colmajor"
+            elif layout == "colmajor":
+                return "rowmajor"
-        # otherwise compile LTO
-        lto_code = tempfile.NamedTemporaryFile(prefix="warp", delete=False)
-        result = warp.context.runtime.core.cuda_compile_dot(
-            lto_code.name.encode("utf-8"),
-            lto_symbol.encode("utf-8"),
-            0,
-            None,
-            None,
+        # generate the LTOs
+        #    C += A * B
+        (fun_forward, lto_forward) = warp.build.build_lto_dot(
+            M,
+            N,
+            K,
+            a.type.dtype,
+            b.type.dtype,
+            out.type.dtype,
+            a.type.layout,
+            b.type.layout,
+            out.type.layout,
             arch,
+            num_threads,
+            builder,
+        )
+        # adjA += adjC * B^T - Transpose ~= flipped layout
+        (fun_backward_A, lto_backward_A) = warp.build.build_lto_dot(
             M,
+            K,
             N,
+            out.type.dtype,
+            b.type.dtype,
+            a.type.dtype,
+            out.type.layout,
+            tile_flip_layout(b.type.layout),
+            a.type.layout,
+            arch,
+            num_threads,
+            builder,
+        )
+        # adjB += A^T * adjC - Transpose ~= flipped layout
+        (fun_backward_B, lto_backward_B) = warp.build.build_lto_dot(
             K,
-            a_prec,
-            b_prec,
-            c_prec,
-            element_type,
-            a_arrangement,
-            b_arrangement,
-            c_arrangement,
+            N,
+            M,
+            a.type.dtype,
+            out.type.dtype,
+            b.type.dtype,
+            tile_flip_layout(a.type.layout),
+            out.type.layout,
+            b.type.layout,
+            arch,
             num_threads,
+            builder,
         )
-        lto_code_path = Path(lto_code.name)
-        if not result:
-            lto_code.close()
-            if lto_code_path.exists():
-                lto_code_path.unlink()
-            raise RuntimeError("Failed to compile tile_matmul")
-        else:
-            with open(lto_code.name, "rb") as f:
-                lto_code_data = f.read()
-            lto_code.close()
-            lto_code_path.unlink()
-            builder.ltoirs[lto_symbol] = lto_code_data
-            builder.ltoirs_decl[lto_symbol] = (
-                f"void {lto_symbol}({c_dtype}, {a_dtype}*, {b_dtype}*, {c_dtype}, {c_dtype}*);"
-            )
-            return lto_symbol, lto_code_data
-    def tile_flip_layout(layout):
-        if layout == "rowmajor":
-            return "colmajor"
-        elif layout == "colmajor":
-            return "rowmajor"
-    #    C += A * B
-    (fun_forward, lto_forward) = make_function(
-        M, N, K, a.type.dtype, b.type.dtype, out.type.dtype, a.type.layout, b.type.layout, out.type.layout
-    )
-    # adjA += adjC * B^T - Transpose ~= flipped layout
-    (fun_backward_A, lto_backward_A) = make_function(
-        M,
-        K,
-        N,
-        out.type.dtype,
-        b.type.dtype,
-        a.type.dtype,
-        out.type.layout,
-        tile_flip_layout(b.type.layout),
-        a.type.layout,
-    )
-    # adjB += A^T * adjC - Transpose ~= flipped layout
-    (fun_backward_B, lto_backward_B) = make_function(
-        K,
-        N,
-        M,
-        a.type.dtype,
-        out.type.dtype,
-        b.type.dtype,
-        tile_flip_layout(a.type.layout),
-        out.type.layout,
-        b.type.layout,
-    )
-    return (
-        (
-            Var(fun_forward, str, False, True, False),
-            Var(fun_backward_A, str, False, True, False),
-            Var(fun_backward_B, str, False, True, False),
-            a,
-            b,
-            out,
-        ),
-        template_args,
-        [lto_forward, lto_backward_A, lto_backward_B],
-        0,
-    )
+        return (
+            (
+                Var(fun_forward, str, False, True, False),
+                Var(fun_backward_A, str, False, True, False),
+                Var(fun_backward_B, str, False, True, False),
+                a,
+                b,
+                out,
+            ),
+            template_args,
+            [lto_forward, lto_backward_A, lto_backward_B],
+            0,
+        )
 add_builtin(
@@ -6190,8 +6361,8 @@ add_builtin(
         "b": Tile(dtype=Any, shape=Any),
         "out": Tile(dtype=Any, shape=Any),
     },
-    value_func=tile_matmul_generic_value_func,
-    lto_dispatch_func=tile_matmul_generic_lto_dispatch_func,
+    value_func=tile_matmul_value_func,
+    lto_dispatch_func=tile_matmul_lto_dispatch_func,
     variadic=False,
     doc="""Computes the matrix product and accumulates ``out += a*b``.
@@ -6199,7 +6370,7 @@ add_builtin(
         * fp16, fp32, fp64 (real)
         * vec2h, vec2f, vec2d (complex)
-    All input and output tiles must have the same datatype. Tile data will be automatically be migrated
+    All input and output tiles must have the same datatype. Tile data will automatically be migrated
     to shared memory if necessary and will use TensorCore operations when available.
     :param a: A tile with ``shape=(M, K)``
@@ -6213,8 +6384,8 @@ add_builtin(
 add_builtin(
     "tile_matmul",
     input_types={"a": Tile(dtype=Any, shape=Any), "b": Tile(dtype=Any, shape=Any)},
-    value_func=tile_matmul_generic_value_func,
-    lto_dispatch_func=tile_matmul_generic_lto_dispatch_func,
+    value_func=tile_matmul_value_func,
+    lto_dispatch_func=tile_matmul_lto_dispatch_func,
     variadic=False,
     doc="""Computes the matrix product ``out = a*b``.
@@ -6222,7 +6393,7 @@ add_builtin(
         * fp16, fp32, fp64 (real)
         * vec2h, vec2f, vec2d (complex)
-    Both input tiles must have the same datatype. Tile data will be automatically be migrated
+    Both input tiles must have the same datatype. Tile data will automatically be migrated
     to shared memory if necessary and will use TensorCore operations when available.
     :param a: A tile with ``shape=(M, K)``
@@ -6294,59 +6465,29 @@ def tile_fft_generic_lto_dispatch_func(
     num_threads = options["block_dim"]
     arch = options["output_arch"]
     ept = size // num_threads
-    lto_symbol = f"fft_{size}_{ept}_{arch}_{direction}_{precision}"
-    # early out if LTO for this combination already exists for this module
-    if lto_symbol in builder.ltoirs:
-        return lto_symbol, builder.ltoirs[lto_symbol]
-    # otherwise compile LTO
-    lto_code = tempfile.NamedTemporaryFile(prefix="warp", delete=False)
-    shared_memory_size = ctypes.c_int(0)
-    result = warp.context.runtime.core.cuda_compile_fft(
-        lto_code.name.encode("utf-8"),
-        lto_symbol.encode("utf-8"),
-        0,
-        None,
-        None,
-        arch,
-        size,
-        ept,
-        dir,
-        precision,
-        ctypes.byref(shared_memory_size),
-    )
-    lto_code_path = Path(lto_code.name)
-    if not result:
-        lto_code.close()
-        if lto_code_path.exists():
-            lto_code_path.unlink()
-        raise RuntimeError("Failed to compile tile_fft")
-    with open(lto_code.name, "rb") as f:
-        lto_code_data = f.read()
-    lto_code.close()
-    lto_code_path.unlink()
-    builder.ltoirs[lto_symbol] = lto_code_data
-    shared_memory_bytes = Tile.round_up(shared_memory_size.value)
-    return (
-        (
-            Var(lto_symbol, str, False, True, False),
-            Var(dtype, str, False, True, False),
-            Var(str(shared_memory_bytes), str, False, True, False),
-            Var(str(batch), str, False, True, False),
-            Var(str(ept), str, False, True, False),
-            inout,
-        ),
-        [],
-        [lto_code_data],
-        shared_memory_bytes,
-    )
+    if arch is None or not warp.context.runtime.core.is_mathdx_enabled():
+        # CPU/no-MathDx dispatch
+        return ([], [], [], 0)
+    else:
+        # generate the LTO
+        lto_symbol, lto_code_data, shared_memory_bytes = warp.build.build_lto_fft(
+            arch, size, ept, direction, dir, precision, builder
+        )
+        return (
+            (
+                Var(lto_symbol, str, False, True, False),
+                Var(dtype, str, False, True, False),
+                Var(str(shared_memory_bytes), str, False, True, False),
+                Var(str(batch), str, False, True, False),
+                Var(str(ept), str, False, True, False),
+                inout,
+            ),
+            [],
+            [lto_code_data],
+            shared_memory_bytes,
+        )
 add_builtin(
@@ -6408,7 +6549,7 @@ def tile_cholesky_generic_value_func(arg_types, arg_values):
         raise TypeError(f"tile_cholesky() argument must be a tile, got {a!r}")
     if len(a.shape) != 2:
-        raise ValueError("tile_cholesky() argumust must be a 2D tile")
+        raise ValueError("tile_cholesky() argument must be a 2D tile")
     if a.shape[0] != a.shape[1]:
         raise ValueError("tile_cholesky() argument must be square")
@@ -6449,57 +6590,36 @@ def tile_cholesky_generic_lto_dispatch_func(
     if out.type.shape[0] != M or out.type.shape[1] != M:
         raise ValueError("tile_cholesky() output tile must be square")
-    num_threads = options["block_dim"]
-    arch = options["output_arch"]
-    lto_symbol = f"potrf_{M}_{N}_{arch}_{precision_enum}"
-    # early out if LTO for this combination already exists for this module
-    if lto_symbol in builder.ltoirs:
-        return lto_symbol, builder.ltoirs[lto_symbol]
-    # otherwise compile LTO
-    lto_code = tempfile.NamedTemporaryFile(prefix="warp", delete=False)
-    universal_fatbin_code = tempfile.NamedTemporaryFile(prefix="warp", delete=False)
+    solver = "potrf"
+    solver_enum = cusolver_function_map[solver]
-    # cuSOLVERDx only support col-major input/outputs,
+    # cuSOLVERDx only supports col-major input/outputs,
     # so we use upper to mimic a row-major input
-    result = warp.context.runtime.core.cuda_compile_solver(
-        universal_fatbin_code.name.encode("utf-8"),
-        lto_code.name.encode("utf-8"),
-        lto_symbol.encode("utf-8"),
-        0,
-        None,
-        None,
-        arch,
-        M,
-        N,
-        cusolver_function_map["potrf"],
-        precision_enum,
-        cusolver_fill_mode_map["upper"],
-        num_threads,
-    )
+    fill_mode = cusolver_fill_mode_map["upper"]
-    if not result:
-        for f in [lto_code, universal_fatbin_code]:
-            f.close()
-            if Path(f.name).exists():
-                Path(f.name).unlink()
-        raise RuntimeError("Failed to compile tile_cholesky")
+    arch = options["output_arch"]
+    num_threads = options["block_dim"]
+    parameter_list = f"({dtype}*, unsigned)"
+    if arch is None or not warp.context.runtime.core.is_mathdx_enabled():
+        # CPU/no-MathDx dispatch
+        return ((0, a, out), [], [], 0)
     else:
-        with open(lto_code.name, "rb") as f:
-            lto_code_data = f.read()
-        with open(universal_fatbin_code.name, "rb") as f:
-            universal_fatbin_code_data = f.read()
-        for f in [lto_code, universal_fatbin_code]:
-            f.close()
-            Path(f.name).unlink()
-    builder.ltoirs[lto_symbol] = lto_code_data
-    builder.ltoirs_decl[lto_symbol] = f"void {lto_symbol}({dtype}*, unsigned);"
-    builder.fatbins["cholesky"] = universal_fatbin_code_data
+        # generate the LTO
+        lto_symbol, lto_code_data = warp.build.build_lto_solver(
+            M,
+            N,
+            solver,
+            solver_enum,
+            fill_mode,
+            arch,
+            precision_enum,
+            num_threads,
+            parameter_list,
+            builder,
+        )
-    return ((Var(lto_symbol, str, False, True, False), a, out), [], [lto_code_data], 0)
+        return ((Var(lto_symbol, str, False, True, False), a, out), [], [lto_code_data], 0)
 add_builtin(
@@ -6593,57 +6713,36 @@ def tile_cholesky_solve_generic_lto_dispatch_func(
             f"got {y.type.shape[0]} elements in output and {M} rows in 'L'"
         )
-    num_threads = options["block_dim"]
-    arch = options["output_arch"]
-    lto_symbol = f"potrs_{M}_{N}_{arch}_{precision_enum}"
-    # early out if LTO for this combination already exists for this module
-    if lto_symbol in builder.ltoirs:
-        return lto_symbol, builder.ltoirs[lto_symbol]
-    # otherwise compile LTO
-    lto_code = tempfile.NamedTemporaryFile(prefix="warp", delete=False)
-    universal_fatbin_code = tempfile.NamedTemporaryFile(prefix="warp", delete=False)
+    solver = "potrs"
+    solver_enum = cusolver_function_map[solver]
-    # cuSOLVERDx only support col-major input/outputs,
+    # cuSOLVERDx only supports col-major input/outputs,
     # so we use upper to mimic a row-major input
-    result = warp.context.runtime.core.cuda_compile_solver(
-        universal_fatbin_code.name.encode("utf-8"),
-        lto_code.name.encode("utf-8"),
-        lto_symbol.encode("utf-8"),
-        0,
-        None,
-        None,
-        arch,
-        M,
-        N,
-        cusolver_function_map["potrs"],
-        precision_enum,
-        cusolver_fill_mode_map["upper"],
-        num_threads,
-    )
+    fill_mode = cusolver_fill_mode_map["upper"]
-    if not result:
-        for f in [lto_code, universal_fatbin_code]:
-            f.close()
-            if Path(f.name).exists():
-                Path(f.name).unlink()
-        raise RuntimeError("Failed to compile tile_cholesky_solve")
+    arch = options["output_arch"]
+    num_threads = options["block_dim"]
+    parameter_list = f"({dtype}*, {dtype}*)"
+    if arch is None or not warp.context.runtime.core.is_mathdx_enabled():
+        # CPU/no-MathDx dispatch
+        return ((0, L, x, y), [], [], 0)
     else:
-        with open(lto_code.name, "rb") as f:
-            lto_code_data = f.read()
-        with open(universal_fatbin_code.name, "rb") as f:
-            universal_fatbin_code_data = f.read()
-        for f in [lto_code, universal_fatbin_code]:
-            f.close()
-            Path(f.name).unlink()
-    builder.ltoirs[lto_symbol] = lto_code_data
-    builder.ltoirs_decl[lto_symbol] = f"void {lto_symbol}({dtype}*, {dtype}*);"
-    builder.fatbins["cholesky"] = universal_fatbin_code_data
-    return ((Var(lto_symbol, str, False, True, False), L, x, y), [], [lto_code_data], 0)
+        # generate the LTO
+        lto_symbol, lto_code_data = warp.build.build_lto_solver(
+            M,
+            N,
+            solver,
+            solver_enum,
+            fill_mode,
+            arch,
+            precision_enum,
+            num_threads,
+            parameter_list,
+            builder,
+        )
+        return ((Var(lto_symbol, str, False, True, False), L, x, y), [], [lto_code_data], 0)
 add_builtin(