PyPI - warp-lang - Versions diffs - 1.9.0__py3-none-win_amd64.whl → 1.10.0rc2__py3-none-win_amd64.whl - Mend

warp-lang 1.9.0__py3-none-win_amd64.whl → 1.10.0rc2__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (350)

warp/__init__.py +301 -287
warp/__init__.pyi +2220 -313
warp/_src/__init__.py +14 -0
warp/_src/autograd.py +1075 -0
warp/_src/build.py +618 -0
warp/_src/build_dll.py +640 -0
warp/{builtins.py → _src/builtins.py} +1497 -226
warp/_src/codegen.py +4359 -0
warp/{config.py → _src/config.py} +178 -169
warp/_src/constants.py +57 -0
warp/_src/context.py +8294 -0
warp/_src/dlpack.py +462 -0
warp/_src/fabric.py +355 -0
warp/_src/fem/__init__.py +14 -0
warp/_src/fem/adaptivity.py +508 -0
warp/_src/fem/cache.py +687 -0
warp/_src/fem/dirichlet.py +188 -0
warp/{fem → _src/fem}/domain.py +40 -30
warp/_src/fem/field/__init__.py +131 -0
warp/_src/fem/field/field.py +701 -0
warp/{fem → _src/fem}/field/nodal_field.py +30 -15
warp/{fem → _src/fem}/field/restriction.py +1 -1
warp/{fem → _src/fem}/field/virtual.py +53 -27
warp/_src/fem/geometry/__init__.py +32 -0
warp/{fem → _src/fem}/geometry/adaptive_nanogrid.py +77 -163
warp/_src/fem/geometry/closest_point.py +97 -0
warp/{fem → _src/fem}/geometry/deformed_geometry.py +14 -22
warp/{fem → _src/fem}/geometry/element.py +32 -10
warp/{fem → _src/fem}/geometry/geometry.py +48 -20
warp/{fem → _src/fem}/geometry/grid_2d.py +12 -23
warp/{fem → _src/fem}/geometry/grid_3d.py +12 -23
warp/{fem → _src/fem}/geometry/hexmesh.py +40 -63
warp/{fem → _src/fem}/geometry/nanogrid.py +255 -248
warp/{fem → _src/fem}/geometry/partition.py +121 -63
warp/{fem → _src/fem}/geometry/quadmesh.py +26 -45
warp/{fem → _src/fem}/geometry/tetmesh.py +40 -63
warp/{fem → _src/fem}/geometry/trimesh.py +26 -45
warp/{fem → _src/fem}/integrate.py +164 -158
warp/_src/fem/linalg.py +383 -0
warp/_src/fem/operator.py +396 -0
warp/_src/fem/polynomial.py +229 -0
warp/{fem → _src/fem}/quadrature/pic_quadrature.py +15 -20
warp/{fem → _src/fem}/quadrature/quadrature.py +95 -47
warp/_src/fem/space/__init__.py +248 -0
warp/{fem → _src/fem}/space/basis_function_space.py +20 -11
warp/_src/fem/space/basis_space.py +679 -0
warp/{fem → _src/fem}/space/dof_mapper.py +3 -3
warp/{fem → _src/fem}/space/function_space.py +14 -13
warp/{fem → _src/fem}/space/grid_2d_function_space.py +4 -7
warp/{fem → _src/fem}/space/grid_3d_function_space.py +4 -4
warp/{fem → _src/fem}/space/hexmesh_function_space.py +4 -10
warp/{fem → _src/fem}/space/nanogrid_function_space.py +3 -9
warp/{fem → _src/fem}/space/partition.py +117 -60
warp/{fem → _src/fem}/space/quadmesh_function_space.py +4 -10
warp/{fem → _src/fem}/space/restriction.py +66 -33
warp/_src/fem/space/shape/__init__.py +152 -0
warp/{fem → _src/fem}/space/shape/cube_shape_function.py +9 -9
warp/{fem → _src/fem}/space/shape/shape_function.py +8 -9
warp/{fem → _src/fem}/space/shape/square_shape_function.py +6 -6
warp/{fem → _src/fem}/space/shape/tet_shape_function.py +3 -3
warp/{fem → _src/fem}/space/shape/triangle_shape_function.py +3 -3
warp/{fem → _src/fem}/space/tetmesh_function_space.py +3 -9
warp/_src/fem/space/topology.py +459 -0
warp/{fem → _src/fem}/space/trimesh_function_space.py +3 -9
warp/_src/fem/types.py +112 -0
warp/_src/fem/utils.py +486 -0
warp/_src/jax.py +186 -0
warp/_src/jax_experimental/__init__.py +14 -0
warp/_src/jax_experimental/custom_call.py +387 -0
warp/_src/jax_experimental/ffi.py +1284 -0
warp/_src/jax_experimental/xla_ffi.py +656 -0
warp/_src/marching_cubes.py +708 -0
warp/_src/math.py +414 -0
warp/_src/optim/__init__.py +14 -0
warp/_src/optim/adam.py +163 -0
warp/_src/optim/linear.py +1606 -0
warp/_src/optim/sgd.py +112 -0
warp/_src/paddle.py +406 -0
warp/_src/render/__init__.py +14 -0
warp/_src/render/imgui_manager.py +289 -0
warp/_src/render/render_opengl.py +3636 -0
warp/_src/render/render_usd.py +937 -0
warp/_src/render/utils.py +160 -0
warp/_src/sparse.py +2716 -0
warp/_src/tape.py +1206 -0
warp/{thirdparty → _src/thirdparty}/unittest_parallel.py +9 -2
warp/_src/torch.py +391 -0
warp/_src/types.py +5870 -0
warp/_src/utils.py +1693 -0
warp/autograd.py +12 -1054
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +8 -588
warp/build_dll.py +6 -471
warp/codegen.py +6 -4246
warp/constants.py +6 -39
warp/context.py +12 -7851
warp/dlpack.py +6 -444
warp/examples/distributed/example_jacobi_mpi.py +4 -5
warp/examples/fem/example_adaptive_grid.py +1 -1
warp/examples/fem/example_apic_fluid.py +1 -1
warp/examples/fem/example_burgers.py +8 -8
warp/examples/fem/example_diffusion.py +1 -1
warp/examples/fem/example_distortion_energy.py +1 -1
warp/examples/fem/example_mixed_elasticity.py +2 -2
warp/examples/fem/example_navier_stokes.py +1 -1
warp/examples/fem/example_nonconforming_contact.py +7 -7
warp/examples/fem/example_stokes.py +1 -1
warp/examples/fem/example_stokes_transfer.py +1 -1
warp/examples/fem/utils.py +2 -2
warp/examples/interop/example_jax_callable.py +1 -1
warp/examples/interop/example_jax_ffi_callback.py +1 -1
warp/examples/interop/example_jax_kernel.py +3 -2
warp/examples/tile/example_tile_mcgp.py +191 -0
warp/fabric.py +6 -337
warp/fem/__init__.py +159 -97
warp/fem/adaptivity.py +7 -489
warp/fem/cache.py +9 -648
warp/fem/dirichlet.py +6 -184
warp/fem/field/__init__.py +8 -109
warp/fem/field/field.py +7 -652
warp/fem/geometry/__init__.py +7 -18
warp/fem/geometry/closest_point.py +11 -77
warp/fem/linalg.py +18 -366
warp/fem/operator.py +11 -369
warp/fem/polynomial.py +9 -209
warp/fem/space/__init__.py +5 -211
warp/fem/space/basis_space.py +6 -662
warp/fem/space/shape/__init__.py +41 -118
warp/fem/space/topology.py +6 -437
warp/fem/types.py +6 -81
warp/fem/utils.py +11 -444
warp/jax.py +8 -165
warp/jax_experimental/__init__.py +14 -1
warp/jax_experimental/custom_call.py +8 -342
warp/jax_experimental/ffi.py +17 -853
warp/jax_experimental/xla_ffi.py +5 -596
warp/marching_cubes.py +5 -689
warp/math.py +16 -393
warp/native/array.h +385 -37
warp/native/builtin.h +316 -39
warp/native/bvh.cpp +43 -9
warp/native/bvh.cu +62 -27
warp/native/bvh.h +310 -309
warp/native/clang/clang.cpp +102 -97
warp/native/coloring.cpp +0 -1
warp/native/crt.h +208 -0
warp/native/exports.h +156 -0
warp/native/hashgrid.cu +2 -0
warp/native/intersect.h +24 -1
warp/native/intersect_tri.h +44 -35
warp/native/mat.h +1456 -276
warp/native/mesh.cpp +4 -4
warp/native/mesh.cu +4 -2
warp/native/mesh.h +176 -61
warp/native/quat.h +0 -52
warp/native/scan.cu +2 -0
warp/native/sort.cu +22 -13
warp/native/sort.h +2 -0
warp/native/sparse.cu +7 -3
warp/native/spatial.h +12 -0
warp/native/tile.h +837 -70
warp/native/tile_radix_sort.h +1 -1
warp/native/tile_reduce.h +394 -46
warp/native/tile_scan.h +4 -4
warp/native/vec.h +469 -53
warp/native/version.h +23 -0
warp/native/volume.cpp +1 -1
warp/native/volume.cu +1 -0
warp/native/volume.h +1 -1
warp/native/volume_builder.cu +2 -0
warp/native/warp.cpp +60 -32
warp/native/warp.cu +313 -201
warp/native/warp.h +14 -11
warp/optim/__init__.py +6 -3
warp/optim/adam.py +6 -145
warp/optim/linear.py +14 -1585
warp/optim/sgd.py +6 -94
warp/paddle.py +6 -388
warp/render/__init__.py +8 -4
warp/render/imgui_manager.py +7 -267
warp/render/render_opengl.py +6 -3616
warp/render/render_usd.py +6 -918
warp/render/utils.py +6 -142
warp/sparse.py +37 -2563
warp/tape.py +6 -1188
warp/tests/__main__.py +1 -1
warp/tests/cuda/test_async.py +4 -4
warp/tests/cuda/test_conditional_captures.py +1 -1
warp/tests/cuda/test_multigpu.py +1 -1
warp/tests/cuda/test_streams.py +58 -1
warp/tests/geometry/test_bvh.py +157 -22
warp/tests/geometry/test_hash_grid.py +38 -0
warp/tests/geometry/test_marching_cubes.py +0 -1
warp/tests/geometry/test_mesh.py +5 -3
warp/tests/geometry/test_mesh_query_aabb.py +5 -12
warp/tests/geometry/test_mesh_query_point.py +5 -2
warp/tests/geometry/test_mesh_query_ray.py +15 -3
warp/tests/geometry/test_volume_write.py +5 -5
warp/tests/interop/test_dlpack.py +14 -14
warp/tests/interop/test_jax.py +1382 -79
warp/tests/interop/test_paddle.py +1 -1
warp/tests/test_adam.py +0 -1
warp/tests/test_arithmetic.py +9 -9
warp/tests/test_array.py +529 -100
warp/tests/test_array_reduce.py +3 -3
warp/tests/test_atomic.py +12 -8
warp/tests/test_atomic_bitwise.py +209 -0
warp/tests/test_atomic_cas.py +4 -4
warp/tests/test_bool.py +2 -2
warp/tests/test_builtins_resolution.py +5 -571
warp/tests/test_codegen.py +34 -15
warp/tests/test_conditional.py +1 -1
warp/tests/test_context.py +6 -6
warp/tests/test_copy.py +242 -161
warp/tests/test_ctypes.py +3 -3
warp/tests/test_devices.py +24 -2
warp/tests/test_examples.py +16 -84
warp/tests/test_fabricarray.py +35 -35
warp/tests/test_fast_math.py +0 -2
warp/tests/test_fem.py +60 -14
warp/tests/test_fixedarray.py +3 -3
warp/tests/test_func.py +8 -5
warp/tests/test_generics.py +1 -1
warp/tests/test_indexedarray.py +24 -24
warp/tests/test_intersect.py +39 -9
warp/tests/test_large.py +1 -1
warp/tests/test_lerp.py +3 -1
warp/tests/test_linear_solvers.py +1 -1
warp/tests/test_map.py +49 -4
warp/tests/test_mat.py +52 -62
warp/tests/test_mat_constructors.py +4 -5
warp/tests/test_mat_lite.py +1 -1
warp/tests/test_mat_scalar_ops.py +121 -121
warp/tests/test_math.py +34 -0
warp/tests/test_module_aot.py +4 -4
warp/tests/test_modules_lite.py +28 -2
warp/tests/test_print.py +11 -11
warp/tests/test_quat.py +93 -58
warp/tests/test_runlength_encode.py +1 -1
warp/tests/test_scalar_ops.py +38 -10
warp/tests/test_smoothstep.py +1 -1
warp/tests/test_sparse.py +126 -15
warp/tests/test_spatial.py +105 -87
warp/tests/test_special_values.py +6 -6
warp/tests/test_static.py +7 -7
warp/tests/test_struct.py +13 -2
warp/tests/test_triangle_closest_point.py +48 -1
warp/tests/test_tuple.py +96 -0
warp/tests/test_types.py +82 -9
warp/tests/test_utils.py +52 -52
warp/tests/test_vec.py +29 -29
warp/tests/test_vec_constructors.py +5 -5
warp/tests/test_vec_scalar_ops.py +97 -97
warp/tests/test_version.py +75 -0
warp/tests/tile/test_tile.py +239 -0
warp/tests/tile/test_tile_atomic_bitwise.py +403 -0
warp/tests/tile/test_tile_cholesky.py +7 -4
warp/tests/tile/test_tile_load.py +26 -2
warp/tests/tile/test_tile_mathdx.py +3 -3
warp/tests/tile/test_tile_matmul.py +1 -1
warp/tests/tile/test_tile_mlp.py +2 -4
warp/tests/tile/test_tile_reduce.py +214 -13
warp/tests/unittest_suites.py +6 -14
warp/tests/unittest_utils.py +10 -9
warp/tests/walkthrough_debug.py +3 -1
warp/torch.py +6 -373
warp/types.py +29 -5750
warp/utils.py +10 -1659
{warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/METADATA +47 -103
warp_lang-1.10.0rc2.dist-info/RECORD +468 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/Gaia-LICENSE.txt +6 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/appdirs-LICENSE.txt +22 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/asset_pixel_jpg-LICENSE.txt +3 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/cuda-LICENSE.txt +1582 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/dlpack-LICENSE.txt +201 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/fp16-LICENSE.txt +28 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/libmathdx-LICENSE.txt +220 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/llvm-LICENSE.txt +279 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/moller-LICENSE.txt +16 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/nanovdb-LICENSE.txt +2 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/nvrtc-LICENSE.txt +1592 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/svd-LICENSE.txt +23 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/unittest_parallel-LICENSE.txt +21 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/usd-LICENSE.txt +213 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/windingnumber-LICENSE.txt +21 -0
warp/examples/assets/cartpole.urdf +0 -110
warp/examples/assets/crazyflie.usd +0 -0
warp/examples/assets/nv_ant.xml +0 -92
warp/examples/assets/nv_humanoid.xml +0 -183
warp/examples/assets/quadruped.urdf +0 -268
warp/examples/optim/example_bounce.py +0 -266
warp/examples/optim/example_cloth_throw.py +0 -228
warp/examples/optim/example_drone.py +0 -870
warp/examples/optim/example_inverse_kinematics.py +0 -182
warp/examples/optim/example_inverse_kinematics_torch.py +0 -191
warp/examples/optim/example_softbody_properties.py +0 -400
warp/examples/optim/example_spring_cage.py +0 -245
warp/examples/optim/example_trajectory.py +0 -227
warp/examples/sim/example_cartpole.py +0 -143
warp/examples/sim/example_cloth.py +0 -225
warp/examples/sim/example_cloth_self_contact.py +0 -316
warp/examples/sim/example_granular.py +0 -130
warp/examples/sim/example_granular_collision_sdf.py +0 -202
warp/examples/sim/example_jacobian_ik.py +0 -244
warp/examples/sim/example_particle_chain.py +0 -124
warp/examples/sim/example_quadruped.py +0 -203
warp/examples/sim/example_rigid_chain.py +0 -203
warp/examples/sim/example_rigid_contact.py +0 -195
warp/examples/sim/example_rigid_force.py +0 -133
warp/examples/sim/example_rigid_gyroscopic.py +0 -115
warp/examples/sim/example_rigid_soft_contact.py +0 -140
warp/examples/sim/example_soft_body.py +0 -196
warp/examples/tile/example_tile_walker.py +0 -327
warp/sim/__init__.py +0 -74
warp/sim/articulation.py +0 -793
warp/sim/collide.py +0 -2570
warp/sim/graph_coloring.py +0 -307
warp/sim/import_mjcf.py +0 -791
warp/sim/import_snu.py +0 -227
warp/sim/import_urdf.py +0 -579
warp/sim/import_usd.py +0 -898
warp/sim/inertia.py +0 -357
warp/sim/integrator.py +0 -245
warp/sim/integrator_euler.py +0 -2000
warp/sim/integrator_featherstone.py +0 -2101
warp/sim/integrator_vbd.py +0 -2487
warp/sim/integrator_xpbd.py +0 -3295
warp/sim/model.py +0 -4821
warp/sim/particles.py +0 -121
warp/sim/render.py +0 -431
warp/sim/utils.py +0 -431
warp/tests/sim/disabled_kinematics.py +0 -244
warp/tests/sim/test_cloth.py +0 -863
warp/tests/sim/test_collision.py +0 -743
warp/tests/sim/test_coloring.py +0 -347
warp/tests/sim/test_inertia.py +0 -161
warp/tests/sim/test_model.py +0 -226
warp/tests/sim/test_sim_grad.py +0 -287
warp/tests/sim/test_sim_grad_bounce_linear.py +0 -212
warp/tests/sim/test_sim_kinematics.py +0 -98
warp/thirdparty/__init__.py +0 -0
warp_lang-1.9.0.dist-info/RECORD +0 -456
/warp/{fem → _src/fem}/quadrature/__init__.py +0 -0
/warp/{tests/sim → _src/thirdparty}/__init__.py +0 -0
/warp/{thirdparty → _src/thirdparty}/appdirs.py +0 -0
/warp/{thirdparty → _src/thirdparty}/dlpack.py +0 -0
{warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/WHEEL +0 -0
{warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/licenses/LICENSE.md +0 -0
{warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/top_level.txt +0 -0

warp/{builtins.py → _src/builtins.py} RENAMED

@@ -20,11 +20,11 @@ import functools
 import math
 from typing import Any, Callable, Mapping, Sequence
-import warp.build
-import warp.context
-import warp.utils
-from warp.codegen import Reference, Var, get_arg_value, strip_reference
-from warp.types import *
+import warp._src.build
+import warp._src.context
+import warp._src.utils
+from warp._src.codegen import Reference, Var, get_arg_value, strip_reference
+from warp._src.types import *
 from .context import add_builtin
@@ -61,11 +61,11 @@ def sametypes_create_value_func(default: TypeVar):
 def extract_tuple(arg, as_constant=False):
     if isinstance(arg, Var):
-        if isinstance(arg.type, warp.types.tuple_t):
+        if isinstance(arg.type, warp._src.types.tuple_t):
             out = arg.type.values
         else:
             out = (arg,)
-    elif isinstance(arg, warp.types.tuple_t):
+    elif isinstance(arg, warp._src.types.tuple_t):
         out = arg.values
     elif not isinstance(arg, Sequence):
         out = (arg,)
@@ -82,7 +82,7 @@ def static_len_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str
     if arg_types is None:
         return int
-    length = warp.types.type_length(arg_types["a"])
+    length = warp._src.types.type_length(arg_types["a"])
     return Var(None, type=int, constant=length)
@@ -126,6 +126,7 @@ add_builtin(
     value_func=sametypes_create_value_func(Scalar),
     doc="Return -1 if ``x`` < 0, return 1 otherwise.",
     group="Scalar Math",
+    is_differentiable=False,
 )
 add_builtin(
@@ -134,6 +135,7 @@ add_builtin(
     value_func=sametypes_create_value_func(Scalar),
     doc="Return 1.0 if ``x`` < 0.0, return 0.0 otherwise.",
     group="Scalar Math",
+    is_differentiable=False,
 )
 add_builtin(
     "nonzero",
@@ -141,6 +143,7 @@ add_builtin(
     value_func=sametypes_create_value_func(Scalar),
     doc="Return 1.0 if ``x`` is not equal to zero, return 0.0 otherwise.",
     group="Scalar Math",
+    is_differentiable=False,
 )
 add_builtin(
@@ -282,7 +285,36 @@ add_builtin(
     group="Scalar Math",
     require_original_output_arg=True,
 )
+add_builtin(
+    "erf",
+    input_types={"x": Float},
+    value_func=sametypes_create_value_func(Float),
+    doc="Return the error function of ``x``.",
+    group="Scalar Math",
+)
+add_builtin(
+    "erfc",
+    input_types={"x": Float},
+    value_func=sametypes_create_value_func(Float),
+    doc="Return the complementary error function of ``x``.",
+    group="Scalar Math",
+)
+add_builtin(
+    "erfinv",
+    input_types={"x": Float},
+    value_func=sametypes_create_value_func(Float),
+    doc="Return the inverse error function of ``x``.",
+    group="Scalar Math",
+    require_original_output_arg=True,
+)
+add_builtin(
+    "erfcinv",
+    input_types={"x": Float},
+    value_func=sametypes_create_value_func(Float),
+    doc="Return the inverse complementary error function of ``x``.",
+    group="Scalar Math",
+    require_original_output_arg=True,
+)
 add_builtin(
     "round",
     input_types={"x": Float},
@@ -292,6 +324,7 @@ add_builtin(
     This is the most intuitive form of rounding in the colloquial sense, but can be slower than other options like :func:`warp.rint()`.
     Differs from :func:`numpy.round()`, which behaves the same way as :func:`numpy.rint()`.""",
+    is_differentiable=False,
 )
 add_builtin(
@@ -302,6 +335,7 @@ add_builtin(
     doc="""Return the nearest integer value to ``x``, rounding halfway cases to nearest even integer.
     It is generally faster than :func:`warp.round()`. Equivalent to :func:`numpy.rint()`.""",
+    is_differentiable=False,
 )
 add_builtin(
@@ -314,6 +348,7 @@ add_builtin(
     In other words, it discards the fractional part of ``x``.
     It is similar to casting ``float(int(a))``, but preserves the negative sign when ``x`` is in the range [-0.0, -1.0).
     Equivalent to :func:`numpy.trunc()` and :func:`numpy.fix()`.""",
+    is_differentiable=False,
 )
 add_builtin(
@@ -322,6 +357,7 @@ add_builtin(
     value_func=sametypes_create_value_func(Float),
     group="Scalar Math",
     doc="""Return the largest integer that is less than or equal to ``x``.""",
+    is_differentiable=False,
 )
 add_builtin(
@@ -330,6 +366,7 @@ add_builtin(
     value_func=sametypes_create_value_func(Float),
     group="Scalar Math",
     doc="""Return the smallest integer that is greater than or equal to ``x``.""",
+    is_differentiable=False,
 )
 add_builtin(
@@ -340,6 +377,7 @@ add_builtin(
     doc="""Retrieve the fractional part of ``x``.
     In other words, it discards the integer part of ``x`` and is equivalent to ``x - trunc(x)``.""",
+    is_differentiable=False,
 )
 add_builtin(
@@ -348,6 +386,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Scalar Math",
     doc="""Return ``True`` if ``a`` is a finite number, otherwise return ``False``.""",
+    is_differentiable=False,
 )
 add_builtin(
     "isfinite",
@@ -355,6 +394,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if all elements of the vector ``a`` are finite, otherwise return ``False``.",
+    is_differentiable=False,
 )
 add_builtin(
     "isfinite",
@@ -362,6 +402,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if all elements of the quaternion ``a`` are finite, otherwise return ``False``.",
+    is_differentiable=False,
 )
 add_builtin(
     "isfinite",
@@ -369,6 +410,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if all elements of the matrix ``a`` are finite, otherwise return ``False``.",
+    is_differentiable=False,
 )
 add_builtin(
@@ -377,6 +419,7 @@ add_builtin(
     value_type=builtins.bool,
     doc="Return ``True`` if ``a`` is NaN, otherwise return ``False``.",
     group="Scalar Math",
+    is_differentiable=False,
 )
 add_builtin(
     "isnan",
@@ -384,6 +427,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if any element of the vector ``a`` is NaN, otherwise return ``False``.",
+    is_differentiable=False,
 )
 add_builtin(
     "isnan",
@@ -391,6 +435,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if any element of the quaternion ``a`` is NaN, otherwise return ``False``.",
+    is_differentiable=False,
 )
 add_builtin(
     "isnan",
@@ -398,6 +443,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if any element of the matrix ``a`` is NaN, otherwise return ``False``.",
+    is_differentiable=False,
 )
 add_builtin(
@@ -406,6 +452,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Scalar Math",
     doc="""Return ``True`` if ``a`` is positive or negative infinity, otherwise return ``False``.""",
+    is_differentiable=False,
 )
 add_builtin(
     "isinf",
@@ -413,6 +460,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if any element of the vector ``a`` is positive or negative infinity, otherwise return ``False``.",
+    is_differentiable=False,
 )
 add_builtin(
     "isinf",
@@ -420,6 +468,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if any element of the quaternion ``a`` is positive or negative infinity, otherwise return ``False``.",
+    is_differentiable=False,
 )
 add_builtin(
     "isinf",
@@ -427,6 +476,7 @@ add_builtin(
     value_type=builtins.bool,
     group="Vector Math",
     doc="Return ``True`` if any element of the matrix ``a`` is positive or negative infinity, otherwise return ``False``.",
+    is_differentiable=False,
 )
@@ -534,7 +584,7 @@ add_builtin(
     value_func=lambda arg_types, arg_values: warp.uint32,
     doc="Return the index of the minimum element of a vector ``a``.",
     group="Vector Math",
-    missing_grad=True,
+    is_differentiable=False,
 )
 add_builtin(
     "argmax",
@@ -542,7 +592,7 @@ add_builtin(
     value_func=lambda arg_types, arg_values: warp.uint32,
     doc="Return the index of the maximum element of a vector ``a``.",
     group="Vector Math",
-    missing_grad=True,
+    is_differentiable=False,
 )
 add_builtin(
@@ -867,7 +917,7 @@ def vector_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, An
             if dtype is None:
                 dtype = value_type
-            elif not warp.types.scalars_equal(value_type, dtype):
+            elif not warp._src.types.scalars_equal(value_type, dtype):
                 raise RuntimeError(
                     f"the value used to fill this vector is expected to be of the type `{dtype.__name__}`"
                 )
@@ -888,7 +938,7 @@ def vector_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, An
         if dtype is None:
             dtype = value_type
-        elif not warp.types.scalars_equal(value_type, dtype):
+        elif not warp._src.types.scalars_equal(value_type, dtype):
             raise RuntimeError(
                 f"all values used to initialize this vector are expected to be of the type `{dtype.__name__}`"
             )
@@ -971,7 +1021,7 @@ def matrix_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, An
             if dtype is None:
                 dtype = value_type
-            elif not warp.types.scalars_equal(value_type, dtype):
+            elif not warp._src.types.scalars_equal(value_type, dtype):
                 raise RuntimeError(
                     f"the value used to fill this matrix is expected to be of the type `{dtype.__name__}`"
                 )
@@ -981,7 +1031,7 @@ def matrix_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, An
             raise RuntimeError("the `shape` argument must be specified when initializing a matrix by value")
         if all(type_is_vector(x) for x in variadic_arg_types):
-            warp.utils.warn(
+            warp._src.utils.warn(
                 "the built-in `wp.matrix()` won't support taking column vectors as input "
                 "in the future. Use `wp.matrix_from_rows()` or `wp.matrix_from_cols()` instead.",
                 DeprecationWarning,
@@ -1010,7 +1060,7 @@ def matrix_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, An
         if dtype is None:
             dtype = value_type
-        elif not warp.types.scalars_equal(value_type, dtype):
+        elif not warp._src.types.scalars_equal(value_type, dtype):
             raise RuntimeError(
                 f"all values used to initialize this matrix are expected to be of the type `{dtype.__name__}`"
             )
@@ -1182,48 +1232,18 @@ add_builtin(
     doc="Create an identity matrix with shape=(n,n) with the type given by ``dtype``.",
     group="Vector Math",
     export=False,
+    is_differentiable=False,
 )
 def matrix_transform_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
-    warp.utils.warn(
-        "the built-in `wp.matrix()` function to construct a 4x4 matrix from a 3D position, quaternion, "
-        "and 3D scale vector will be deprecated in favor of `wp.transform_compose()`.",
-        DeprecationWarning,
-    )
     if arg_types is None:
         return matrix(shape=(4, 4), dtype=Float)
-    dtype = arg_values.get("dtype", None)
-    value_arg_types = tuple(v for k, v in arg_types.items() if k != "dtype")
-    try:
-        value_type = scalar_infer_type(value_arg_types)
-    except RuntimeError:
-        raise RuntimeError(
-            "all values given when constructing a transformation matrix must have the same type"
-        ) from None
-    if dtype is None:
-        dtype = value_type
-    elif not warp.types.scalars_equal(value_type, dtype):
-        raise RuntimeError(
-            f"all values used to initialize this transformation matrix are expected to be of the type `{dtype.__name__}`"
-        )
-    return matrix(shape=(4, 4), dtype=dtype)
-def matrix_transform_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
-    # We're in the codegen stage where we emit the code calling the built-in.
-    # Further validate the given argument values if needed and map them
-    # to the underlying C++ function's runtime and template params.
-    dtype = return_type._wp_scalar_type_
-    func_args = tuple(v for k, v in args.items() if k != "dtype")
-    template_args = (4, 4, dtype)
-    return (func_args, template_args)
+    raise RuntimeError(
+        "the built-in `wp.matrix()` to construct a 4x4 matrix from a 3D position, quaternion, "
+        "and 3D scale vector has been removed in favor of `wp.transform_compose()`."
+    )
 add_builtin(
@@ -1237,13 +1257,14 @@ add_builtin(
     defaults={"dtype": None},
     value_func=matrix_transform_value_func,
     export_func=lambda input_types: {k: v for k, v in input_types.items() if k != "dtype"},
-    dispatch_func=matrix_transform_dispatch_func,
     native_func="mat_t",
     doc="""Construct a 4x4 transformation matrix that applies the transformations as
     Translation(pos)*Rotation(rot)*Scaling(scale) when applied to column vectors, i.e.: y = (TRS)*x
-    .. warning::
-       This function has been deprecated in favor of :func:`warp.math.transform_compose()`.""",
+    .. versionremoved:: 1.10
+       This function has been removed in favor of :func:`warp.math.transform_compose()`.
+    .. deprecated:: 1.8""",
     group="Vector Math",
     export=False,
 )
@@ -1438,7 +1459,7 @@ def quaternion_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str
         if dtype is None:
             dtype = value_type
-        elif not warp.types.scalars_equal(value_type, dtype):
+        elif not warp._src.types.scalars_equal(value_type, dtype):
             raise RuntimeError(
                 f"all values used to initialize this quaternion are expected to be of the type `{dtype.__name__}`"
             )
@@ -1546,6 +1567,7 @@ add_builtin(
     group="Quaternion Math",
     doc="Construct an identity quaternion with zero imaginary part and real part of 1.0",
     export=True,
+    is_differentiable=False,
 )
 add_builtin(
@@ -1674,7 +1696,7 @@ def transformation_value_func(arg_types: Mapping[str, type], arg_values: Mapping
         value_type = strip_reference(variadic_arg_types[0])
         if dtype is None:
             dtype = value_type
-        elif not warp.types.scalars_equal(value_type, dtype):
+        elif not warp._src.types.scalars_equal(value_type, dtype):
             raise RuntimeError(
                 f"the value used to fill this transform is expected to be of the type `{dtype.__name__}`"
             )
@@ -1687,7 +1709,7 @@ def transformation_value_func(arg_types: Mapping[str, type], arg_values: Mapping
         if dtype is None:
             dtype = value_type
-        elif not warp.types.scalars_equal(value_type, dtype):
+        elif not warp._src.types.scalars_equal(value_type, dtype):
             raise RuntimeError(
                 f"all values used to initialize this transform are expected to be of the type `{dtype.__name__}`"
             )
@@ -1712,7 +1734,7 @@ def transformation_pq_value_func(arg_types: Mapping[str, type], arg_values: Mapp
     dtype = arg_values.get("dtype", None)
     if dtype is None:
         dtype = value_type
-    elif not warp.types.scalars_equal(value_type, dtype):
+    elif not warp._src.types.scalars_equal(value_type, dtype):
         raise RuntimeError(
             f"all values used to initialize this transformation matrix are expected to be of the type `{dtype.__name__}`"
         )
@@ -1727,9 +1749,19 @@ def transformation_dispatch_func(input_types: Mapping[str, type], return_type: A
     dtype = return_type._wp_scalar_type_
-    variadic_args = tuple(v for k, v in args.items() if k != "dtype")
+    variadic_args = args.get("args", ())
+    variadic_arg_count = len(variadic_args)
+    if variadic_arg_count == 7:
+        func_args = variadic_args
+    else:
+        func_args = tuple(v for k, v in args.items() if k != "dtype")
+        if "p" in args and "q" not in args:
+            quat_ident = warp._src.codegen.Var(
+                label=None, type=quaternion(dtype=dtype), constant=quaternion(dtype=dtype)(0, 0, 0, 1)
+            )
+            func_args += (quat_ident,)
-    func_args = variadic_args
     template_args = (dtype,)
     return (func_args, template_args)
@@ -1737,7 +1769,7 @@ def transformation_dispatch_func(input_types: Mapping[str, type], return_type: A
 add_builtin(
     "transformation",
     input_types={"p": vector(length=3, dtype=Float), "q": quaternion(dtype=Float), "dtype": Float},
-    defaults={"dtype": None},
+    defaults={"q": None, "dtype": None},
     value_func=transformation_pq_value_func,
     export_func=lambda input_types: {k: v for k, v in input_types.items() if k != "dtype"},
     dispatch_func=transformation_dispatch_func,
@@ -1795,6 +1827,7 @@ add_builtin(
     group="Transformations",
     doc="Construct an identity transform with zero translation and identity rotation.",
     export=True,
+    is_differentiable=False,
 )
 add_builtin(
@@ -1928,7 +1961,7 @@ def spatial_vector_value_func(arg_types: Mapping[str, type], arg_values: Mapping
         if dtype is None:
             dtype = value_type
-        elif not warp.types.scalars_equal(value_type, dtype):
+        elif not warp._src.types.scalars_equal(value_type, dtype):
             raise RuntimeError(
                 f"all values used to initialize this spatial vector are expected to be of the type `{dtype.__name__}`"
             )
@@ -2122,7 +2155,7 @@ add_builtin(
     value_func=tile_zeros_value_func,
     dispatch_func=tile_zeros_dispatch_func,
     variadic=False,
-    missing_grad=True,
+    is_differentiable=False,
     doc="""Allocate a tile of zero-initialized items.
     :param shape: Shape of the output tile
@@ -2142,7 +2175,7 @@ add_builtin(
     value_func=tile_zeros_value_func,
     dispatch_func=tile_zeros_dispatch_func,
     variadic=False,
-    missing_grad=True,
+    is_differentiable=False,
     hidden=True,
     group="Tile Primitives",
     export=False,
@@ -2194,7 +2227,7 @@ add_builtin(
     defaults={"storage": "register"},
     value_func=tile_ones_value_func,
     dispatch_func=tile_ones_dispatch_func,
-    missing_grad=True,
+    is_differentiable=False,
     doc="""Allocate a tile of one-initialized items.
     :param shape: Shape of the output tile
@@ -2213,7 +2246,86 @@ add_builtin(
     defaults={"storage": "register"},
     value_func=tile_ones_value_func,
     dispatch_func=tile_ones_dispatch_func,
-    missing_grad=True,
+    is_differentiable=False,
+    hidden=True,
+    group="Tile Primitives",
+    export=False,
+)
+def tile_full_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):  # Return-type resolver for tile_full(): builds the result tile type from the shape, dtype and storage argument values.
+    # return generic type (for doc builds)
+    if arg_types is None:
+        return tile(dtype=Any, shape=Tuple[int, ...])
+    shape = extract_tuple(arg_values["shape"], as_constant=True)  # NOTE(review): extract_tuple is defined elsewhere; presumably it yields None for entries that are not constants - confirm
+    if None in shape:  # a None entry is reported as a shape that is not a compile-time constant
+        raise ValueError("Tile functions require shape to be a compile time constant.")
+    if "value" not in arg_values:  # value, dtype and storage must all be present in arg_values, checked in this order
+        raise TypeError("tile_full() missing required keyword argument 'value'")
+    if "dtype" not in arg_values:
+        raise TypeError("tile_full() missing required keyword argument 'dtype'")
+    if "storage" not in arg_values:
+        raise TypeError("tile_full() missing required keyword argument 'storage'")
+    if arg_values["storage"] not in {"shared", "register"}:  # only these two storage names are accepted
+        raise ValueError(f"Invalid value for 'storage': {arg_values['storage']!r}. Expected 'shared' or 'register'.")
+    dtype = arg_values["dtype"]
+    return tile(dtype=dtype, shape=shape, storage=arg_values["storage"])  # the fill value is not used here; the type depends only on dtype, shape and storage
+def tile_full_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):  # Dispatch helper for tile_full(): returns (func_args, template_args).
+    shape = extract_tuple(arg_values["shape"], as_constant=True)
+    if None in shape:  # same constant-shape requirement as in the value function
+        raise ValueError("Tile functions require shape to be a compile time constant.")
+    dtype = arg_values["dtype"]
+    value = arg_values["value"]
+    func_args = [value]  # the fill value is the only function argument
+    template_args = []
+    template_args.append(dtype)  # template arguments are the dtype followed by each shape extent
+    template_args.extend(shape)
+    return (func_args, template_args)  # arg_values["storage"] is not read or forwarded by this function
+add_builtin(
+    "tile_full",
+    input_types={"shape": Tuple[int, ...], "value": Any, "dtype": Any, "storage": str},
+    defaults={"storage": "register"},
+    value_func=tile_full_value_func,
+    dispatch_func=tile_full_dispatch_func,
+    is_differentiable=False,
+    doc="""Allocate a tile filled with the specified value.
+    :param shape: Shape of the output tile
+    :param value: Value to fill the tile with
+    :param dtype: Data type of output tile's elements
+    :param storage: The storage location for the tile: ``"register"`` for registers
+      (default) or ``"shared"`` for shared memory.
+    :returns: A tile filled with the specified value""",
+    group="Tile Primitives",
+    export=False,
+)
+# overload for scalar shape
+add_builtin(
+    "tile_full",
+    input_types={"shape": int, "value": Any, "dtype": Any, "storage": str},
+    defaults={"storage": "register"},
+    value_func=tile_full_value_func,
+    dispatch_func=tile_full_dispatch_func,
+    is_differentiable=False,
     hidden=True,
     group="Tile Primitives",
     export=False,
@@ -2275,13 +2387,13 @@ def tile_arange_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a
     args = arg_values["args"]
     if len(args) == 1:
-        start = warp.codegen.Var(label=None, type=return_type.dtype, constant=0)
+        start = warp._src.codegen.Var(label=None, type=return_type.dtype, constant=0)
         stop = args[0]
-        step = warp.codegen.Var(label=None, type=return_type.dtype, constant=1)
+        step = warp._src.codegen.Var(label=None, type=return_type.dtype, constant=1)
     elif len(args) == 2:
         start = args[0]
         stop = args[1]
-        step = warp.codegen.Var(label=None, type=return_type.dtype, constant=1)
+        step = warp._src.codegen.Var(label=None, type=return_type.dtype, constant=1)
     elif len(args) == 3:
         start = args[0]
         stop = args[1]
@@ -2304,7 +2416,7 @@ add_builtin(
     value_func=tile_arange_value_func,
     dispatch_func=tile_arange_dispatch_func,
     variadic=True,
-    missing_grad=True,
+    is_differentiable=False,
     doc="""Generate a tile of linearly spaced elements.
     :param args: Variable-length positional arguments, interpreted as:
@@ -3099,7 +3211,7 @@ add_builtin(
     :param shape: Shape of the returned slice
     :returns: A tile with dimensions given by the specified shape or the remaining source tile dimensions""",
     group="Tile Primitives",
-    missing_grad=True,
+    is_differentiable=False,
     export=False,
 )
@@ -3346,7 +3458,32 @@ add_builtin(
 add_builtin(
     "assign",
-    input_types={"dst": tile(dtype=Any, shape=Tuple[int, int]), "i": int, "j": int, "src": Any},
+    input_types={"dst": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "src": Any},
+    value_func=tile_assign_value_func,
+    group="Tile Primitives",
+    export=False,
+    hidden=True,
+)
+add_builtin(
+    "assign",
+    input_types={"dst": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "k": int, "src": Any},
+    value_func=tile_assign_value_func,
+    group="Tile Primitives",
+    export=False,
+    hidden=True,
+)
+add_builtin(
+    "assign",
+    input_types={
+        "dst": tile(dtype=Any, shape=Tuple[int, ...]),
+        "i": int,
+        "j": int,
+        "k": int,
+        "l": int,
+        "src": Any,
+    },
     value_func=tile_assign_value_func,
     group="Tile Primitives",
     export=False,
@@ -3355,7 +3492,15 @@ add_builtin(
 add_builtin(
     "assign",
-    input_types={"dst": tile(dtype=Any, shape=Tuple[int, int, int]), "i": int, "j": int, "k": int, "src": Any},
+    input_types={
+        "dst": tile(dtype=Any, shape=Tuple[int, ...]),
+        "i": int,
+        "j": int,
+        "k": int,
+        "l": int,
+        "m": int,
+        "src": Any,
+    },
     value_func=tile_assign_value_func,
     group="Tile Primitives",
     export=False,
@@ -3370,6 +3515,8 @@ add_builtin(
         "j": int,
         "k": int,
         "l": int,
+        "m": int,
+        "n": int,
         "src": Any,
     },
     value_func=tile_assign_value_func,
@@ -3391,7 +3538,7 @@ def tile_value_func(arg_types, arg_values):
     if preserve_type:
         dtype = arg_types["x"]
-        shape = (warp.codegen.options["block_dim"],)
+        shape = (warp._src.codegen.options["block_dim"],)
         return tile(dtype=dtype, shape=shape)
@@ -3399,18 +3546,18 @@ def tile_value_func(arg_types, arg_values):
         if type_is_vector(arg_types["x"]):
             dtype = arg_types["x"]._wp_scalar_type_
             length = arg_types["x"]._shape_[0]
-            shape = (length, warp.codegen.options["block_dim"])
+            shape = (length, warp._src.codegen.options["block_dim"])
         elif type_is_quaternion(arg_types["x"]):
             dtype = arg_types["x"]._wp_scalar_type_
-            shape = (4, warp.codegen.options["block_dim"])
+            shape = (4, warp._src.codegen.options["block_dim"])
         elif type_is_matrix(arg_types["x"]):
             dtype = arg_types["x"]._wp_scalar_type_
             rows = arg_types["x"]._shape_[0]
             cols = arg_types["x"]._shape_[1]
-            shape = (rows, cols, warp.codegen.options["block_dim"])
+            shape = (rows, cols, warp._src.codegen.options["block_dim"])
         else:
             dtype = arg_types["x"]
-            shape = (warp.codegen.options["block_dim"],)
+            shape = (warp._src.codegen.options["block_dim"],)
         return tile(dtype=dtype, shape=shape)
@@ -3500,17 +3647,17 @@ def untile_value_func(arg_types, arg_values):
     if not is_tile(t):
         raise TypeError(f"untile() argument must be a tile, got {t!r}")
-    if t.shape[-1] != warp.codegen.options["block_dim"]:
+    if t.shape[-1] != warp._src.codegen.options["block_dim"]:
         raise ValueError(
-            f"untile() argument last dimension {t.shape[-1]} does not match the expected block width {warp.codegen.options['block_dim']}"
+            f"untile() argument last dimension {t.shape[-1]} does not match the expected block width {warp._src.codegen.options['block_dim']}"
         )
     if len(t.shape) == 1:
         return t.dtype
     elif len(t.shape) == 2:
-        return warp.types.vector(t.shape[0], t.dtype)
+        return warp._src.types.vector(t.shape[0], t.dtype)
     elif len(t.shape) == 3:
-        return warp.types.matrix((t.shape[0], t.shape[1]), t.dtype)
+        return warp._src.types.matrix((t.shape[0], t.shape[1]), t.dtype)
     else:
         raise ValueError(f"untile() argument must have a positive size in dimension 0, but got {t.shape[0]}")
@@ -3572,7 +3719,36 @@ def tile_extract_value_func(arg_types, arg_values):
     # force the input tile to shared memory
     arg_types["a"].storage = "shared"
-    return arg_types["a"].dtype
+    # count the number of indices (all parameters except the tile "a")
+    num_indices = len(arg_types) - 1
+    tile_dtype = arg_types["a"].dtype
+    tile_shape = arg_types["a"].shape
+    if type_is_vector(tile_dtype):
+        if num_indices == len(tile_shape):
+            return tile_dtype
+        elif num_indices == len(tile_shape) + 1:
+            return tile_dtype._wp_scalar_type_
+        else:
+            raise IndexError(
+                f"tile_extract: incorrect number of indices ({num_indices}) for tile shape {tuple(tile_shape)}"
+            )
+    elif type_is_matrix(tile_dtype):
+        if num_indices == len(tile_shape):
+            return tile_dtype
+        elif num_indices == len(tile_shape) + 2:
+            return tile_dtype._wp_scalar_type_
+        else:
+            raise IndexError(
+                f"tile_extract: incorrect number of indices ({num_indices}) for matrix tile shape {tuple(tile_shape)}"
+            )
+    else:
+        # scalar element: index count must exactly match tile rank
+        if num_indices == len(tile_shape):
+            return tile_dtype
+        raise IndexError(
+            f"tile_extract: incorrect number of indices ({num_indices}) for tile shape {tuple(tile_shape)}"
+        )
 add_builtin(
@@ -3596,7 +3772,7 @@ add_builtin(
 add_builtin(
     "tile_extract",
-    input_types={"a": tile(dtype=Any, shape=Tuple[int, int]), "i": int, "j": int},
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int},
     value_func=tile_extract_value_func,
     variadic=False,
     doc="""Extract a single element from the tile.
@@ -3607,7 +3783,28 @@ add_builtin(
     :param a: Tile to extract the element from
     :param i: Coordinate of element on first dimension
-    :param j: Coordinate of element on the second dimension
+    :param j: Coordinate of element on the second dimension, or vector index
+    :returns: The value of the element at the specified tile location with the same data type as the input tile""",
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+)
+add_builtin(
+    "tile_extract",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "k": int},
+    value_func=tile_extract_value_func,
+    variadic=False,
+    doc="""Extract a single element from the tile.
+    This function will extract an element from the tile and broadcast its value to all threads in the block.
+    Note that this may incur additional synchronization if the source tile is a register tile.
+    :param a: Tile to extract the element from
+    :param i: Coordinate of element on first dimension
+    :param j: Coordinate of element on the second dimension, or first matrix index
+    :param k: Coordinate of element on the third dimension, or vector index, or second matrix index
     :returns: The value of the element at the specified tile location with the same data type as the input tile""",
     group="Tile Primitives",
     hidden=True,
@@ -3616,7 +3813,36 @@ add_builtin(
 add_builtin(
     "tile_extract",
-    input_types={"a": tile(dtype=Any, shape=Tuple[int, int, int]), "i": int, "j": int, "k": int},
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "k": int, "l": int},
+    value_func=tile_extract_value_func,
+    variadic=False,
+    doc="""Extract a single element from the tile.
+    This function will extract an element from the tile and broadcast its value to all threads in the block.
+    Note that this may incur additional synchronization if the source tile is a register tile.
+    :param a: Tile to extract the element from
+    :param i: Coordinate of element on first dimension
+    :param j: Coordinate of element on the second dimension
+    :param k: Coordinate of element on the third dimension, or first matrix index
+    :param l: Coordinate of element on the fourth dimension, or vector index, or second matrix index
+    :returns: The value of the element at the specified tile location, with the same data type as the input tile""",
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+)
+add_builtin(
+    "tile_extract",
+    input_types={
+        "a": tile(dtype=Any, shape=Tuple[int, ...]),
+        "i": int,
+        "j": int,
+        "k": int,
+        "l": int,
+        "m": int,
+    },
     value_func=tile_extract_value_func,
     variadic=False,
     doc="""Extract a single element from the tile.
@@ -3629,7 +3855,9 @@ add_builtin(
     :param i: Coordinate of element on first dimension
     :param j: Coordinate of element on the second dimension
     :param k: Coordinate of element on the third dimension
-    :returns: The value of the element at the specified tile location with the same data type as the input tile""",
+    :param l: Coordinate of element on the fourth dimension, or first matrix index
+    :param m: Vector index, or second matrix index
+    :returns: The value of the element at the specified tile location, with the same data type as the input tile""",
     group="Tile Primitives",
     hidden=True,
     export=False,
@@ -3637,7 +3865,15 @@ add_builtin(
 add_builtin(
     "tile_extract",
-    input_types={"a": tile(dtype=Any, shape=Tuple[int, int, int, int]), "i": int, "j": int, "k": int, "l": int},
+    input_types={
+        "a": tile(dtype=Any, shape=Tuple[int, int, int, int]),
+        "i": int,
+        "j": int,
+        "k": int,
+        "l": int,
+        "m": int,
+        "n": int,
+    },
     value_func=tile_extract_value_func,
     variadic=False,
     doc="""Extract a single element from the tile.
@@ -3651,6 +3887,8 @@ add_builtin(
     :param j: Coordinate of element on the second dimension
     :param k: Coordinate of element on the third dimension
     :param l: Coordinate of element on the fourth dimension
+    :param m: Vector index, or first matrix index
+    :param n: Second matrix index
     :returns: The value of the element at the specified tile location, with the same data type as the input tile""",
     group="Tile Primitives",
     hidden=True,
@@ -3737,50 +3975,161 @@ add_builtin(
     export=False,
 )
-def tile_transpose_value_func(arg_types, arg_values):
-    # return generic type (for doc builds)
-    if arg_types is None:
-        return tile(dtype=Any, shape=Tuple[int, int])
-    if len(arg_types) != 1:
-        raise TypeError(f"tile_transpose() takes exactly 1 positional argument but {len(arg_types)} were given")
-    t = arg_types["a"]
-    if not is_tile(t):
-        raise TypeError(f"tile_transpose() argument must be a tile, got {t!r}")
-    layout = None
-    # flip layout
-    if t.layout == "rowmajor":
-        layout = "colmajor"
-    elif t.layout == "colmajor":
-        layout = "rowmajor"
-    # force the input tile to shared memory
-    t.storage = "shared"
-    return tile(
-        dtype=t.dtype,
-        shape=t.shape[::-1],
-        storage=t.storage,
-        strides=t.strides[::-1],
-        layout=layout,
-        owner=False,
-    )
 add_builtin(
-    "tile_transpose",
-    input_types={"a": tile(dtype=Any, shape=Tuple[int, int])},
-    value_func=tile_transpose_value_func,
-    variadic=True,
-    doc="""Transpose a tile.
-    For shared memory tiles, this operation will alias the input tile.
-    Register tiles will first be transferred to shared memory before transposition.
+    "tile_bit_and_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+    is_differentiable=False,
+)
+add_builtin(
+    "tile_bit_and_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+    is_differentiable=False,
+)
+add_builtin(
+    "tile_bit_and_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "k": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+    is_differentiable=False,
+)
+add_builtin(
+    "tile_bit_and_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "k": int, "l": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+    is_differentiable=False,
+)
+add_builtin(
+    "tile_bit_or_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+    is_differentiable=False,
+)
+add_builtin(
+    "tile_bit_or_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+    is_differentiable=False,
+)
+add_builtin(
+    "tile_bit_or_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "k": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+    is_differentiable=False,
+)
+add_builtin(
+    "tile_bit_or_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "k": int, "l": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+    is_differentiable=False,
+)
+add_builtin(
+    "tile_bit_xor_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+    is_differentiable=False,
+)
+add_builtin(
+    "tile_bit_xor_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+    is_differentiable=False,
+)
+add_builtin(
+    "tile_bit_xor_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "k": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+    is_differentiable=False,
+)
+add_builtin(
+    "tile_bit_xor_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "i": int, "j": int, "k": int, "l": int, "value": Any},
+    value_func=tile_inplace_value_func,
+    group="Tile Primitives",
+    hidden=True,
+    export=False,
+    is_differentiable=False,
+)
+def tile_transpose_value_func(arg_types, arg_values):  # Return-type resolver for tile_transpose(): result is a tile with reversed shape and strides and flipped layout.
+    # return generic type (for doc builds)
+    if arg_types is None:
+        return tile(dtype=Any, shape=Tuple[int, int])
+    if len(arg_types) != 1:  # exactly one argument (the tile "a") is accepted
+        raise TypeError(f"tile_transpose() takes exactly 1 positional argument but {len(arg_types)} were given")
+    t = arg_types["a"]
+    if not is_tile(t):
+        raise TypeError(f"tile_transpose() argument must be a tile, got {t!r}")
+    layout = None  # stays None when t.layout is neither "rowmajor" nor "colmajor"
+    # flip layout
+    if t.layout == "rowmajor":
+        layout = "colmajor"
+    elif t.layout == "colmajor":
+        layout = "rowmajor"
+    # force the input tile to shared memory
+    t.storage = "shared"  # mutates the input tile type in place
+    return tile(
+        dtype=t.dtype,
+        shape=t.shape[::-1],  # reversed extents
+        storage=t.storage,  # always "shared" at this point, because of the assignment above
+        strides=t.strides[::-1],  # strides are reversed together with the shape
+        layout=layout,
+        owner=False,  # NOTE(review): presumably marks the result as a view aliasing the input's storage - confirm against tile()
+    )
+add_builtin(
+    "tile_transpose",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, int])},
+    value_func=tile_transpose_value_func,
+    variadic=True,
+    doc="""Transpose a tile.
+    For shared memory tiles, this operation will alias the input tile.
+    Register tiles will first be transferred to shared memory before transposition.
     :param a: Tile to transpose with ``shape=(M,N)``
     :returns: Tile with ``shape=(N,M)``""",
@@ -3910,6 +4259,80 @@ add_builtin(
 )
+def tile_sum_axis_value_func(arg_types, arg_values):  # Return-type resolver for the axis overload of tile_sum(): the result tile drops the reduced axis.
+    if arg_types is None:  # no argument types supplied: return the generic tile type
+        return tile(dtype=Scalar, shape=Tuple[int, ...])
+    a = arg_types["a"]
+    if not is_tile(a):
+        raise TypeError(f"tile_sum() 'a' argument must be a tile, got {a!r}")
+    # force input tile to shared
+    a.storage = "shared"  # mutates the input tile type in place
+    axis = arg_values["axis"]  # NOTE(review): compared against ints below with no None guard; assumes a constant int is supplied here (None would raise TypeError) - confirm
+    shape = a.shape
+    if axis < 0 or axis >= len(shape):  # negative axes are rejected, not wrapped around
+        raise ValueError(f"tile_sum() axis {axis} is out of bounds for tile with {len(shape)} dimensions")
+    # output shape is the input shape with the reduced axis removed
+    if len(shape) > 1:
+        new_shape = shape[:axis] + shape[axis + 1 :]
+    else:
+        new_shape = (1,)  # reducing a 1-D tile yields a single-element tile rather than an empty shape
+    return tile(dtype=a.dtype, shape=new_shape)  # storage is not passed, so tile()'s own default applies
+def tile_sum_axis_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]):  # Dispatch helper for the axis overload of tile_sum().
+    tile = arg_values["a"]  # NOTE: this local shadows the tile() constructor name inside this function
+    axis_var = arg_values["axis"]
+    if not hasattr(axis_var, "constant") or axis_var.constant is None:  # reject an axis that carries no constant value
+        raise ValueError("tile_sum() axis must be a compile-time constant")
+    axis = axis_var.constant
+    return ((tile,), (axis,))  # first tuple holds the tile argument, second holds the constant axis (same (func_args, template_args) order as other dispatch functions in this file)
+add_builtin(
+    "tile_sum",
+    input_types={"a": tile(dtype=Scalar, shape=Tuple[int, ...]), "axis": int},
+    value_func=tile_sum_axis_value_func,
+    dispatch_func=tile_sum_axis_dispatch_func,
+    doc="""Cooperatively compute the sum of the tile elements across an axis of the tile using all threads in the block.
+    :param a: The input tile. Must reside in shared memory.
+    :param axis: The tile axis to compute the sum across. Must be a compile-time constant.
+    :returns: A tile with the same shape as the input tile less the axis dimension and the same data type as the input tile.
+    Example:
+    .. code-block:: python
+        @wp.kernel
+        def compute():
+            t = wp.tile_ones(dtype=float, shape=(8, 8))
+            s = wp.tile_sum(t, axis=0)
+            print(s)
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
+    Prints:
+    .. code-block:: text
+        [8 8 8 8 8 8 8 8] = tile(shape=(8), storage=register)
+    """,
+    group="Tile Primitives",
+    export=False,
+)
 def tile_sort_value_func(arg_types, arg_values):
     # return generic type (for doc builds)
     if arg_types is None:
@@ -3986,6 +4409,7 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    is_differentiable=False,
 )
@@ -4039,6 +4463,7 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    is_differentiable=False,
 )
@@ -4092,6 +4517,7 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    is_differentiable=False,
 )
@@ -4144,6 +4570,7 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    is_differentiable=False,
 )
@@ -4196,10 +4623,10 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    is_differentiable=False,
 )
-# does type propagation for load()
 def tile_reduce_value_func(arg_types, arg_values):
     if arg_types is None:
         return tile(dtype=Scalar, shape=(1,))
@@ -4253,6 +4680,88 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    is_differentiable=False,
+)
+def tile_reduce_axis_value_func(arg_types, arg_values):  # Return-type resolver for the axis overload of tile_reduce(): the result tile drops the reduced axis.
+    if arg_types is None:  # no argument types supplied: return the generic tile type
+        return tile(dtype=Scalar, shape=Tuple[int, ...])
+    a = arg_types["a"]
+    if not is_tile(a):
+        raise TypeError(f"tile_reduce() 'a' argument must be a tile, got {a!r}")
+    # force input tile to shared memory
+    a.storage = "shared"  # mutates the input tile type in place
+    axis = arg_values["axis"]  # NOTE(review): compared against ints below with no None guard; assumes a constant int is supplied here (None would raise TypeError) - confirm
+    shape = a.shape
+    if axis < 0 or axis >= len(shape):  # negative axes are rejected, not wrapped around
+        raise ValueError(f"tile_reduce() axis {axis} is out of bounds for tile with {len(shape)} dimensions")
+    # output shape is the input shape with the reduced axis removed
+    if len(shape) > 1:
+        new_shape = shape[:axis] + shape[axis + 1 :]
+    else:
+        new_shape = (1,)  # reducing a 1-D tile yields a single-element tile rather than an empty shape
+    return tile(dtype=a.dtype, shape=new_shape)  # the "op" argument is not consulted here; result dtype is the input tile's dtype
+add_builtin(
+    "tile_reduce",
+    input_types={"op": Callable, "a": tile(dtype=Scalar, shape=Tuple[int, ...]), "axis": int},
+    value_func=tile_reduce_axis_value_func,
+    native_func="tile_reduce_axis",
+    doc="""Apply a custom reduction operator across a tile axis.
+    This function cooperatively performs a reduction using the provided operator across an axis of the tile.
+    :param op: A callable function that accepts two arguments and returns one argument, may be a user function or builtin
+    :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's data type. Must reside in shared memory.
+    :param axis: The tile axis to perform the reduction across. Must be a compile-time constant.
+    :returns: A tile with the same shape as the input tile less the axis dimension and the same data type as the input tile.
+    Example:
+    .. code-block:: python
+        TILE_M = wp.constant(4)
+        TILE_N = wp.constant(2)
+        @wp.kernel
+        def compute(x: wp.array2d(dtype=float), y: wp.array(dtype=float)):
+            a = wp.tile_load(x, shape=(TILE_M, TILE_N))
+            b = wp.tile_reduce(wp.add, a, axis=1)
+            wp.tile_store(y, b)
+        arr = np.arange(TILE_M * TILE_N).reshape(TILE_M, TILE_N)
+        x = wp.array(arr, dtype=float)
+        y = wp.zeros(TILE_M, dtype=float)
+        wp.launch_tiled(compute, dim=[1], inputs=[x], outputs=[y], block_dim=32)
+        print(x.numpy())
+        print(y.numpy())
+    Prints:
+    .. code-block:: text
+        [[0. 1.]
+         [2. 3.]
+         [4. 5.]
+         [6. 7.]]
+        [ 1.  5.  9. 13.]
+    """,
+    group="Tile Primitives",
+    export=False,
+    is_differentiable=False,
 )
@@ -4316,6 +4825,7 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    is_differentiable=False,
 )
@@ -4379,6 +4889,7 @@ add_builtin(
     """,
     group="Tile Primitives",
     export=False,
+    is_differentiable=False,
 )
@@ -4632,6 +5143,7 @@ add_builtin(
     doc="WIP",
     group="Utility",
     hidden=True,
+    is_differentiable=False,
 )
 add_builtin(
@@ -4647,6 +5159,7 @@ add_builtin(
     doc="WIP",
     group="Utility",
     hidden=True,
+    is_differentiable=False,
 )
 add_builtin(
@@ -4656,6 +5169,7 @@ add_builtin(
     doc="WIP",
     group="Utility",
     hidden=True,
+    is_differentiable=False,
 )
 add_builtin(
@@ -4707,6 +5221,7 @@ add_builtin(
     :param low: The lower bound of the bounding box in BVH space
     :param high: The upper bound of the bounding box in BVH space""",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -4722,6 +5237,7 @@ add_builtin(
     :param start: The start of the ray in BVH space
     :param dir: The direction of the ray in BVH space""",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -4732,6 +5248,7 @@ add_builtin(
     doc="""Move to the next bound returned by the query.
     The index of the current bound is stored in ``index``, returns ``False`` if there are no more overlapping bound.""",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -5066,12 +5583,13 @@ add_builtin(
     group="Geometry",
     doc="""Construct an axis-aligned bounding box query against a :class:`Mesh`.
-    This query can be used to iterate over all triangles inside a volume.
+    This query can be used to iterate over all bounding boxes of the triangles inside a volume.
     :param id: The mesh identifier
     :param low: The lower bound of the bounding box in mesh space
     :param high: The upper bound of the bounding box in mesh space""",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -5079,10 +5597,11 @@ add_builtin(
     input_types={"query": MeshQueryAABB, "index": int},
     value_type=builtins.bool,
     group="Geometry",
-    doc="""Move to the next triangle overlapping the query bounding box.
+    doc="""Move to the next triangle whose bounding box overlaps the query bounding box.
     The index of the current face is stored in ``index``, returns ``False`` if there are no more overlapping triangles.""",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -5112,6 +5631,7 @@ add_builtin(
     This query can be used to iterate over all neighboring point within a fixed radius from the query point.""",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -5123,6 +5643,7 @@ add_builtin(
     The index of the current neighbor is stored in ``index``, returns ``False`` if there are no more neighbors.""",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -5136,6 +5657,7 @@ add_builtin(
     Returns -1 if the :class:`HashGrid` has not been reserved.""",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -5145,15 +5667,34 @@ add_builtin(
     group="Geometry",
     doc="""Tests for intersection between two triangles (v0, v1, v2) and (u0, u1, u2) using Moller's method.
+    This function works with single precision, may return incorrect results in some case.
     Returns > 0 if triangles intersect.""",
     export=False,
+    is_differentiable=False,
 )
+add_builtin(
+    "intersect_tri_tri",
+    input_types={"v0": vec3d, "v1": vec3d, "v2": vec3d, "u0": vec3d, "u1": vec3d, "u2": vec3d},
+    value_type=int,
+    group="Geometry",
+    doc="""Tests for intersection between two triangles (v0, v1, v2) and (u0, u1, u2) using Moller's method.
+    This function works with double precision, results are more accurate than the single precision version.
+    Returns > 0 if triangles intersect.""",
+    export=False,
+    is_differentiable=False,
+)
 add_builtin(
     "mesh_get",
     input_types={"id": uint64},
     value_type=Mesh,
-    missing_grad=True,
+    is_differentiable=False,
     group="Geometry",
     doc="""Retrieves the mesh given its index.""",
     export=False,
@@ -5166,6 +5707,7 @@ add_builtin(
     group="Geometry",
     doc="""Evaluates the face normal of the mesh given a face index.""",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -5175,6 +5717,7 @@ add_builtin(
     group="Geometry",
     doc="""Returns the point of the mesh given an index.""",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -5184,6 +5727,7 @@ add_builtin(
     group="Geometry",
     doc="""Returns the velocity of the mesh given an index.""",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -5193,6 +5737,7 @@ add_builtin(
     group="Geometry",
     doc="""Returns the point-index of the mesh given a face-vertex index.""",
     export=False,
+    is_differentiable=False,
 )
@@ -5233,12 +5778,32 @@ add_builtin(
 # ---------------------------------
 # Iterators
-add_builtin("iter_next", input_types={"range": range_t}, value_type=int, group="Utility", export=False, hidden=True)
 add_builtin(
-    "iter_next", input_types={"query": HashGridQuery}, value_type=int, group="Utility", export=False, hidden=True
+    "iter_next",
+    input_types={"range": range_t},
+    value_type=int,
+    group="Utility",
+    export=False,
+    hidden=True,
+    is_differentiable=False,
+)
+add_builtin(
+    "iter_next",
+    input_types={"query": HashGridQuery},
+    value_type=int,
+    group="Utility",
+    export=False,
+    hidden=True,
+    is_differentiable=False,
 )
 add_builtin(
-    "iter_next", input_types={"query": MeshQueryAABB}, value_type=int, group="Utility", export=False, hidden=True
+    "iter_next",
+    input_types={"query": MeshQueryAABB},
+    value_type=int,
+    group="Utility",
+    export=False,
+    hidden=True,
+    is_differentiable=False,
 )
 add_builtin(
@@ -5249,6 +5814,7 @@ add_builtin(
     group="Utility",
     doc="""Returns the range in reversed order.""",
     export=False,
+    is_differentiable=False,
 )
 # ---------------------------------
@@ -5268,8 +5834,8 @@ _volume_supported_value_types = {
 def _is_volume_type_supported(dtype):  # True when dtype matches an entry of _volume_supported_value_types
-    for typ in _volume_supported_value_types:
-        if types_equal(typ, dtype):
+    for value_type in _volume_supported_value_types:
+        if types_equal(value_type, dtype):  # the comparison itself is delegated to types_equal()
             return True
     return False  # no supported value type matched dtype
@@ -5397,6 +5963,7 @@ add_builtin(
     doc="""Returns the value of voxel with coordinates ``i``, ``j``, ``k`` for a volume of type `dtype`.
     If the voxel at this index does not exist, this function returns the background value.""",
+    is_differentiable=False,
 )
@@ -5417,6 +5984,7 @@ add_builtin(
     export=False,
     group="Volumes",
     doc="""Store ``value`` at the voxel with coordinates ``i``, ``j``, ``k``.""",
+    is_differentiable=False,
 )
 add_builtin(
@@ -5447,6 +6015,7 @@ add_builtin(
     doc="""Returns the value of voxel with coordinates ``i``, ``j``, ``k``.
     If the voxel at this index does not exist, this function returns the background value""",
+    is_differentiable=False,
 )
 add_builtin(
@@ -5455,6 +6024,7 @@ add_builtin(
     group="Volumes",
     doc="""Store ``value`` at the voxel with coordinates ``i``, ``j``, ``k``.""",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -5475,6 +6045,7 @@ add_builtin(
     doc="""Returns the vector value of voxel with coordinates ``i``, ``j``, ``k``.
     If the voxel at this index does not exist, this function returns the background value.""",
+    is_differentiable=False,
 )
 add_builtin(
@@ -5483,6 +6054,7 @@ add_builtin(
     group="Volumes",
     doc="""Store ``value`` at the voxel with coordinates ``i``, ``j``, ``k``.""",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -5501,6 +6073,7 @@ add_builtin(
     doc="""Returns the :class:`int32` value of voxel with coordinates ``i``, ``j``, ``k``.
     If the voxel at this index does not exist, this function returns the background value.""",
+    is_differentiable=False,
 )
 add_builtin(
@@ -5509,6 +6082,7 @@ add_builtin(
     group="Volumes",
     doc="""Store ``value`` at the voxel with coordinates ``i``, ``j``, ``k``.""",
     export=False,
+    is_differentiable=False,
 )
@@ -5590,6 +6164,7 @@ add_builtin(
     If the voxel at this index does not exist, this function returns -1.
     This function is available for both index grids and classical volumes.
     """,
+    is_differentiable=False,
 )
 add_builtin(
@@ -5631,6 +6206,7 @@ add_builtin(
     value_type=uint32,
     group="Random",
     doc="Initialize a new random number generator given a user-defined seed. Returns a 32-bit integer representing the RNG state.",
+    is_differentiable=False,
 )
 add_builtin(
@@ -5642,6 +6218,7 @@ add_builtin(
     This alternative constructor can be useful in parallel programs, where a kernel as a whole should share a seed,
     but each thread should generate uncorrelated values. In this case usage should be ``r = rand_init(seed, tid)``""",
+    is_differentiable=False,
 )
 add_builtin(
@@ -5650,6 +6227,7 @@ add_builtin(
     value_type=int,
     group="Random",
     doc="Return a random integer in the range [-2^31, 2^31).",
+    is_differentiable=False,
 )
 add_builtin(
     "randi",
@@ -5657,6 +6235,7 @@ add_builtin(
     value_type=int,
     group="Random",
     doc="Return a random integer between [low, high).",
+    is_differentiable=False,
 )
 add_builtin(
     "randu",
@@ -5664,6 +6243,7 @@ add_builtin(
     value_type=uint32,
     group="Random",
     doc="Return a random unsigned integer in the range [0, 2^32).",
+    is_differentiable=False,
 )
 add_builtin(
     "randu",
@@ -5671,6 +6251,7 @@ add_builtin(
     value_type=uint32,
     group="Random",
     doc="Return a random unsigned integer between [low, high).",
+    is_differentiable=False,
 )
 add_builtin(
     "randf",
@@ -5678,6 +6259,7 @@ add_builtin(
     value_type=float,
     group="Random",
     doc="Return a random float between [0.0, 1.0).",
+    is_differentiable=False,
 )
 add_builtin(
     "randf",
@@ -5685,6 +6267,7 @@ add_builtin(
     value_type=float,
     group="Random",
     doc="Return a random float between [low, high).",
+    is_differentiable=False,
 )
 add_builtin(
     "randn",
@@ -5692,6 +6275,7 @@ add_builtin(
     value_type=float,
     group="Random",
     doc="Sample a normal (Gaussian) distribution of mean 0 and variance 1. ",
+    is_differentiable=False,
 )
 add_builtin(
@@ -5700,6 +6284,7 @@ add_builtin(
     value_type=int,
     group="Random",
     doc="Inverse-transform sample a cumulative distribution function.",
+    is_differentiable=False,
 )
 add_builtin(
     "sample_triangle",
@@ -5707,6 +6292,7 @@ add_builtin(
     value_type=vec2,
     group="Random",
     doc="Uniformly sample a triangle. Returns sample barycentric coordinates.",
+    is_differentiable=False,
 )
 add_builtin(
     "sample_unit_ring",
@@ -5714,6 +6300,7 @@ add_builtin(
     value_type=vec2,
     group="Random",
     doc="Uniformly sample a ring in the xy plane.",
+    is_differentiable=False,
 )
 add_builtin(
     "sample_unit_disk",
@@ -5721,6 +6308,7 @@ add_builtin(
     value_type=vec2,
     group="Random",
     doc="Uniformly sample a disk in the xy plane.",
+    is_differentiable=False,
 )
 add_builtin(
     "sample_unit_sphere_surface",
@@ -5728,6 +6316,7 @@ add_builtin(
     value_type=vec3,
     group="Random",
     doc="Uniformly sample a unit sphere surface.",
+    is_differentiable=False,
 )
 add_builtin(
     "sample_unit_sphere",
@@ -5735,6 +6324,7 @@ add_builtin(
     value_type=vec3,
     group="Random",
     doc="Uniformly sample a unit sphere.",
+    is_differentiable=False,
 )
 add_builtin(
     "sample_unit_hemisphere_surface",
@@ -5742,6 +6332,7 @@ add_builtin(
     value_type=vec3,
     group="Random",
     doc="Uniformly sample a unit hemisphere surface.",
+    is_differentiable=False,
 )
 add_builtin(
     "sample_unit_hemisphere",
@@ -5749,6 +6340,7 @@ add_builtin(
     value_type=vec3,
     group="Random",
     doc="Uniformly sample a unit hemisphere.",
+    is_differentiable=False,
 )
 add_builtin(
     "sample_unit_square",
@@ -5756,6 +6348,7 @@ add_builtin(
     value_type=vec2,
     group="Random",
     doc="Uniformly sample a unit square.",
+    is_differentiable=False,
 )
 add_builtin(
     "sample_unit_cube",
@@ -5763,6 +6356,7 @@ add_builtin(
     value_type=vec3,
     group="Random",
     doc="Uniformly sample a unit cube.",
+    is_differentiable=False,
 )
 add_builtin(
@@ -5774,6 +6368,7 @@ add_builtin(
     :param state: RNG state
     :param lam: The expected value of the distribution""",
+    is_differentiable=False,
 )
 add_builtin(
@@ -5841,7 +6436,7 @@ add_builtin(
     value_type=vec2,
     group="Random",
     doc="Divergence-free vector field based on the gradient of a Perlin noise function.",
-    missing_grad=True,
+    is_differentiable=False,
 )
 add_builtin(
     "curlnoise",
@@ -5850,7 +6445,7 @@ add_builtin(
     value_type=vec3,
     group="Random",
     doc="Divergence-free vector field based on the curl of three Perlin noise functions.",
-    missing_grad=True,
+    is_differentiable=False,
 )
 add_builtin(
     "curlnoise",
@@ -5859,7 +6454,7 @@ add_builtin(
     value_type=vec3,
     group="Random",
     doc="Divergence-free vector field based on the curl of three Perlin noise functions.",
-    missing_grad=True,
+    is_differentiable=False,
 )
@@ -5891,9 +6486,16 @@ add_builtin(
     dispatch_func=printf_dispatch_func,
     group="Utility",
     doc="Allows printing formatted strings using C-style format specifiers.",
+    is_differentiable=False,
 )
-add_builtin("print", input_types={"value": Any}, doc="Print variable to stdout", export=False, group="Utility")
+add_builtin(
+    "print",
+    input_types={"value": Any},
+    doc="Print variable to stdout",
+    export=False,
+    group="Utility",
+)
 add_builtin(
     "breakpoint",
@@ -5903,6 +6505,7 @@ add_builtin(
     group="Utility",
     namespace="",
     native_func="__debugbreak",
+    is_differentiable=False,
 )
 # helpers
@@ -5920,6 +6523,7 @@ add_builtin(
     This function may not be called from user-defined Warp functions.""",
     namespace="",
     native_func="builtin_tid1d",
+    is_differentiable=False,
 )
 add_builtin(
@@ -5930,6 +6534,7 @@ add_builtin(
     doc="Returns the number of threads in the current block.",
     namespace="",
     native_func="builtin_block_dim",
+    is_differentiable=False,
 )
 add_builtin(
@@ -5944,6 +6549,7 @@ add_builtin(
     This function may not be called from user-defined Warp functions.""",
     namespace="",
     native_func="builtin_tid2d",
+    is_differentiable=False,
 )
 add_builtin(
@@ -5958,6 +6564,7 @@ add_builtin(
     This function may not be called from user-defined Warp functions.""",
     namespace="",
     native_func="builtin_tid3d",
+    is_differentiable=False,
 )
 add_builtin(
@@ -5972,17 +6579,37 @@ add_builtin(
     This function may not be called from user-defined Warp functions.""",
     namespace="",
     native_func="builtin_tid4d",
+    is_differentiable=False,
 )
+def copy_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):  # value_func of the hidden "copy" builtin: derives the result type from the type of argument "a"
+    a = arg_types["a"]
+    # if the input is a shared tile, we force a copy
+    if is_tile(a) and a.storage == "shared":
+        return tile(
+            dtype=a.dtype,
+            shape=a.shape,
+            storage=a.storage,
+            strides=a.strides,
+            layout=a.layout,
+            owner=True,  # every other field mirrors the input tile type; only the owner flag is set explicitly
+        )
+    return a  # any other argument type is returned unchanged
 add_builtin(
     "copy",
     input_types={"a": Any},
-    value_func=lambda arg_types, arg_values: arg_types["a"],
+    value_func=copy_value_func,
     hidden=True,
     export=False,
     group="Utility",
 )
 add_builtin(
     "assign",
     input_types={"dest": Any, "src": Any},
@@ -5992,61 +6619,88 @@ add_builtin(
 )
-def select_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
-    warp.utils.warn(
-        "wp.select() is deprecated and will be removed in a future\n"
-        "version. Use wp.where(cond, value_if_true, value_if_false) instead.",
-        category=DeprecationWarning,
-    )
-    func_args = tuple(args.values())
-    template_args = ()
+def select_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):  # value_func shared by every "select" overload registered below
+    if arg_types is None:  # no argument types supplied: report a generic return type (presumably the non-call query path - TODO confirm)
+        return Any
-    return (func_args, template_args)
+    raise RuntimeError("wp.select() has been removed. Use wp.where(cond, value_if_true, value_if_false) instead.")  # any lookup with actual argument types is rejected
 add_builtin(
     "select",
     input_types={"cond": builtins.bool, "value_if_false": Any, "value_if_true": Any},
-    value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
-    dispatch_func=select_dispatch_func,
+    value_func=select_value_func,
     doc="""Select between two arguments, if ``cond`` is ``False`` then return ``value_if_false``, otherwise return ``value_if_true``.
-    .. deprecated:: 1.7
+    .. versionremoved:: 1.10
          Use :func:`where` instead, which has the more intuitive argument order:
-         ``where(cond, value_if_true, value_if_false)``.""",
+         ``where(cond, value_if_true, value_if_false)``.
+    .. deprecated:: 1.7""",
     group="Utility",
 )
 for t in int_types:
     add_builtin(
         "select",
         input_types={"cond": t, "value_if_false": Any, "value_if_true": Any},
-        value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
-        dispatch_func=select_dispatch_func,
+        value_func=select_value_func,
         doc="""Select between two arguments, if ``cond`` is ``False`` then return ``value_if_false``, otherwise return ``value_if_true``.
-    .. deprecated:: 1.7
+    .. versionremoved:: 1.10
          Use :func:`where` instead, which has the more intuitive argument order:
-         ``where(cond, value_if_true, value_if_false)``.""",
+         ``where(cond, value_if_true, value_if_false)``.
+    .. deprecated:: 1.7""",
         group="Utility",
     )
 add_builtin(
     "select",
     input_types={"arr": array(dtype=Any), "value_if_false": Any, "value_if_true": Any},
-    value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
-    dispatch_func=select_dispatch_func,
+    value_func=select_value_func,
     doc="""Select between two arguments, if ``arr`` is null then return ``value_if_false``, otherwise return ``value_if_true``.
-    .. deprecated:: 1.7
+    .. versionremoved:: 1.10
          Use :func:`where` instead, which has the more intuitive argument order:
-         ``where(arr, value_if_true, value_if_false)``.""",
+         ``where(arr, value_if_true, value_if_false)``.
+    .. deprecated:: 1.7""",
     group="Utility",
 )
+def where_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):  # value_func shared by the "where" overloads: validates the two branch types and returns the result type
+    if arg_types is None:  # no argument types supplied: report a generic return type
+        return Any
+    v_true = arg_types["value_if_true"]
+    v_false = arg_types["value_if_false"]
+    if not types_equal(v_true, v_false):  # both branch values must have equal types, otherwise RuntimeError
+        raise RuntimeError(f"where() true value type ({v_true}) must be of the same type as the false type ({v_false})")
+    if is_tile(v_false):  # tile arguments: the result type depends on the storage of each branch
+        if v_true.storage == "register":  # a register-storage branch type is returned as-is; value_if_true is checked first
+            return v_true
+        if v_false.storage == "register":
+            return v_false
+        # both v_true and v_false are shared
+        return tile(
+            dtype=v_true.dtype,
+            shape=v_true.shape,
+            storage=v_true.storage,
+            strides=v_true.strides,
+            layout=v_true.layout,
+            owner=True,  # new tile type built from v_true's fields with the owner flag set
+        )
+    return v_true  # non-tile case: both types were verified equal above
 add_builtin(
     "where",
     input_types={"cond": builtins.bool, "value_if_true": Any, "value_if_false": Any},
-    value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
+    value_func=where_value_func,
     doc="Select between two arguments, if ``cond`` is ``True`` then return ``value_if_true``, otherwise return ``value_if_false``.",
     group="Utility",
 )
@@ -6054,14 +6708,14 @@ for t in int_types:
     add_builtin(
         "where",
         input_types={"cond": t, "value_if_true": Any, "value_if_false": Any},
-        value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
+        value_func=where_value_func,
         doc="Select between two arguments, if ``cond`` is ``True`` then return ``value_if_true``, otherwise return ``value_if_false``.",
         group="Utility",
     )
 add_builtin(
     "where",
     input_types={"arr": array(dtype=Any), "value_if_true": Any, "value_if_false": Any},
-    value_func=lambda arg_types, arg_values: Any if arg_types is None else arg_types["value_if_false"],
+    value_func=where_value_func,
     doc="Select between two arguments, if ``arr`` is not null then return ``value_if_true``, otherwise return ``value_if_false``.",
     group="Utility",
 )
@@ -6099,7 +6753,7 @@ add_builtin(
     group="Utility",
     hidden=True,
     export=False,
-    missing_grad=True,
+    is_differentiable=False,
 )
@@ -6140,7 +6794,7 @@ add_builtin(
     native_func="fixedarray_t",
     group="Utility",
     export=False,
-    missing_grad=True,
+    is_differentiable=False,
     hidden=True,  # Unhide once we can document both a built-in and a Python scope function sharing the same name.
 )
@@ -6183,14 +6837,13 @@ for array_type in array_types:
 # does argument checking and type propagation for view()
 def view_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
     arr_type = arg_types["arr"]
-    idx_types = tuple(arg_types[x] for x in "ijk" if arg_types.get(x, None) is not None)
+    idx_types = tuple(arg_types[x] for x in "ijkl" if arg_types.get(x, None) is not None)
     if not is_array(arr_type):
         raise RuntimeError("view() first argument must be an array")
     idx_count = len(idx_types)
-    if idx_count >= arr_type.ndim:
+    if idx_count > arr_type.ndim:
         raise RuntimeError(
             f"Trying to create an array view with {idx_count} indices, "
             f"but the array only has {arr_type.ndim} dimension(s). "
@@ -6198,14 +6851,35 @@ def view_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]
             f"the expected number of dimensions, e.g.: def func(param: wp.array3d(dtype=float): ..."
         )
-    # check index types
-    for t in idx_types:
-        if not type_is_int(t):
-            raise RuntimeError(f"view() index arguments must be of integer type, got index of type {type_repr(t)}")
+    has_slice = any(is_slice(x) for x in idx_types)
+    if has_slice:
+        # check index types
+        for t in idx_types:
+            if not (type_is_int(t) or is_slice(t)):
+                raise RuntimeError(
+                    f"view() index arguments must be of integer or slice types, got index of type {type_repr(t)}"
+                )
+        # Each integer index collapses one dimension.
+        int_count = sum(x.step == 0 for x in idx_types)
+        ndim = arr_type.ndim - int_count
+        assert ndim > 0
+    else:
+        if idx_count == arr_type.ndim:
+            raise RuntimeError("Expected to call `address()` instead of `view()`")
+        # check index types
+        for t in idx_types:
+            if not type_is_int(t):
+                raise RuntimeError(
+                    f"view() index arguments must be of integer or slice types, got index of type {type_repr(t)}"
+                )
+        # create an array view with leading dimensions removed
+        ndim = arr_type.ndim - idx_count
+        assert ndim > 0
-    # create an array view with leading dimensions removed
     dtype = arr_type.dtype
-    ndim = arr_type.ndim - idx_count
     if isinstance(arr_type, (fabricarray, indexedfabricarray)):
         # fabric array of arrays: return array attribute as a regular array
         return array(dtype=dtype, ndim=ndim)
@@ -6216,8 +6890,18 @@ def view_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]
 for array_type in array_types:
     add_builtin(
         "view",
-        input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int},
-        defaults={"j": None, "k": None},
+        input_types={
+            "arr": array_type(dtype=Any),
+            "i": Any,
+            "j": Any,
+            "k": Any,
+            "l": Any,
+        },
+        defaults={
+            "j": None,
+            "k": None,
+            "l": None,
+        },
         constraint=sametypes,
         hidden=True,
         value_func=view_value_func,
@@ -6321,6 +7005,7 @@ add_builtin(
     hidden=True,
     skip_replay=True,
     group="Utility",
+    is_differentiable=False,
 )
@@ -6337,6 +7022,7 @@ add_builtin(
     dispatch_func=load_dispatch_func,
     hidden=True,
     group="Utility",
+    is_differentiable=False,
 )
@@ -6412,6 +7098,13 @@ def create_atomic_op_value_func(op: str):
                     f"atomic_{op}() operations only work on arrays with [u]int32, [u]int64, float32, or float64 "
                     f"as the underlying scalar types, but got {type_repr(arr_type.dtype)} (with scalar type {type_repr(scalar_type)})"
                 )
+        elif op in ("and", "or", "xor"):
+            supported_atomic_types = (warp.int32, warp.int64, warp.uint32, warp.uint64)
+            if not any(types_equal(scalar_type, x, match_generic=True) for x in supported_atomic_types):
+                raise RuntimeError(
+                    f"atomic_{op}() operations only work on arrays with [u]int32 or [u]int64 "
+                    f"as the underlying scalar types, but got {type_repr(arr_type.dtype)} (with scalar type {type_repr(scalar_type)})"
+                )
         else:
             raise NotImplementedError
@@ -6653,6 +7346,7 @@ for array_type in array_types:
     The operation is only atomic on a per-component basis for vectors and matrices.""",
         group="Utility",
         skip_replay=True,
+        is_differentiable=False,
     )
     add_builtin(
         "atomic_cas",
@@ -6666,6 +7360,7 @@ for array_type in array_types:
     The operation is only atomic on a per-component basis for vectors and matrices.""",
         group="Utility",
         skip_replay=True,
+        is_differentiable=False,
     )
     add_builtin(
         "atomic_cas",
@@ -6679,6 +7374,7 @@ for array_type in array_types:
     The operation is only atomic on a per-component basis for vectors and matrices.""",
         group="Utility",
         skip_replay=True,
+        is_differentiable=False,
     )
     add_builtin(
         "atomic_cas",
@@ -6700,6 +7396,7 @@ for array_type in array_types:
     The operation is only atomic on a per-component basis for vectors and matrices.""",
         group="Utility",
         skip_replay=True,
+        is_differentiable=False,
     )
     add_builtin(
@@ -6714,6 +7411,7 @@ for array_type in array_types:
     The operation is only atomic on a per-component basis for vectors and matrices.""",
         group="Utility",
         skip_replay=True,
+        is_differentiable=False,
     )
     add_builtin(
         "atomic_exch",
@@ -6727,32 +7425,193 @@ for array_type in array_types:
     The operation is only atomic on a per-component basis for vectors and matrices.""",
         group="Utility",
         skip_replay=True,
+        is_differentiable=False,
+    )
+    add_builtin(
+        "atomic_exch",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("exch"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically exchange ``value`` with ``arr[i,j,k]`` and return the old value.
+    The operation is only atomic on a per-component basis for vectors and matrices.""",
+        group="Utility",
+        skip_replay=True,
+        is_differentiable=False,
+    )
+    add_builtin(
+        "atomic_exch",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "l": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("exch"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically exchange ``value`` with ``arr[i,j,k,l]`` and return the old value.
+    The operation is only atomic on a per-component basis for vectors and matrices.""",
+        group="Utility",
+        skip_replay=True,
+        is_differentiable=False,  # match the 1D-3D atomic_exch overloads, which are all flagged non-differentiable
+    )
+    add_builtin(
+        "atomic_and",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("and"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically performs a bitwise AND between ``value`` and ``arr[i]``, atomically update the array, and return the old value.
+        This function is automatically invoked when using the syntax ``arr[i] &= value``.""",
+        group="Utility",
+        skip_replay=True,
+        is_differentiable=False,
+    )
+    add_builtin(
+        "atomic_and",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("and"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically performs a bitwise AND between ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.
+        This function is automatically invoked when using the syntax ``arr[i,j] &= value``.""",
+        group="Utility",
+        skip_replay=True,
+        is_differentiable=False,
+    )
+    add_builtin(
+        "atomic_and",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("and"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically performs a bitwise AND between ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.
+        This function is automatically invoked when using the syntax ``arr[i,j,k] &= value``.""",
+        group="Utility",
+        skip_replay=True,
+        is_differentiable=False,
+    )
+    add_builtin(
+        "atomic_and",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "l": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("and"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically performs a bitwise AND between ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.
+        This function is automatically invoked when using the syntax ``arr[i,j,k,l] &= value``.""",
+        group="Utility",
+        skip_replay=True,
+        is_differentiable=False,
+    )
+    add_builtin(
+        "atomic_or",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("or"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically performs a bitwise OR between ``value`` and ``arr[i]``, atomically update the array, and return the old value.
+        This function is automatically invoked when using the syntax ``arr[i] |= value``.""",
+        group="Utility",
+        skip_replay=True,
+        is_differentiable=False,
+    )
+    add_builtin(
+        "atomic_or",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("or"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically performs a bitwise OR between ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.
+        This function is automatically invoked when using the syntax ``arr[i,j] |= value``.""",
+        group="Utility",
+        skip_replay=True,
+        is_differentiable=False,
+    )
+    add_builtin(
+        "atomic_or",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("or"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically performs a bitwise OR between ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.
+        This function is automatically invoked when using the syntax ``arr[i,j,k] |= value``.""",
+        group="Utility",
+        skip_replay=True,
+        is_differentiable=False,
+    )
+    add_builtin(
+        "atomic_or",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "l": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("or"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically performs a bitwise OR between ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.
+        This function is automatically invoked when using the syntax ``arr[i,j,k,l] |= value``.""",
+        group="Utility",
+        skip_replay=True,
+        is_differentiable=False,
+    )
+    add_builtin(
+        "atomic_xor",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("xor"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically performs a bitwise XOR between ``value`` and ``arr[i]``, atomically update the array, and return the old value.
+        This function is automatically invoked when using the syntax ``arr[i] ^= value``.""",
+        group="Utility",
+        skip_replay=True,
+        is_differentiable=False,
+    )
+    add_builtin(
+        "atomic_xor",
+        hidden=hidden,
+        input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "value": Any},
+        constraint=atomic_op_constraint,
+        value_func=create_atomic_op_value_func("xor"),
+        dispatch_func=atomic_op_dispatch_func,
+        doc="""Atomically performs a bitwise XOR between ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.
+        This function is automatically invoked when using the syntax ``arr[i,j] ^= value``.""",
+        group="Utility",
+        skip_replay=True,
+        is_differentiable=False,
     )
     add_builtin(
-        "atomic_exch",
+        "atomic_xor",
         hidden=hidden,
         input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "value": Any},
         constraint=atomic_op_constraint,
-        value_func=create_atomic_op_value_func("exch"),
+        value_func=create_atomic_op_value_func("xor"),
         dispatch_func=atomic_op_dispatch_func,
-        doc="""Atomically exchange ``value`` with ``arr[i,j,k]`` and return the old value.
-    The operation is only atomic on a per-component basis for vectors and matrices.""",
+        doc="""Atomically performs a bitwise XOR between ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.
+        This function is automatically invoked when using the syntax ``arr[i,j,k] ^= value``.""",
         group="Utility",
         skip_replay=True,
+        is_differentiable=False,
     )
     add_builtin(
-        "atomic_exch",
+        "atomic_xor",
         hidden=hidden,
         input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "l": Int, "value": Any},
         constraint=atomic_op_constraint,
-        value_func=create_atomic_op_value_func("exch"),
+        value_func=create_atomic_op_value_func("xor"),
         dispatch_func=atomic_op_dispatch_func,
-        doc="""Atomically exchange ``value`` with ``arr[i,j,k,l]`` and return the old value.
-    The operation is only atomic on a per-component basis for vectors and matrices.""",
+        doc="""Atomically performs a bitwise XOR between ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.
+        This function is automatically invoked when using the syntax ``arr[i,j,k,l] ^= value``.""",
         group="Utility",
         skip_replay=True,
+        is_differentiable=False,
     )
@@ -6903,6 +7762,7 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    is_differentiable=False,
 )
 # implements &quaternion[index]
 add_builtin(
@@ -6913,6 +7773,7 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    is_differentiable=False,
 )
 # implements &transformation[index]
 add_builtin(
@@ -6923,6 +7784,7 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    is_differentiable=False,
 )
 # implements &(*vector)[index]
 add_builtin(
@@ -6933,6 +7795,7 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    is_differentiable=False,
 )
 # implements &(*matrix)[i, j]
 add_builtin(
@@ -6943,6 +7806,7 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    is_differentiable=False,
 )
 # implements &(*quaternion)[index]
 add_builtin(
@@ -6953,6 +7817,7 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    is_differentiable=False,
 )
 # implements &(*transformation)[index]
 add_builtin(
@@ -6963,6 +7828,7 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    is_differentiable=False,
 )
@@ -7158,6 +8024,43 @@ add_builtin(
 )
+# implements vector[idx] &= scalar
+add_builtin(
+    "bit_and_inplace",
+    input_types={"a": vector(length=Any, dtype=Scalar), "i": Any, "value": Any},
+    value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
+    hidden=True,
+    export=False,
+    group="Utility",
+    is_differentiable=False,
+)
+# implements vector[idx] |= scalar
+add_builtin(
+    "bit_or_inplace",
+    input_types={"a": vector(length=Any, dtype=Scalar), "i": Any, "value": Any},
+    value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
+    hidden=True,
+    export=False,
+    group="Utility",
+    is_differentiable=False,
+)
+# implements vector[idx] ^= scalar
+add_builtin(
+    "bit_xor_inplace",
+    input_types={"a": vector(length=Any, dtype=Scalar), "i": Any, "value": Any},
+    value_type=None,
+    dispatch_func=vector_assign_dispatch_func,
+    hidden=True,
+    export=False,
+    group="Utility",
+    is_differentiable=False,
+)
 def matrix_index_row_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
     mat_type = arg_types["a"]
     row_type = mat_type._wp_row_type_
@@ -7173,6 +8076,7 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    is_differentiable=False,
 )
@@ -7191,6 +8095,7 @@ add_builtin(
     hidden=True,
     group="Utility",
     skip_replay=True,
+    is_differentiable=False,
 )
@@ -7390,6 +8295,78 @@ add_builtin(
 )
+# implements matrix[i] &= value
+add_builtin(
+    "bit_and_inplace",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "value": Any},
+    value_type=None,
+    hidden=True,
+    export=False,
+    group="Utility",
+    is_differentiable=False,
+)
+# implements matrix[i,j] &= value
+add_builtin(
+    "bit_and_inplace",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "j": Any, "value": Any},
+    value_type=None,
+    hidden=True,
+    export=False,
+    group="Utility",
+    is_differentiable=False,
+)
+# implements matrix[i] |= value
+add_builtin(
+    "bit_or_inplace",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "value": Any},
+    value_type=None,
+    hidden=True,
+    export=False,
+    group="Utility",
+    is_differentiable=False,
+)
+# implements matrix[i,j] |= value
+add_builtin(
+    "bit_or_inplace",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "j": Any, "value": Any},
+    value_type=None,
+    hidden=True,
+    export=False,
+    group="Utility",
+    is_differentiable=False,
+)
+# implements matrix[i] ^= value
+add_builtin(
+    "bit_xor_inplace",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "value": Any},
+    value_type=None,
+    hidden=True,
+    export=False,
+    group="Utility",
+    is_differentiable=False,
+)
+# implements matrix[i,j] ^= value
+add_builtin(
+    "bit_xor_inplace",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Scalar), "i": Any, "j": Any, "value": Any},
+    value_type=None,
+    hidden=True,
+    export=False,
+    group="Utility",
+    is_differentiable=False,
+)
 for t in scalar_types + vector_types + (bool,):
     if "vec" in t.__name__ or "mat" in t.__name__:
         continue
@@ -7401,6 +8378,7 @@ for t in scalar_types + vector_types + (bool,):
         doc="Prints an error to stdout if ``a`` and ``b`` are not equal",
         group="Utility",
         hidden=True,
+        is_differentiable=False,
     )
     add_builtin(
@@ -7411,6 +8389,7 @@ for t in scalar_types + vector_types + (bool,):
         group="Utility",
         hidden=True,
         export=False,
+        is_differentiable=False,
     )
@@ -7429,6 +8408,7 @@ add_builtin(
     doc="Prints an error to stdout if ``a`` and ``b`` are not equal",
     group="Utility",
     hidden=True,
+    is_differentiable=False,
 )
 add_builtin(
     "expect_neq",
@@ -7439,6 +8419,7 @@ add_builtin(
     group="Utility",
     hidden=True,
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -7449,6 +8430,7 @@ add_builtin(
     doc="Prints an error to stdout if ``a`` and ``b`` are not equal",
     group="Utility",
     hidden=True,
+    is_differentiable=False,
 )
 add_builtin(
     "expect_neq",
@@ -7459,6 +8441,7 @@ add_builtin(
     group="Utility",
     hidden=True,
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -7549,6 +8532,7 @@ add_builtin(
     value_type=None,
     doc="Prints an error to stdout if ``a`` and ``b`` are not closer than tolerance in magnitude",
     group="Utility",
+    is_differentiable=False,
 )
 add_builtin(
     "expect_near",
@@ -7558,6 +8542,7 @@ add_builtin(
     value_type=None,
     doc="Prints an error to stdout if any element of ``a`` and ``b`` are not closer than tolerance in magnitude",
     group="Utility",
+    is_differentiable=False,
 )
 add_builtin(
     "expect_near",
@@ -7567,6 +8552,7 @@ add_builtin(
     value_type=None,
     doc="Prints an error to stdout if any element of ``a`` and ``b`` are not closer than tolerance in magnitude",
     group="Utility",
+    is_differentiable=False,
 )
 add_builtin(
     "expect_near",
@@ -7580,6 +8566,7 @@ add_builtin(
     value_type=None,
     doc="Prints an error to stdout if any element of ``a`` and ``b`` are not closer than tolerance in magnitude",
     group="Utility",
+    is_differentiable=False,
 )
 # ---------------------------------
@@ -7590,6 +8577,7 @@ add_builtin(
     input_types={"arr": array(dtype=Scalar), "value": Scalar},
     value_type=int,
     doc="Search a sorted array ``arr`` for the closest element greater than or equal to ``value``.",
+    is_differentiable=False,
 )
 add_builtin(
@@ -7597,6 +8585,7 @@ add_builtin(
     input_types={"arr": array(dtype=Scalar), "arr_begin": int, "arr_end": int, "value": Scalar},
     value_type=int,
     doc="Search a sorted array ``arr`` in the range [arr_begin, arr_end) for the closest element greater than or equal to ``value``.",
+    is_differentiable=False,
 )
 # ---------------------------------
@@ -7672,12 +8661,157 @@ add_builtin(
 )
 # bitwise operators
-add_builtin("bit_and", input_types={"a": Int, "b": Int}, value_func=sametypes_create_value_func(Int))
-add_builtin("bit_or", input_types={"a": Int, "b": Int}, value_func=sametypes_create_value_func(Int))
-add_builtin("bit_xor", input_types={"a": Int, "b": Int}, value_func=sametypes_create_value_func(Int))
-add_builtin("lshift", input_types={"a": Int, "b": Int}, value_func=sametypes_create_value_func(Int))
-add_builtin("rshift", input_types={"a": Int, "b": Int}, value_func=sametypes_create_value_func(Int))
-add_builtin("invert", input_types={"a": Int}, value_func=sametypes_create_value_func(Int))
+add_builtin(
+    "bit_and",
+    input_types={"a": Int, "b": Int},
+    value_func=sametypes_create_value_func(Int),
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "bit_and",
+    input_types={"a": vector(length=Any, dtype=Int), "b": vector(length=Any, dtype=Int)},
+    constraint=sametypes,
+    value_func=sametypes_create_value_func(vector(length=Any, dtype=Int)),
+    doc="",
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "bit_and",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Int), "b": matrix(shape=(Any, Any), dtype=Int)},
+    constraint=sametypes,
+    value_func=sametypes_create_value_func(matrix(shape=(Any, Any), dtype=Int)),
+    doc="",
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "bit_or",
+    input_types={"a": Int, "b": Int},
+    value_func=sametypes_create_value_func(Int),
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "bit_or",
+    input_types={"a": vector(length=Any, dtype=Int), "b": vector(length=Any, dtype=Int)},
+    constraint=sametypes,
+    value_func=sametypes_create_value_func(vector(length=Any, dtype=Int)),
+    doc="",
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "bit_or",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Int), "b": matrix(shape=(Any, Any), dtype=Int)},
+    constraint=sametypes,
+    value_func=sametypes_create_value_func(matrix(shape=(Any, Any), dtype=Int)),
+    doc="",
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "bit_xor",
+    input_types={"a": Int, "b": Int},
+    value_func=sametypes_create_value_func(Int),
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "bit_xor",
+    input_types={"a": vector(length=Any, dtype=Int), "b": vector(length=Any, dtype=Int)},
+    constraint=sametypes,
+    value_func=sametypes_create_value_func(vector(length=Any, dtype=Int)),
+    doc="",
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "bit_xor",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Int), "b": matrix(shape=(Any, Any), dtype=Int)},
+    constraint=sametypes,
+    value_func=sametypes_create_value_func(matrix(shape=(Any, Any), dtype=Int)),
+    doc="",
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "lshift",
+    input_types={"a": Int, "b": Int},
+    value_func=sametypes_create_value_func(Int),
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "lshift",
+    input_types={"a": vector(length=Any, dtype=Int), "b": vector(length=Any, dtype=Int)},
+    constraint=sametypes,
+    value_func=sametypes_create_value_func(vector(length=Any, dtype=Int)),
+    doc="",
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "lshift",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Int), "b": matrix(shape=(Any, Any), dtype=Int)},
+    constraint=sametypes,
+    value_func=sametypes_create_value_func(matrix(shape=(Any, Any), dtype=Int)),
+    doc="",
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "rshift",
+    input_types={"a": Int, "b": Int},
+    value_func=sametypes_create_value_func(Int),
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "rshift",
+    input_types={"a": vector(length=Any, dtype=Int), "b": vector(length=Any, dtype=Int)},
+    constraint=sametypes,
+    value_func=sametypes_create_value_func(vector(length=Any, dtype=Int)),
+    doc="",
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "rshift",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Int), "b": matrix(shape=(Any, Any), dtype=Int)},
+    constraint=sametypes,
+    value_func=sametypes_create_value_func(matrix(shape=(Any, Any), dtype=Int)),
+    doc="",
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "invert",
+    input_types={"a": Int},
+    value_func=sametypes_create_value_func(Int),
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "invert",
+    input_types={"a": vector(length=Any, dtype=Int)},
+    value_func=sametypes_create_value_func(vector(length=Any, dtype=Int)),
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "invert",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Int)},
+    value_func=sametypes_create_value_func(matrix(shape=(Any, Any), dtype=Int)),
+    group="Operators",
+    is_differentiable=False,
+)
 add_builtin(
@@ -7878,6 +9012,7 @@ add_builtin(
     value_func=sametypes_create_value_func(vector(length=Any, dtype=Scalar)),
     doc="Modulo operation using truncated division.",
     group="Operators",
+    is_differentiable=False,
 )
 add_builtin(
@@ -7937,6 +9072,7 @@ add_builtin(
     value_func=sametypes_create_value_func(Scalar),
     doc="",
     group="Operators",
+    is_differentiable=False,
 )
 add_builtin("pos", input_types={"x": Scalar}, value_func=sametypes_create_value_func(Scalar), group="Operators")
@@ -7984,12 +9120,28 @@ add_builtin(
     group="Operators",
 )
-add_builtin("unot", input_types={"a": builtins.bool}, value_type=builtins.bool, doc="", group="Operators")
+add_builtin(
+    "unot",
+    input_types={"a": builtins.bool},
+    value_type=builtins.bool,
+    doc="",
+    group="Operators",
+    is_differentiable=False,
+)
 for t in int_types:
-    add_builtin("unot", input_types={"a": t}, value_type=builtins.bool, doc="", group="Operators")
+    add_builtin(
+        "unot", input_types={"a": t}, value_type=builtins.bool, doc="", group="Operators", is_differentiable=False
+    )
-add_builtin("unot", input_types={"a": array(dtype=Any)}, value_type=builtins.bool, doc="", group="Operators")
+add_builtin(
+    "unot",
+    input_types={"a": array(dtype=Any)},
+    value_type=builtins.bool,
+    doc="",
+    group="Operators",
+    is_differentiable=False,
+)
 # Tile operators
@@ -8061,6 +9213,45 @@ add_builtin(
     export=False,
 )
+# Tile overloads of bit_and / bit_or / bit_xor. Each takes two tiles ``a`` and ``b``,
+# resolves its return type through tile_binary_map_value_func, and names a
+# ``tile_bit_*`` native function. None of them is exported or differentiable.
+# NOTE(review): the commented-out ``dispatch_func`` / ``variadic`` arguments look like
+# leftovers from a tile_map-style registration -- confirm whether they can be removed.
+add_builtin(
+    "bit_and",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "b": tile(dtype=Any, shape=Tuple[int, ...])},
+    value_func=tile_binary_map_value_func,
+    # dispatch_func=tile_map_dispatch_func,
+    # variadic=True,
+    native_func="tile_bit_and",
+    doc="Bitwise AND each element of two tiles together",
+    group="Tile Primitives",
+    export=False,
+    is_differentiable=False,
+)
+add_builtin(
+    "bit_or",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "b": tile(dtype=Any, shape=Tuple[int, ...])},
+    value_func=tile_binary_map_value_func,
+    # dispatch_func=tile_map_dispatch_func,
+    # variadic=True,
+    native_func="tile_bit_or",
+    doc="Bitwise OR each element of two tiles together",
+    group="Tile Primitives",
+    export=False,
+    is_differentiable=False,
+)
+add_builtin(
+    "bit_xor",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "b": tile(dtype=Any, shape=Tuple[int, ...])},
+    value_func=tile_binary_map_value_func,
+    # dispatch_func=tile_map_dispatch_func,
+    # variadic=True,
+    native_func="tile_bit_xor",
+    doc="Bitwise XOR each element of two tiles together",
+    group="Tile Primitives",
+    export=False,
+    is_differentiable=False,
+)
 add_builtin(
     "mul",
@@ -8122,6 +9313,45 @@ add_builtin(
 )
+add_builtin(
+    # In-place tile overloads of the bitwise operators (this entry and the two registrations
+    # that follow). Each is hidden, returns no value (``value_type=None``), is routed through
+    # tile_inplace_dispatch_func, and names a ``tile_bit_*_inplace`` native function.
+    # NOTE(review): presumably these back ``a &= b`` / ``a |= b`` / ``a ^= b`` on tiles,
+    # mirroring the "# implements ..." comments on the vector and matrix overloads -- confirm.
+    "bit_and_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "b": tile(dtype=Any, shape=Tuple[int, ...])},
+    value_type=None,
+    dispatch_func=tile_inplace_dispatch_func,
+    export=False,
+    hidden=True,
+    native_func="tile_bit_and_inplace",
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "bit_or_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "b": tile(dtype=Any, shape=Tuple[int, ...])},
+    value_type=None,
+    dispatch_func=tile_inplace_dispatch_func,
+    export=False,
+    hidden=True,
+    native_func="tile_bit_or_inplace",
+    group="Operators",
+    is_differentiable=False,
+)
+add_builtin(
+    "bit_xor_inplace",
+    input_types={"a": tile(dtype=Any, shape=Tuple[int, ...]), "b": tile(dtype=Any, shape=Tuple[int, ...])},
+    value_type=None,
+    dispatch_func=tile_inplace_dispatch_func,
+    export=False,
+    hidden=True,
+    native_func="tile_bit_xor_inplace",
+    group="Operators",
+    is_differentiable=False,
+)
 def tile_diag_add_value_func(arg_types, arg_values):
     if arg_types is None:
         return tile(dtype=Any, shape=Tuple[int, int])
@@ -8163,7 +9393,7 @@ def tile_diag_add_lto_dispatch_func(
     return_values: List[Var],
     arg_values: Mapping[str, Var],
     options: Mapping[str, Any],
-    builder: warp.context.ModuleBuilder,
+    builder: warp._src.context.ModuleBuilder,
 ):
     a = arg_values["a"]
     d = arg_values["d"]
@@ -8183,6 +9413,7 @@ add_builtin(
     doc="Add a square matrix and a diagonal matrix 'd' represented as a 1D tile",
     group="Tile Primitives",
     export=False,
+    is_differentiable=False,
 )
@@ -8239,7 +9470,7 @@ def tile_matmul_lto_dispatch_func(
     return_values: List[Var],
     arg_values: Mapping[str, Var],
     options: Mapping[str, Any],
-    builder: warp.context.ModuleBuilder,
+    builder: warp._src.context.ModuleBuilder,
 ):
     a = arg_values["a"]
     b = arg_values["b"]
@@ -8277,7 +9508,7 @@ def tile_matmul_lto_dispatch_func(
     num_threads = options["block_dim"]
     arch = options["output_arch"]
-    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
+    if arch is None or not warp._src.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ((0, 0, 0, a, b, out), template_args, [], 0)
     else:
@@ -8290,7 +9521,7 @@ def tile_matmul_lto_dispatch_func(
         # generate the LTOs
         #    C += A * B
-        (fun_forward, lto_forward) = warp.build.build_lto_dot(
+        (fun_forward, lto_forward) = warp._src.build.build_lto_dot(
             M,
             N,
             K,
@@ -8306,7 +9537,7 @@ def tile_matmul_lto_dispatch_func(
         )
         if warp.config.enable_backward:
             # adjA += adjC * B^T - Transpose ~= flipped layout
-            (fun_backward_A, lto_backward_A) = warp.build.build_lto_dot(
+            (fun_backward_A, lto_backward_A) = warp._src.build.build_lto_dot(
                 M,
                 K,
                 N,
@@ -8321,7 +9552,7 @@ def tile_matmul_lto_dispatch_func(
                 builder,
             )
             # adjB += A^T * adjC - Transpose ~= flipped layout
-            (fun_backward_B, lto_backward_B) = warp.build.build_lto_dot(
+            (fun_backward_B, lto_backward_B) = warp._src.build.build_lto_dot(
                 K,
                 N,
                 M,
@@ -8438,7 +9669,7 @@ def tile_fft_generic_lto_dispatch_func(
     return_values: List[Var],
     arg_values: Mapping[str, Var],
     options: Mapping[str, Any],
-    builder: warp.context.ModuleBuilder,
+    builder: warp._src.context.ModuleBuilder,
     direction: str | None = None,
 ):
     inout = arg_values["inout"]
@@ -8467,12 +9698,12 @@ def tile_fft_generic_lto_dispatch_func(
     arch = options["output_arch"]
     ept = size // num_threads
-    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
+    if arch is None or not warp._src.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ([], [], [], 0)
     else:
         # generate the LTO
-        lto_symbol, lto_code_data, shared_memory_bytes = warp.build.build_lto_fft(
+        lto_symbol, lto_code_data, shared_memory_bytes = warp._src.build.build_lto_fft(
             arch, size, ept, direction, dir, precision, builder
         )
@@ -8510,6 +9741,7 @@ add_builtin(
     group="Tile Primitives",
     export=False,
     namespace="",
+    is_differentiable=False,
 )
 add_builtin(
@@ -8531,6 +9763,7 @@ add_builtin(
     group="Tile Primitives",
     export=False,
     namespace="",
+    is_differentiable=False,
 )
@@ -8575,7 +9808,7 @@ def tile_cholesky_generic_lto_dispatch_func(
     return_values: List[Var],
     arg_values: Mapping[str, Var],
     options: Mapping[str, Any],
-    builder: warp.context.ModuleBuilder,
+    builder: warp._src.context.ModuleBuilder,
 ):
     a = arg_values["A"]
     # force source tile to shared memory
@@ -8595,7 +9828,7 @@ def tile_cholesky_generic_lto_dispatch_func(
     arch = options["output_arch"]
-    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
+    if arch is None or not warp._src.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ((0, a, out), [], [], 0)
     else:
@@ -8610,7 +9843,7 @@ def tile_cholesky_generic_lto_dispatch_func(
         req_smem_bytes = a.type.size * type_size_in_bytes(a.type.dtype)
         # generate the LTO
-        lto_symbol, lto_code_data = warp.build.build_lto_solver(
+        lto_symbol, lto_code_data = warp._src.build.build_lto_solver(
             M,
             N,
             1,
@@ -8655,6 +9888,7 @@ add_builtin(
     group="Tile Primitives",
     export=False,
     namespace="",
+    is_differentiable=False,
 )
@@ -8698,7 +9932,7 @@ def tile_cholesky_solve_generic_lto_dispatch_func(
     return_values: List[Var],
     arg_values: Mapping[str, Var],
     options: Mapping[str, Any],
-    builder: warp.context.ModuleBuilder,
+    builder: warp._src.context.ModuleBuilder,
 ):
     L = arg_values["L"]
     y = arg_values["y"]
@@ -8727,7 +9961,7 @@ def tile_cholesky_solve_generic_lto_dispatch_func(
     arch = options["output_arch"]
-    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
+    if arch is None or not warp._src.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ((0, L, y, x), [], [], 0)
     else:
@@ -8743,7 +9977,7 @@ def tile_cholesky_solve_generic_lto_dispatch_func(
         req_smem_bytes = (x.type.size + y.type.size + L.type.size) * type_size_in_bytes(L.type.dtype)
         # generate the LTO
-        lto_symbol, lto_code_data = warp.build.build_lto_solver(
+        lto_symbol, lto_code_data = warp._src.build.build_lto_solver(
             M,
             N,
             NRHS,
@@ -8785,6 +10019,7 @@ add_builtin(
     group="Tile Primitives",
     export=False,
     namespace="",
+    is_differentiable=False,
 )
@@ -8794,7 +10029,7 @@ def tile_lower_solve_generic_lto_dispatch_func(
     return_values: List[Var],
     arg_values: Mapping[str, Var],
     options: Mapping[str, Any],
-    builder: warp.context.ModuleBuilder,
+    builder: warp._src.context.ModuleBuilder,
 ):
     L = arg_values["L"]
     y = arg_values["y"]
@@ -8823,7 +10058,7 @@ def tile_lower_solve_generic_lto_dispatch_func(
     arch = options["output_arch"]
-    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
+    if arch is None or not warp._src.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ((0, L, y, z), [], [], 0)
     else:
@@ -8839,7 +10074,7 @@ def tile_lower_solve_generic_lto_dispatch_func(
         req_smem_bytes = (z.type.size + y.type.size + L.type.size) * type_size_in_bytes(L.type.dtype)
         # generate the LTO
-        lto_symbol, lto_code_data = warp.build.build_lto_solver(
+        lto_symbol, lto_code_data = warp._src.build.build_lto_solver(
             M,
             N,
             NRHS,
@@ -8917,6 +10152,7 @@ add_builtin(
     group="Tile Primitives",
     export=False,
     namespace="",
+    is_differentiable=False,
 )
@@ -8926,7 +10162,7 @@ def tile_upper_solve_generic_lto_dispatch_func(
     return_values: List[Var],
     arg_values: Mapping[str, Var],
     options: Mapping[str, Any],
-    builder: warp.context.ModuleBuilder,
+    builder: warp._src.context.ModuleBuilder,
 ):
     U = arg_values["U"]
     z = arg_values["z"]
@@ -8955,7 +10191,7 @@ def tile_upper_solve_generic_lto_dispatch_func(
     arch = options["output_arch"]
-    if arch is None or not warp.context.runtime.core.wp_is_mathdx_enabled():
+    if arch is None or not warp._src.context.runtime.core.wp_is_mathdx_enabled():
         # CPU/no-MathDx dispatch
         return ((0, U, z, x), [], [], 0)
     else:
@@ -8971,7 +10207,7 @@ def tile_upper_solve_generic_lto_dispatch_func(
         req_smem_bytes = (x.type.size + z.type.size + U.type.size) * type_size_in_bytes(U.type.dtype)
         # generate the LTO
-        lto_symbol, lto_code_data = warp.build.build_lto_solver(
+        lto_symbol, lto_code_data = warp._src.build.build_lto_solver(
             M,
             N,
             NRHS,
@@ -9049,6 +10285,7 @@ add_builtin(
     group="Tile Primitives",
     export=False,
     namespace="",
+    is_differentiable=False,
 )
@@ -9068,6 +10305,7 @@ add_builtin(
     The return type of the expression must be either a Warp function, a string, or a type that is supported inside Warp kernels and functions
     (excluding Warp arrays since they cannot be created in a Warp kernel at the moment).""",
     group="Code Generation",
+    is_differentiable=False,
 )
@@ -9092,6 +10330,7 @@ add_builtin(
     doc="Return the number of elements in a vector.",
     group="Utility",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -9101,6 +10340,7 @@ add_builtin(
     doc="Return the number of elements in a quaternion.",
     group="Utility",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -9110,6 +10350,7 @@ add_builtin(
     doc="Return the number of rows in a matrix.",
     group="Utility",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -9119,6 +10360,7 @@ add_builtin(
     doc="Return the number of elements in a transformation.",
     group="Utility",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -9128,6 +10370,7 @@ add_builtin(
     doc="Return the size of the first dimension in an array.",
     group="Utility",
     export=False,
+    is_differentiable=False,
 )
 add_builtin(
@@ -9137,6 +10380,33 @@ add_builtin(
     doc="Return the number of rows in a tile.",
     group="Utility",
     export=False,
+    is_differentiable=False,
+)
+def cast_value_func(arg_types, arg_values):
+    """Resolve the return type of the ``cast`` builtin.
+
+    Args:
+        arg_types: Argument types keyed by parameter name, or ``None`` when no
+            concrete call is being resolved (the documentation build case).
+        arg_values: Argument values keyed by parameter name; ``"dtype"`` is read.
+
+    Returns:
+        ``Any`` when ``arg_types`` is ``None``; otherwise ``arg_values["dtype"]``,
+        i.e. the target type supplied by the caller.
+    """
+    # Return generic type for doc builds.
+    if arg_types is None:
+        return Any
+    return arg_values["dtype"]
+def cast_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    """Split the ``cast`` arguments into function and template arguments.
+
+    ``args["a"]`` becomes the only function argument and ``args["dtype"]`` the only
+    template argument. ``input_types`` and ``return_type`` are not used here.
+
+    Returns:
+        A ``(func_args, template_args)`` pair, each a one-element tuple.
+    """
+    func_args = (args["a"],)
+    template_args = (args["dtype"],)
+    return (func_args, template_args)
+# Registers ``cast(a, dtype)``. The return type is whatever ``dtype`` the caller
+# passes (cast_value_func), and ``dtype`` is forwarded as a template argument
+# rather than a regular function argument (cast_dispatch_func).
+add_builtin(
+    "cast",
+    input_types={"a": Any, "dtype": Any},
+    value_func=cast_value_func,
+    dispatch_func=cast_dispatch_func,
+    doc="Reinterpret a value as a different type while preserving its bit pattern.",
+    group="Utility",
+    export=False,
+    is_differentiable=False,
 )
@@ -9163,7 +10433,7 @@ add_builtin(
     doc="Construct a tuple from a list of values",
     group="Utility",
     hidden=True,
-    missing_grad=True,
+    is_differentiable=False,
     export=False,
 )
@@ -9200,7 +10470,7 @@ add_builtin(
     dispatch_func=tuple_extract_dispatch_func,
     group="Utility",
     hidden=True,
-    missing_grad=True,
+    is_differentiable=False,
 )
@@ -9211,6 +10481,7 @@ add_builtin(
     doc="Return the number of elements in a tuple.",
     group="Utility",
     export=False,
+    is_differentiable=False,
 )
 # ---------------------------------
@@ -9229,5 +10500,5 @@ add_builtin(
     export=False,
     group="Utility",
     hidden=True,
-    missing_grad=True,
+    is_differentiable=False,
 )