warp-lang 1.9.1__py3-none-win_amd64.whl → 1.10.0rc2__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic.

Files changed (346)
  1. warp/__init__.py +301 -287
  2. warp/__init__.pyi +794 -305
  3. warp/_src/__init__.py +14 -0
  4. warp/_src/autograd.py +1075 -0
  5. warp/_src/build.py +618 -0
  6. warp/_src/build_dll.py +640 -0
  7. warp/{builtins.py → _src/builtins.py} +1382 -377
  8. warp/_src/codegen.py +4359 -0
  9. warp/{config.py → _src/config.py} +178 -169
  10. warp/_src/constants.py +57 -0
  11. warp/_src/context.py +8294 -0
  12. warp/_src/dlpack.py +462 -0
  13. warp/_src/fabric.py +355 -0
  14. warp/_src/fem/__init__.py +14 -0
  15. warp/_src/fem/adaptivity.py +508 -0
  16. warp/_src/fem/cache.py +687 -0
  17. warp/_src/fem/dirichlet.py +188 -0
  18. warp/{fem → _src/fem}/domain.py +40 -30
  19. warp/_src/fem/field/__init__.py +131 -0
  20. warp/_src/fem/field/field.py +701 -0
  21. warp/{fem → _src/fem}/field/nodal_field.py +30 -15
  22. warp/{fem → _src/fem}/field/restriction.py +1 -1
  23. warp/{fem → _src/fem}/field/virtual.py +53 -27
  24. warp/_src/fem/geometry/__init__.py +32 -0
  25. warp/{fem → _src/fem}/geometry/adaptive_nanogrid.py +77 -163
  26. warp/_src/fem/geometry/closest_point.py +97 -0
  27. warp/{fem → _src/fem}/geometry/deformed_geometry.py +14 -22
  28. warp/{fem → _src/fem}/geometry/element.py +32 -10
  29. warp/{fem → _src/fem}/geometry/geometry.py +48 -20
  30. warp/{fem → _src/fem}/geometry/grid_2d.py +12 -23
  31. warp/{fem → _src/fem}/geometry/grid_3d.py +12 -23
  32. warp/{fem → _src/fem}/geometry/hexmesh.py +40 -63
  33. warp/{fem → _src/fem}/geometry/nanogrid.py +255 -248
  34. warp/{fem → _src/fem}/geometry/partition.py +121 -63
  35. warp/{fem → _src/fem}/geometry/quadmesh.py +26 -45
  36. warp/{fem → _src/fem}/geometry/tetmesh.py +40 -63
  37. warp/{fem → _src/fem}/geometry/trimesh.py +26 -45
  38. warp/{fem → _src/fem}/integrate.py +164 -158
  39. warp/_src/fem/linalg.py +383 -0
  40. warp/_src/fem/operator.py +396 -0
  41. warp/_src/fem/polynomial.py +229 -0
  42. warp/{fem → _src/fem}/quadrature/pic_quadrature.py +15 -20
  43. warp/{fem → _src/fem}/quadrature/quadrature.py +95 -47
  44. warp/_src/fem/space/__init__.py +248 -0
  45. warp/{fem → _src/fem}/space/basis_function_space.py +20 -11
  46. warp/_src/fem/space/basis_space.py +679 -0
  47. warp/{fem → _src/fem}/space/dof_mapper.py +3 -3
  48. warp/{fem → _src/fem}/space/function_space.py +14 -13
  49. warp/{fem → _src/fem}/space/grid_2d_function_space.py +4 -7
  50. warp/{fem → _src/fem}/space/grid_3d_function_space.py +4 -4
  51. warp/{fem → _src/fem}/space/hexmesh_function_space.py +4 -10
  52. warp/{fem → _src/fem}/space/nanogrid_function_space.py +3 -9
  53. warp/{fem → _src/fem}/space/partition.py +117 -60
  54. warp/{fem → _src/fem}/space/quadmesh_function_space.py +4 -10
  55. warp/{fem → _src/fem}/space/restriction.py +66 -33
  56. warp/_src/fem/space/shape/__init__.py +152 -0
  57. warp/{fem → _src/fem}/space/shape/cube_shape_function.py +9 -9
  58. warp/{fem → _src/fem}/space/shape/shape_function.py +8 -9
  59. warp/{fem → _src/fem}/space/shape/square_shape_function.py +6 -6
  60. warp/{fem → _src/fem}/space/shape/tet_shape_function.py +3 -3
  61. warp/{fem → _src/fem}/space/shape/triangle_shape_function.py +3 -3
  62. warp/{fem → _src/fem}/space/tetmesh_function_space.py +3 -9
  63. warp/_src/fem/space/topology.py +459 -0
  64. warp/{fem → _src/fem}/space/trimesh_function_space.py +3 -9
  65. warp/_src/fem/types.py +112 -0
  66. warp/_src/fem/utils.py +486 -0
  67. warp/_src/jax.py +186 -0
  68. warp/_src/jax_experimental/__init__.py +14 -0
  69. warp/_src/jax_experimental/custom_call.py +387 -0
  70. warp/_src/jax_experimental/ffi.py +1284 -0
  71. warp/_src/jax_experimental/xla_ffi.py +656 -0
  72. warp/_src/marching_cubes.py +708 -0
  73. warp/_src/math.py +414 -0
  74. warp/_src/optim/__init__.py +14 -0
  75. warp/_src/optim/adam.py +163 -0
  76. warp/_src/optim/linear.py +1606 -0
  77. warp/_src/optim/sgd.py +112 -0
  78. warp/_src/paddle.py +406 -0
  79. warp/_src/render/__init__.py +14 -0
  80. warp/_src/render/imgui_manager.py +289 -0
  81. warp/_src/render/render_opengl.py +3636 -0
  82. warp/_src/render/render_usd.py +937 -0
  83. warp/_src/render/utils.py +160 -0
  84. warp/_src/sparse.py +2716 -0
  85. warp/_src/tape.py +1206 -0
  86. warp/{thirdparty → _src/thirdparty}/unittest_parallel.py +9 -2
  87. warp/_src/torch.py +391 -0
  88. warp/_src/types.py +5870 -0
  89. warp/_src/utils.py +1693 -0
  90. warp/autograd.py +12 -1054
  91. warp/bin/warp-clang.dll +0 -0
  92. warp/bin/warp.dll +0 -0
  93. warp/build.py +8 -588
  94. warp/build_dll.py +6 -721
  95. warp/codegen.py +6 -4251
  96. warp/constants.py +6 -39
  97. warp/context.py +12 -8062
  98. warp/dlpack.py +6 -444
  99. warp/examples/distributed/example_jacobi_mpi.py +4 -5
  100. warp/examples/fem/example_adaptive_grid.py +1 -1
  101. warp/examples/fem/example_apic_fluid.py +1 -1
  102. warp/examples/fem/example_burgers.py +8 -8
  103. warp/examples/fem/example_diffusion.py +1 -1
  104. warp/examples/fem/example_distortion_energy.py +1 -1
  105. warp/examples/fem/example_mixed_elasticity.py +2 -2
  106. warp/examples/fem/example_navier_stokes.py +1 -1
  107. warp/examples/fem/example_nonconforming_contact.py +7 -7
  108. warp/examples/fem/example_stokes.py +1 -1
  109. warp/examples/fem/example_stokes_transfer.py +1 -1
  110. warp/examples/fem/utils.py +2 -2
  111. warp/examples/interop/example_jax_callable.py +1 -1
  112. warp/examples/interop/example_jax_ffi_callback.py +1 -1
  113. warp/examples/interop/example_jax_kernel.py +1 -1
  114. warp/examples/tile/example_tile_mcgp.py +191 -0
  115. warp/fabric.py +6 -337
  116. warp/fem/__init__.py +159 -97
  117. warp/fem/adaptivity.py +7 -489
  118. warp/fem/cache.py +9 -648
  119. warp/fem/dirichlet.py +6 -184
  120. warp/fem/field/__init__.py +8 -109
  121. warp/fem/field/field.py +7 -652
  122. warp/fem/geometry/__init__.py +7 -18
  123. warp/fem/geometry/closest_point.py +11 -77
  124. warp/fem/linalg.py +18 -366
  125. warp/fem/operator.py +11 -369
  126. warp/fem/polynomial.py +9 -209
  127. warp/fem/space/__init__.py +5 -211
  128. warp/fem/space/basis_space.py +6 -662
  129. warp/fem/space/shape/__init__.py +41 -118
  130. warp/fem/space/topology.py +6 -437
  131. warp/fem/types.py +6 -81
  132. warp/fem/utils.py +11 -444
  133. warp/jax.py +8 -165
  134. warp/jax_experimental/__init__.py +14 -1
  135. warp/jax_experimental/custom_call.py +8 -365
  136. warp/jax_experimental/ffi.py +17 -873
  137. warp/jax_experimental/xla_ffi.py +5 -605
  138. warp/marching_cubes.py +5 -689
  139. warp/math.py +16 -393
  140. warp/native/array.h +385 -37
  141. warp/native/builtin.h +314 -37
  142. warp/native/bvh.cpp +43 -9
  143. warp/native/bvh.cu +62 -27
  144. warp/native/bvh.h +310 -309
  145. warp/native/clang/clang.cpp +102 -97
  146. warp/native/coloring.cpp +0 -1
  147. warp/native/crt.h +208 -0
  148. warp/native/exports.h +156 -0
  149. warp/native/hashgrid.cu +2 -0
  150. warp/native/intersect.h +24 -1
  151. warp/native/intersect_tri.h +44 -35
  152. warp/native/mat.h +1456 -276
  153. warp/native/mesh.cpp +4 -4
  154. warp/native/mesh.cu +4 -2
  155. warp/native/mesh.h +176 -61
  156. warp/native/quat.h +0 -52
  157. warp/native/scan.cu +2 -0
  158. warp/native/sparse.cu +7 -3
  159. warp/native/spatial.h +12 -0
  160. warp/native/tile.h +681 -89
  161. warp/native/tile_radix_sort.h +1 -1
  162. warp/native/tile_reduce.h +394 -46
  163. warp/native/tile_scan.h +4 -4
  164. warp/native/vec.h +469 -0
  165. warp/native/version.h +23 -0
  166. warp/native/volume.cpp +1 -1
  167. warp/native/volume.cu +1 -0
  168. warp/native/volume.h +1 -1
  169. warp/native/volume_builder.cu +2 -0
  170. warp/native/warp.cpp +57 -29
  171. warp/native/warp.cu +253 -171
  172. warp/native/warp.h +11 -8
  173. warp/optim/__init__.py +6 -3
  174. warp/optim/adam.py +6 -145
  175. warp/optim/linear.py +14 -1585
  176. warp/optim/sgd.py +6 -94
  177. warp/paddle.py +6 -388
  178. warp/render/__init__.py +8 -4
  179. warp/render/imgui_manager.py +7 -267
  180. warp/render/render_opengl.py +6 -3618
  181. warp/render/render_usd.py +6 -919
  182. warp/render/utils.py +6 -142
  183. warp/sparse.py +37 -2563
  184. warp/tape.py +6 -1188
  185. warp/tests/__main__.py +1 -1
  186. warp/tests/cuda/test_async.py +4 -4
  187. warp/tests/cuda/test_conditional_captures.py +1 -1
  188. warp/tests/cuda/test_multigpu.py +1 -1
  189. warp/tests/cuda/test_streams.py +58 -1
  190. warp/tests/geometry/test_bvh.py +157 -22
  191. warp/tests/geometry/test_marching_cubes.py +0 -1
  192. warp/tests/geometry/test_mesh.py +5 -3
  193. warp/tests/geometry/test_mesh_query_aabb.py +5 -12
  194. warp/tests/geometry/test_mesh_query_point.py +5 -2
  195. warp/tests/geometry/test_mesh_query_ray.py +15 -3
  196. warp/tests/geometry/test_volume_write.py +5 -5
  197. warp/tests/interop/test_dlpack.py +14 -14
  198. warp/tests/interop/test_jax.py +772 -49
  199. warp/tests/interop/test_paddle.py +1 -1
  200. warp/tests/test_adam.py +0 -1
  201. warp/tests/test_arithmetic.py +9 -9
  202. warp/tests/test_array.py +527 -100
  203. warp/tests/test_array_reduce.py +3 -3
  204. warp/tests/test_atomic.py +12 -8
  205. warp/tests/test_atomic_bitwise.py +209 -0
  206. warp/tests/test_atomic_cas.py +4 -4
  207. warp/tests/test_bool.py +2 -2
  208. warp/tests/test_builtins_resolution.py +5 -571
  209. warp/tests/test_codegen.py +33 -14
  210. warp/tests/test_conditional.py +1 -1
  211. warp/tests/test_context.py +6 -6
  212. warp/tests/test_copy.py +242 -161
  213. warp/tests/test_ctypes.py +3 -3
  214. warp/tests/test_devices.py +24 -2
  215. warp/tests/test_examples.py +16 -84
  216. warp/tests/test_fabricarray.py +35 -35
  217. warp/tests/test_fast_math.py +0 -2
  218. warp/tests/test_fem.py +56 -10
  219. warp/tests/test_fixedarray.py +3 -3
  220. warp/tests/test_func.py +8 -5
  221. warp/tests/test_generics.py +1 -1
  222. warp/tests/test_indexedarray.py +24 -24
  223. warp/tests/test_intersect.py +39 -9
  224. warp/tests/test_large.py +1 -1
  225. warp/tests/test_lerp.py +3 -1
  226. warp/tests/test_linear_solvers.py +1 -1
  227. warp/tests/test_map.py +35 -4
  228. warp/tests/test_mat.py +52 -62
  229. warp/tests/test_mat_constructors.py +4 -5
  230. warp/tests/test_mat_lite.py +1 -1
  231. warp/tests/test_mat_scalar_ops.py +121 -121
  232. warp/tests/test_math.py +34 -0
  233. warp/tests/test_module_aot.py +4 -4
  234. warp/tests/test_modules_lite.py +28 -2
  235. warp/tests/test_print.py +11 -11
  236. warp/tests/test_quat.py +93 -58
  237. warp/tests/test_runlength_encode.py +1 -1
  238. warp/tests/test_scalar_ops.py +38 -10
  239. warp/tests/test_smoothstep.py +1 -1
  240. warp/tests/test_sparse.py +126 -15
  241. warp/tests/test_spatial.py +105 -87
  242. warp/tests/test_special_values.py +6 -6
  243. warp/tests/test_static.py +7 -7
  244. warp/tests/test_struct.py +13 -2
  245. warp/tests/test_triangle_closest_point.py +48 -1
  246. warp/tests/test_types.py +27 -15
  247. warp/tests/test_utils.py +52 -52
  248. warp/tests/test_vec.py +29 -29
  249. warp/tests/test_vec_constructors.py +5 -5
  250. warp/tests/test_vec_scalar_ops.py +97 -97
  251. warp/tests/test_version.py +75 -0
  252. warp/tests/tile/test_tile.py +178 -0
  253. warp/tests/tile/test_tile_atomic_bitwise.py +403 -0
  254. warp/tests/tile/test_tile_cholesky.py +7 -4
  255. warp/tests/tile/test_tile_load.py +26 -2
  256. warp/tests/tile/test_tile_mathdx.py +3 -3
  257. warp/tests/tile/test_tile_matmul.py +1 -1
  258. warp/tests/tile/test_tile_mlp.py +2 -4
  259. warp/tests/tile/test_tile_reduce.py +214 -13
  260. warp/tests/unittest_suites.py +6 -14
  261. warp/tests/unittest_utils.py +10 -9
  262. warp/tests/walkthrough_debug.py +3 -1
  263. warp/torch.py +6 -373
  264. warp/types.py +29 -5764
  265. warp/utils.py +10 -1659
  266. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/METADATA +46 -99
  267. warp_lang-1.10.0rc2.dist-info/RECORD +468 -0
  268. warp_lang-1.10.0rc2.dist-info/licenses/licenses/Gaia-LICENSE.txt +6 -0
  269. warp_lang-1.10.0rc2.dist-info/licenses/licenses/appdirs-LICENSE.txt +22 -0
  270. warp_lang-1.10.0rc2.dist-info/licenses/licenses/asset_pixel_jpg-LICENSE.txt +3 -0
  271. warp_lang-1.10.0rc2.dist-info/licenses/licenses/cuda-LICENSE.txt +1582 -0
  272. warp_lang-1.10.0rc2.dist-info/licenses/licenses/dlpack-LICENSE.txt +201 -0
  273. warp_lang-1.10.0rc2.dist-info/licenses/licenses/fp16-LICENSE.txt +28 -0
  274. warp_lang-1.10.0rc2.dist-info/licenses/licenses/libmathdx-LICENSE.txt +220 -0
  275. warp_lang-1.10.0rc2.dist-info/licenses/licenses/llvm-LICENSE.txt +279 -0
  276. warp_lang-1.10.0rc2.dist-info/licenses/licenses/moller-LICENSE.txt +16 -0
  277. warp_lang-1.10.0rc2.dist-info/licenses/licenses/nanovdb-LICENSE.txt +2 -0
  278. warp_lang-1.10.0rc2.dist-info/licenses/licenses/nvrtc-LICENSE.txt +1592 -0
  279. warp_lang-1.10.0rc2.dist-info/licenses/licenses/svd-LICENSE.txt +23 -0
  280. warp_lang-1.10.0rc2.dist-info/licenses/licenses/unittest_parallel-LICENSE.txt +21 -0
  281. warp_lang-1.10.0rc2.dist-info/licenses/licenses/usd-LICENSE.txt +213 -0
  282. warp_lang-1.10.0rc2.dist-info/licenses/licenses/windingnumber-LICENSE.txt +21 -0
  283. warp/examples/assets/cartpole.urdf +0 -110
  284. warp/examples/assets/crazyflie.usd +0 -0
  285. warp/examples/assets/nv_ant.xml +0 -92
  286. warp/examples/assets/nv_humanoid.xml +0 -183
  287. warp/examples/assets/quadruped.urdf +0 -268
  288. warp/examples/optim/example_bounce.py +0 -266
  289. warp/examples/optim/example_cloth_throw.py +0 -228
  290. warp/examples/optim/example_drone.py +0 -870
  291. warp/examples/optim/example_inverse_kinematics.py +0 -182
  292. warp/examples/optim/example_inverse_kinematics_torch.py +0 -191
  293. warp/examples/optim/example_softbody_properties.py +0 -400
  294. warp/examples/optim/example_spring_cage.py +0 -245
  295. warp/examples/optim/example_trajectory.py +0 -227
  296. warp/examples/sim/example_cartpole.py +0 -143
  297. warp/examples/sim/example_cloth.py +0 -225
  298. warp/examples/sim/example_cloth_self_contact.py +0 -316
  299. warp/examples/sim/example_granular.py +0 -130
  300. warp/examples/sim/example_granular_collision_sdf.py +0 -202
  301. warp/examples/sim/example_jacobian_ik.py +0 -244
  302. warp/examples/sim/example_particle_chain.py +0 -124
  303. warp/examples/sim/example_quadruped.py +0 -203
  304. warp/examples/sim/example_rigid_chain.py +0 -203
  305. warp/examples/sim/example_rigid_contact.py +0 -195
  306. warp/examples/sim/example_rigid_force.py +0 -133
  307. warp/examples/sim/example_rigid_gyroscopic.py +0 -115
  308. warp/examples/sim/example_rigid_soft_contact.py +0 -140
  309. warp/examples/sim/example_soft_body.py +0 -196
  310. warp/examples/tile/example_tile_walker.py +0 -327
  311. warp/sim/__init__.py +0 -74
  312. warp/sim/articulation.py +0 -793
  313. warp/sim/collide.py +0 -2570
  314. warp/sim/graph_coloring.py +0 -307
  315. warp/sim/import_mjcf.py +0 -791
  316. warp/sim/import_snu.py +0 -227
  317. warp/sim/import_urdf.py +0 -579
  318. warp/sim/import_usd.py +0 -898
  319. warp/sim/inertia.py +0 -357
  320. warp/sim/integrator.py +0 -245
  321. warp/sim/integrator_euler.py +0 -2000
  322. warp/sim/integrator_featherstone.py +0 -2101
  323. warp/sim/integrator_vbd.py +0 -2487
  324. warp/sim/integrator_xpbd.py +0 -3295
  325. warp/sim/model.py +0 -4821
  326. warp/sim/particles.py +0 -121
  327. warp/sim/render.py +0 -431
  328. warp/sim/utils.py +0 -431
  329. warp/tests/sim/disabled_kinematics.py +0 -244
  330. warp/tests/sim/test_cloth.py +0 -863
  331. warp/tests/sim/test_collision.py +0 -743
  332. warp/tests/sim/test_coloring.py +0 -347
  333. warp/tests/sim/test_inertia.py +0 -161
  334. warp/tests/sim/test_model.py +0 -226
  335. warp/tests/sim/test_sim_grad.py +0 -287
  336. warp/tests/sim/test_sim_grad_bounce_linear.py +0 -212
  337. warp/tests/sim/test_sim_kinematics.py +0 -98
  338. warp/thirdparty/__init__.py +0 -0
  339. warp_lang-1.9.1.dist-info/RECORD +0 -456
  340. /warp/{fem → _src/fem}/quadrature/__init__.py +0 -0
  341. /warp/{tests/sim → _src/thirdparty}/__init__.py +0 -0
  342. /warp/{thirdparty → _src/thirdparty}/appdirs.py +0 -0
  343. /warp/{thirdparty → _src/thirdparty}/dlpack.py +0 -0
  344. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/WHEEL +0 -0
  345. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/licenses/LICENSE.md +0 -0
  346. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/top_level.txt +0 -0
warp/native/tile.h CHANGED
@@ -43,18 +43,10 @@
  };
  #endif
 
- // only used while building the warp core library
- #ifndef WP_TILE_BLOCK_DIM
- #define WP_TILE_BLOCK_DIM 256
- #endif
-
- #if !defined(__CUDA_ARCH__)
- #define WP_TILE_SHARED static
- #define WP_TILE_SYNC void
-
- #else
- #define WP_TILE_SHARED __shared__
+ #if defined(__CUDA_ARCH__)
  #define WP_TILE_SYNC __syncthreads
+ #else
+ #define WP_TILE_SYNC void
  #endif
 
  #if defined(__CUDA_ARCH__) && !defined(__INTELLISENSE__)
@@ -140,7 +132,6 @@
  [ ] LayerNorm
  [ ] SoftMax
  [ ] GEMM
- [ ] warp.sim (CRBA)
  [ ] Batched MLP
  [ ] Layer norm
  [ ] FNO + Burgers equation
@@ -149,7 +140,6 @@
  [ ] MeshCNN (Modulus, Oliver)
  [ ] BioNemo (Ali)
  [ ] Skinning (David/Or/Vismay)
- [ ] warp.sim (VBD)
  [ ] Error checking
  [ ] Ensure functions passed to tile_map() are compatible with tile type
  [ ] Ensure that args passed to tile ops are compatible
@@ -213,6 +203,12 @@ struct is_same<T, T> {
  static constexpr bool value = true;
  };
 
+ // Helper for dependent static_assert failures
+ template <typename T>
+ struct always_false {
+ static constexpr bool value = false;
+ };
+
 
  template <int N>
  struct tile_coord_t
@@ -338,6 +334,113 @@ template <int... V>
  using tile_stride_t = tile_tuple_t<V...>;
 
 
+ // helper to remove a dimension from a shape (used for axis reductions)
+ template<int Axis, typename Shape>
+ struct tile_shape_remove_dim {
+ static_assert(Axis >= 0 && Axis < Shape::N, "Axis out of bounds for tile_shape_remove_dim");
+ };
+
+ // 1D -> scalar
+ template<int D0>
+ struct tile_shape_remove_dim<0, tile_shape_t<D0>> {
+ using type = tile_shape_t<1>;
+ };
+
+ // 2D -> 1D
+ template<int D0, int D1>
+ struct tile_shape_remove_dim<0, tile_shape_t<D0, D1>> {
+ using type = tile_shape_t<D1>;
+ };
+
+ template<int D0, int D1>
+ struct tile_shape_remove_dim<1, tile_shape_t<D0, D1>> {
+ using type = tile_shape_t<D0>;
+ };
+
+ // 3D -> 2D
+ template<int D0, int D1, int D2>
+ struct tile_shape_remove_dim<0, tile_shape_t<D0, D1, D2>> {
+ using type = tile_shape_t<D1, D2>;
+ };
+
+ template<int D0, int D1, int D2>
+ struct tile_shape_remove_dim<1, tile_shape_t<D0, D1, D2>> {
+ using type = tile_shape_t<D0, D2>;
+ };
+
+ template<int D0, int D1, int D2>
+ struct tile_shape_remove_dim<2, tile_shape_t<D0, D1, D2>> {
+ using type = tile_shape_t<D0, D1>;
+ };
+
+ // 4D -> 3D
+ template<int D0, int D1, int D2, int D3>
+ struct tile_shape_remove_dim<0, tile_shape_t<D0, D1, D2, D3>> {
+ using type = tile_shape_t<D1, D2, D3>;
+ };
+
+ template<int D0, int D1, int D2, int D3>
+ struct tile_shape_remove_dim<1, tile_shape_t<D0, D1, D2, D3>> {
+ using type = tile_shape_t<D0, D2, D3>;
+ };
+
+ template<int D0, int D1, int D2, int D3>
+ struct tile_shape_remove_dim<2, tile_shape_t<D0, D1, D2, D3>> {
+ using type = tile_shape_t<D0, D1, D3>;
+ };
+
+ template<int D0, int D1, int D2, int D3>
+ struct tile_shape_remove_dim<3, tile_shape_t<D0, D1, D2, D3>> {
+ using type = tile_shape_t<D0, D1, D2>;
+ };
+
+
+ // helper to insert an axis value into a coordinate (inverse of removing dimension)
+ // used for mapping output coordinates back to input coordinates during axis reduction
+ template<int Axis, int N>
+ CUDA_CALLABLE constexpr auto tile_coord_insert_axis(const tile_coord_t<N>& coord, int axis_val)
+ {
+ static_assert(Axis >= 0 && Axis <= N, "Axis out of bounds for tile_coord_insert_axis");
+
+ if constexpr (N == 0)
+ {
+ // Scalar -> 1D
+ static_assert(Axis == 0, "Invalid axis for scalar coordinate");
+ return tile_coord(axis_val);
+ }
+ else if constexpr (N == 1)
+ {
+ // 1D -> 2D
+ if constexpr (Axis == 0)
+ return tile_coord(axis_val, coord[0]);
+ else
+ return tile_coord(coord[0], axis_val);
+ }
+ else if constexpr (N == 2)
+ {
+ // 2D -> 3D
+ if constexpr (Axis == 0)
+ return tile_coord(axis_val, coord[0], coord[1]);
+ else if constexpr (Axis == 1)
+ return tile_coord(coord[0], axis_val, coord[1]);
+ else
+ return tile_coord(coord[0], coord[1], axis_val);
+ }
+ else // N == 3
+ {
+ // 3D -> 4D
+ if constexpr (Axis == 0)
+ return tile_coord(axis_val, coord[0], coord[1], coord[2]);
+ else if constexpr (Axis == 1)
+ return tile_coord(coord[0], axis_val, coord[1], coord[2]);
+ else if constexpr (Axis == 2)
+ return tile_coord(coord[0], coord[1], axis_val, coord[2]);
+ else
+ return tile_coord(coord[0], coord[1], coord[2], axis_val);
+ }
+ }
+
+
  // represents a tile stored in global memory with dynamic strides
  // used to represent the source and offset for tile loads to register/shared
  // BoundsCheck: when true (default), validates array access bounds; when false, skips validation for performance
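For illustration, a minimal sketch (not part of the diff) of how the two compile-time helpers above compose for an axis reduction; it assumes it sits inside this header, where tile_shape_t, tile_coord_t, tile_coord, and is_same are already defined, and the helper name below is hypothetical:

// illustration only: drop axis 1 of an 8x4 shape and rebuild an input coordinate
using in_shape_t  = tile_shape_t<8, 4>;
using out_shape_t = typename tile_shape_remove_dim<1, in_shape_t>::type;
static_assert(is_same<out_shape_t, tile_shape_t<8>>::value, "removing axis 1 of 8x4 leaves shape 8");

// map an output coordinate (i) of the reduction back to the input coordinate (i, j)
template <int Axis>
CUDA_CALLABLE constexpr auto input_coord_for_reduction(const tile_coord_t<1>& out_coord, int axis_idx)
{
    return tile_coord_insert_axis<Axis>(out_coord, axis_idx);   // Axis == 1 yields tile_coord(out_coord[0], axis_idx)
}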
@@ -581,7 +684,11 @@ struct tile_register_t
  const int thread = Layout::thread_from_linear(linear);
  const int reg = Layout::register_from_linear(linear);
 
- WP_TILE_SHARED Type scratch;
+ #if defined(__CUDA_ARCH__)
+ __shared__ Type scratch;
+ #else
+ Type scratch;
+ #endif
 
  // ensure any previously scheduled threads have finished reading from scratch
  WP_TILE_SYNC();
@@ -735,43 +842,124 @@ inline CUDA_CALLABLE int tile_align(int num_bytes)
  return sign * ((num_bytes_abs + alignment - 1) / alignment) * alignment;
  }
 
- inline CUDA_CALLABLE void* tile_alloc_shared(int num_bytes, bool init=false, bool check=false)
+ #if defined(WP_ENABLE_TILES_IN_STACK_MEMORY)
+ // On the CPU we use a fixed size block of stack memory for shared tile allocations.
+ // We store a pointer to the current allocation storage either in a reserved register
+ // (AArch64) or a static variable (x86-64).
+ #if !defined(__CUDA_ARCH__)
+ class tile_shared_storage_t;
+ #if defined(__aarch64__)
+ // x28 is is the last callee-saved register on AArch64. This allows us to call externally
+ // compiled functions without worrying about clobbering the pointer.
+ // We pass -target-feature +reserve-x28 to Clang to exclude it from register allocation.
+ register tile_shared_storage_t* shared_tile_storage asm("x28");
+ #else
+ // Ideally this would be thread_local, but LLVM's JIT doesn't support TLS yet
+ // There is also no support for something like -ffixed-r15 either
+ static tile_shared_storage_t* shared_tile_storage;
+ #endif
+ #endif
+ #endif
+
+ // This class manages a block of "shared" memory for use by tiles.
+ // On the GPU this maps to dynamic shared memory, while on the CPU we allocate
+ // a fixed size block of memory on the stack and manage allocations from it.
+ // An instance of this class gets created at the start of a kernel.
+ class tile_shared_storage_t
  {
+ private:
+ #if !defined(__CUDA_ARCH__)
+ #define WP_MAX_CPU_SHARED 256*1024
+ #if defined(WP_ENABLE_TILES_IN_STACK_MEMORY)
+ tile_shared_storage_t* old_value;
+ unsigned int smem_base[WP_TILE_BLOCK_DIM];
+ char dynamic_smem_base[WP_MAX_CPU_SHARED]; // on CPU allocate a fixed 256k block to use for shared allocs
+ #endif
+ #endif
+
  // we maintain a per-thread offset into dynamic
  // shared memory that allows us to keep track of
  // current use across dynamic function calls
- WP_TILE_SHARED int smem_base[WP_TILE_BLOCK_DIM];
+ static inline CUDA_CALLABLE unsigned int* get_smem_base()
+ {
+ #if defined(__CUDA_ARCH__)
+ __shared__ unsigned int smem_base[WP_TILE_BLOCK_DIM];
+ return smem_base;
+ #elif defined(WP_ENABLE_TILES_IN_STACK_MEMORY)
+ return shared_tile_storage->smem_base;
+ #else
+ static unsigned int smem_base[WP_TILE_BLOCK_DIM];
+ return smem_base;
+ #endif
+ }
+
+ static inline CUDA_CALLABLE char* get_dynamic_smem_base()
+ {
+ #if defined(__CUDA_ARCH__)
+ extern __shared__ char dynamic_smem_base[];
+ return dynamic_smem_base;
+ #elif defined(WP_ENABLE_TILES_IN_STACK_MEMORY)
+ return shared_tile_storage->dynamic_smem_base;
+ #else
+ static char dynamic_smem_base[WP_MAX_CPU_SHARED];
+ return dynamic_smem_base;
+ #endif
+ }
 
- if (init)
+ public:
+ // cppcheck-suppress uninitMemberVar
+ inline CUDA_CALLABLE tile_shared_storage_t()
  {
+ #if !defined(__CUDA_ARCH__) && defined(WP_ENABLE_TILES_IN_STACK_MEMORY)
+ // On the CPU save a pointer to this instance in a reserved register
+ // or static variable so it can be accessed from anywhere within a kernel.
+ old_value = shared_tile_storage;
+ shared_tile_storage = this;
+ #endif
+
+ init();
+ }
+
+ inline CUDA_CALLABLE ~tile_shared_storage_t()
+ {
+ check();
+
+ #if !defined(__CUDA_ARCH__) && defined(WP_ENABLE_TILES_IN_STACK_MEMORY)
+ shared_tile_storage = old_value;
+ #endif
+ }
+
+ static inline CUDA_CALLABLE void init()
+ {
+ unsigned int* smem_base = get_smem_base();
+
  smem_base[WP_TILE_THREAD_IDX] = 0;
- return nullptr;
  }
- else if (check)
+
+ static inline CUDA_CALLABLE void check()
  {
+ unsigned int* smem_base = get_smem_base();
+
  assert(smem_base[WP_TILE_THREAD_IDX] == 0);
- return nullptr;
  }
- else
+
+ static inline CUDA_CALLABLE void* alloc(int num_bytes)
  {
- const int offset = smem_base[WP_TILE_THREAD_IDX];
-
+ unsigned int* smem_base = get_smem_base();
+ char* dynamic_smem_base = get_dynamic_smem_base();
+
+ const unsigned int offset = smem_base[WP_TILE_THREAD_IDX];
+
  // one entry per-thread so no need for synchronization
  smem_base[WP_TILE_THREAD_IDX] += tile_align(num_bytes);
- assert(smem_base[WP_TILE_THREAD_IDX] >= 0);
 
- #ifdef __CUDA_ARCH__
- extern __shared__ char dynamic_smem_base[];
- #else
- // on CPU allocate a fixed 256k block to use for shared allocs
- static const int max_cpu_shared = 256*1024;
- static char dynamic_smem_base[max_cpu_shared];
-
- assert(smem_base[WP_TILE_THREAD_IDX] <= max_cpu_shared);
+ #if !defined(__CUDA_ARCH__)
+ assert(smem_base[WP_TILE_THREAD_IDX] <= WP_MAX_CPU_SHARED);
  #endif
+
  return &(dynamic_smem_base[offset]);
  }
- }
+ };
 
 
  template <typename Shape_, typename Stride_= typename compute_strides<Shape_>::Stride>
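A hedged usage sketch of the allocator above (not from the diff; the function name, variable, and sizes are hypothetical), following the pattern the class comment describes: one instance constructed at kernel entry, alloc() calls paired with negative-size releases, and check() running from the destructor:

// illustration only: how a generated kernel scope is expected to drive the allocator
void example_kernel_scope()
{
    tile_shared_storage_t storage;  // ctor publishes the CPU storage pointer (when enabled) and zeroes per-thread offsets

    const int bytes = 64 * int(sizeof(float));
    float* tmp = (float*)tile_shared_storage_t::alloc(bytes);   // reserve space for a 64-element tile
    (void)tmp;                                                  // ... use tmp as tile storage ...
    tile_shared_storage_t::alloc(-bytes);                       // release: a negative size unwinds the per-thread offset
}                                                               // dtor calls check(), asserting the offset returned to zero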
@@ -939,10 +1127,10 @@ struct tile_shared_t
  {
  // update our per-thread shared memory allocator
  if (data.ptr)
- tile_alloc_shared(-Layout::Size*int(sizeof(T)));
+ tile_shared_storage_t::alloc(-Layout::Size*int(sizeof(T)));
 
  if (grad.ptr)
- tile_alloc_shared(-Layout::Size*int(sizeof(T)));
+ tile_shared_storage_t::alloc(-Layout::Size*int(sizeof(T)));
  }
  }
 
@@ -1095,6 +1283,46 @@ struct tile_shared_t
  adj_x -= grad(c);
  }
 
+ // perform AND between a scalar value and a single tile element
+ inline CUDA_CALLABLE void bit_and_inplace(const typename Layout::Coord& c, const Type& x)
+ {
+ // since multiple threads may access the same element
+ // we need to access using atomic operations
+ wp::atomic_and(&data(c), x);
+
+ WP_TILE_SYNC();
+ }
+
+ // backward of inplace scalar AND
+ inline CUDA_CALLABLE void adj_bit_and_inplace(const typename Layout::Coord& c, Type& adj_x) {}
+
+
+ // perform OR between a scalar value and a single tile element
+ inline CUDA_CALLABLE void bit_or_inplace(const typename Layout::Coord& c, const Type& x)
+ {
+ // since multiple threads may access the same element
+ // we need to access using atomic operations
+ wp::atomic_or(&data(c), x);
+
+ WP_TILE_SYNC();
+ }
+
+ // backward of inplace scalar OR
+ inline CUDA_CALLABLE void adj_bit_or_inplace(const typename Layout::Coord& c, Type& adj_x) {}
+
+ // perform XOR between a scalar value and a single tile element
+ inline CUDA_CALLABLE void bit_xor_inplace(const typename Layout::Coord& c, const Type& x)
+ {
+ // since multiple threads may access the same element
+ // we need to access using atomic operations
+ wp::atomic_xor(&data(c), x);
+
+ WP_TILE_SYNC();
+ }
+
+ // backward of inplace scalar XOR
+ inline CUDA_CALLABLE void adj_bit_xor_inplace(const typename Layout::Coord& c, Type& adj_x) {}
+
  // copy register tile to shared
  template <typename Tile>
  inline CUDA_CALLABLE void assign(const Tile& tile)
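A short sketch (not from the diff) of the call pattern for the element-wise bitwise updates above: every thread in the block issues the same update, so the write goes through wp::atomic_* and is followed by a block-wide sync, and the adj_bit_*_inplace counterparts stay empty because integer bitwise operations carry no useful gradient. The wrapper below and its tile argument are hypothetical:

// illustration only: set one bit in element (i, j) of a shared tile of unsigned integers
template <typename SharedTile>
inline CUDA_CALLABLE void set_flag_bit(SharedTile& flags, int i, int j, int bit)
{
    // atomic OR into the element; bit_or_inplace() ends with WP_TILE_SYNC()
    flags.bit_or_inplace(tile_coord(i, j), typename SharedTile::Type(1u << bit));
}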
@@ -1549,7 +1777,11 @@
  {
  // create a temporary shared tile so that
  // we can print it deterministically
- WP_TILE_SHARED T smem[L::Size];
+ #if defined(__CUDA_ARCH__)
+ __shared__ T smem[L::Size];
+ #else
+ T smem[L::Size];
+ #endif
  tile_shared_t<T, tile_layout_strided_t<typename L::Shape>, false> scratch(smem, nullptr);
 
  scratch.assign(*this);
@@ -1609,37 +1841,6 @@ inline CUDA_CALLABLE void adj_len(const tile_register_t<T,L>& t, const AdjTile&
  {
  }
 
- // select specialization for shared tiles
- template <typename C, typename T, typename LRegister, typename LShared, bool Owner>
- inline CUDA_CALLABLE auto select(const C& cond, const tile_register_t<T, LRegister>& a, const tile_shared_t<T, LShared, Owner>& b)
- {
- // The double NOT operator !! casts to bool without compiler warnings.
- return (!!cond) ? b.copy_to_register() : a;
- }
-
- template <typename C, typename T, typename LRegister, typename LShared, bool Owner>
- inline CUDA_CALLABLE auto select(const C& cond, const tile_shared_t<T, LShared, Owner>& a, const tile_register_t<T, LRegister>& b)
- {
- // The double NOT operator !! casts to bool without compiler warnings.
- return (!!cond) ? b : a.copy_to_register();
- }
-
- template <typename C, typename T, typename L, bool Owner>
- inline CUDA_CALLABLE auto select(const C& cond, const tile_shared_t<T, L, Owner>& a, const tile_shared_t<T, L, Owner>& b)
- {
- // The double NOT operator !! casts to bool without compiler warnings.
- return (!!cond) ? tile_shared_t<T, L, false>(b.data.ptr, b.grad.ptr) : tile_shared_t<T, L, false>(a.data.ptr, a.grad.ptr);
- }
-
- template <typename C, typename T, typename L, bool LOwner, bool ROwner>
- inline CUDA_CALLABLE auto select(const C& cond, const tile_shared_t<T, L, LOwner>& a, const tile_shared_t<T, L, ROwner>& b)
- {
- // The double NOT operator !! casts to bool without compiler warnings.
- return (!!cond) ? tile_shared_t<T, L, false>(b.data.ptr, b.grad.ptr) : tile_shared_t<T, L, false>(a.data.ptr, a.grad.ptr);
- }
-
- // adj_select same as in builtin.h
-
  // where specialization for register/shared tiles
  template <typename C, typename T, typename LRegister, typename LShared, bool Owner>
  inline CUDA_CALLABLE auto where(const C& cond, const tile_register_t<T, LRegister>& a, const tile_shared_t<T, LShared, Owner>& b)
@@ -1690,7 +1891,7 @@
  inline CUDA_CALLABLE auto tile_alloc_empty()
  {
  constexpr int size = Shape::size();
- T* data = (T*)tile_alloc_shared(size*sizeof(T));
+ T* data = (T*)tile_shared_storage_t::alloc(size*sizeof(T));
  T* grad = nullptr;
 
  #if FP_CHECK
@@ -1709,7 +1910,7 @@ inline CUDA_CALLABLE auto tile_alloc_empty()
 
  if (RequiresGrad)
  {
- grad = (T*)tile_alloc_shared(size*sizeof(T));
+ grad = (T*)tile_shared_storage_t::alloc(size*sizeof(T));
 
  for (int i=WP_TILE_THREAD_IDX; i < size; i+= WP_TILE_BLOCK_DIM)
  grad[i] = T(0);
@@ -1887,6 +2088,14 @@ inline CUDA_CALLABLE auto tile_ones()
  return T(1);
  }
 
+ // value-initialized tile
+ template <typename T, unsigned... Shape>
+ inline CUDA_CALLABLE auto tile_full(T x)
+ {
+ // tile variable assignment operator will handle initialization (since lhs could be shared/register tile)
+ return x;
+ }
+
  // tile with evenly spaced values
  template <typename T, int Len>
  inline CUDA_CALLABLE auto tile_arange(T start, T stop, T step)
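Like tile_zeros() and tile_ones() above, tile_full() only produces the fill value; broadcasting it across the destination is left to the tile assignment operator in the generated code. A hedged sketch of that pattern (the wrapper and tile variable below are hypothetical):

// illustration only: the generated kernel assigns the returned scalar to a tile variable,
// and the tile's assignment operator replicates it across all elements
template <typename TileVar>
inline CUDA_CALLABLE void fill_with_three(TileVar& var_0)
{
    var_0 = tile_full<float, 16, 8>(3.0f);   // returns 3.0f; the assignment broadcasts it into the 16x8 tile
}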
@@ -2438,6 +2647,43 @@ inline CUDA_CALLABLE void adj_tile_mul(const typename Tile::Type& s, Tile& a,
  }
 
 
+ // tile & tile
+ template <typename TileA, typename TileB>
+ inline CUDA_CALLABLE auto tile_bit_and(TileA& a, TileB& b)
+ {
+ return tile_binary_map(bit_and, a, b, a);
+ }
+
+ template <typename TileA, typename TileB, typename AdjTileA, typename AdjTileB, typename AdjTile>
+ inline CUDA_CALLABLE void adj_tile_bit_and(TileA& a, TileB& b, AdjTileA& adj_a, AdjTileB& adj_b, AdjTile& adj_c)
+ {
+ }
+
+ // tile | tile
+ template <typename TileA, typename TileB>
+ inline CUDA_CALLABLE auto tile_bit_or(TileA& a, TileB& b)
+ {
+ return tile_binary_map(bit_or, a, b, a);
+ }
+
+ template <typename TileA, typename TileB, typename AdjTileA, typename AdjTileB, typename AdjTile>
+ inline CUDA_CALLABLE void adj_tile_bit_or(TileA& a, TileB& b, AdjTileA& adj_a, AdjTileB& adj_b, AdjTile& adj_c)
+ {
+ }
+
+ // tile ^ tile
+ template <typename TileA, typename TileB>
+ inline CUDA_CALLABLE auto tile_bit_xor(TileA& a, TileB& b)
+ {
+ return tile_binary_map(bit_xor, a, b, a);
+ }
+
+ template <typename TileA, typename TileB, typename AdjTileA, typename AdjTileB, typename AdjTile>
+ inline CUDA_CALLABLE void adj_tile_bit_xor(TileA& a, TileB& b, AdjTileA& adj_a, AdjTileB& adj_b, AdjTile& adj_c)
+ {
+ }
+
+
  template <typename TileA, typename TileB>
  inline CUDA_CALLABLE void tile_add_inplace(TileA& a, TileB& b)
  {
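The three wrappers above build on tile_binary_map with the scalar bit_and / bit_or / bit_xor helpers, passing a again presumably so the result takes its shape and element type from the first operand; the adjoints are intentionally empty because bitwise operations are not differentiable. A hedged usage sketch (the function and tile names are hypothetical):

// illustration only: element-wise combinations of two integer tiles
template <typename TileA, typename TileB>
inline CUDA_CALLABLE auto combine_masks(TileA& values, TileB& mask)
{
    auto kept    = tile_bit_and(values, mask);   // values & mask
    auto flipped = tile_bit_xor(values, mask);   // values ^ mask
    return tile_bit_or(kept, flipped);           // (values & mask) | (values ^ mask)
}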
@@ -2557,24 +2803,227 @@ inline CUDA_CALLABLE void adj_tile_sub_inplace(TileA& a, TileB& b, AdjTileA& adj
  adj_b.grad_add(adj_b_reg);
  }
 
+ template <typename TileA, typename TileB>
+ inline CUDA_CALLABLE void tile_bit_and_inplace(TileA& a, TileB& b)
+ {
+ using ShapeA = typename TileA::Layout::Shape;
+ using ShapeB = typename TileB::Layout::Shape;
+
+ // verify shapes and sizes are compatible
+ static_assert(ShapeA::N == ShapeB::N, "Tile shapes must match for inplace bitwise AND");
+ static_assert(ShapeA::size() == ShapeB::size(), "Tile sizes must match for inplace bitwise AND");
+
+ // work with register tiles for inplace operations, regardless of the storage type of the input tiles
+ auto a_reg = a.copy_to_register();
+ auto b_reg = b.copy_to_register();
+
+ using Layout = typename decltype(a_reg)::Layout;
+
+ WP_PRAGMA_UNROLL
+ for (int i=0; i < Layout::NumRegs; ++i)
+ {
+ const int linear = Layout::linear_from_register(i);
+
+ if(!Layout::valid(linear))
+ break;
+
+ a_reg.data[i] &= b_reg.data[i];
+ }
+
+ a.assign(a_reg);
+ }
+
+ template <typename TileA, typename TileB, typename AdjTileA, typename AdjTileB>
+ inline CUDA_CALLABLE void adj_tile_bit_and_inplace(TileA& a, TileB& b, AdjTileA& adj_a, AdjTileB& adj_b) {}
+
+ template <typename TileA, typename TileB>
+ inline CUDA_CALLABLE void tile_bit_or_inplace(TileA& a, TileB& b)
+ {
+ using ShapeA = typename TileA::Layout::Shape;
+ using ShapeB = typename TileB::Layout::Shape;
+
+ // verify shapes and sizes are compatible
+ static_assert(ShapeA::N == ShapeB::N, "Tile shapes must match for inplace bitwise OR");
+ static_assert(ShapeA::size() == ShapeB::size(), "Tile sizes must match for inplace bitwise OR");
+
+ // work with register tiles for inplace operations, regardless of the storage type of the input tiles
+ auto a_reg = a.copy_to_register();
+ auto b_reg = b.copy_to_register();
+
+ using Layout = typename decltype(a_reg)::Layout;
+
+ WP_PRAGMA_UNROLL
+ for (int i=0; i < Layout::NumRegs; ++i)
+ {
+ const int linear = Layout::linear_from_register(i);
+
+ if(!Layout::valid(linear))
+ break;
+
+ a_reg.data[i] |= b_reg.data[i];
+ }
+
+ a.assign(a_reg);
+ }
+
+ template <typename TileA, typename TileB, typename AdjTileA, typename AdjTileB>
+ inline CUDA_CALLABLE void adj_tile_bit_or_inplace(TileA& a, TileB& b, AdjTileA& adj_a, AdjTileB& adj_b) {}
+
+ template <typename TileA, typename TileB>
+ inline CUDA_CALLABLE void tile_bit_xor_inplace(TileA& a, TileB& b)
+ {
+ using ShapeA = typename TileA::Layout::Shape;
+ using ShapeB = typename TileB::Layout::Shape;
+
+ // verify shapes and sizes are compatible
+ static_assert(ShapeA::N == ShapeB::N, "Tile shapes must match for inplace bitwise XOR");
+ static_assert(ShapeA::size() == ShapeB::size(), "Tile sizes must match for inplace bitwise XOR");
+
+ // work with register tiles for inplace operations, regardless of the storage type of the input tiles
+ auto a_reg = a.copy_to_register();
+ auto b_reg = b.copy_to_register();
+
+ using Layout = typename decltype(a_reg)::Layout;
+
+ WP_PRAGMA_UNROLL
+ for (int i=0; i < Layout::NumRegs; ++i)
+ {
+ const int linear = Layout::linear_from_register(i);
+
+ if(!Layout::valid(linear))
+ break;
+
+ a_reg.data[i] ^= b_reg.data[i];
+ }
+
+ a.assign(a_reg);
+ }
+
+ template <typename TileA, typename TileB, typename AdjTileA, typename AdjTileB>
+ inline CUDA_CALLABLE void adj_tile_bit_xor_inplace(TileA& a, TileB& b, AdjTileA& adj_a, AdjTileB& adj_b) {}
+
 
  template<typename Tile>
- typename Tile::Type tile_extract(Tile& t, int i) { return t.extract(tile_coord(i)); }
+ typename Tile::Type tile_extract(Tile& t, int i) {
+ return t.extract(tile_coord(i));
+ }
  template<typename Tile>
- typename Tile::Type tile_extract(Tile& t, int i, int j) { return t.extract(tile_coord(i,j)); }
+ auto tile_extract(Tile& t, int i, int j) {
+ if constexpr(is_vector<typename Tile::Type>::value) {
+ return t.extract(tile_coord(i))[j];
+ } else {
+ return t.extract(tile_coord(i,j));
+ }
+ }
  template<typename Tile>
- typename Tile::Type tile_extract(Tile& t, int i, int j, int k) { return t.extract(tile_coord(i,j,k)); }
+ auto tile_extract(Tile& t, int i, int j, int k) {
+ if constexpr(is_vector<typename Tile::Type>::value) {
+ return t.extract(tile_coord(i,j))[k];
+ } else if constexpr(is_matrix<typename Tile::Type>::value) {
+ return t.extract(tile_coord(i)).data[j][k];
+ } else {
+ return t.extract(tile_coord(i,j,k));
+ }
+ }
  template<typename Tile>
- typename Tile::Type tile_extract(Tile& t, int i, int j, int k, int l) { return t.extract(tile_coord(i,j,k,l)); }
+ auto tile_extract(Tile& t, int i, int j, int k, int l) {
+ if constexpr(is_vector<typename Tile::Type>::value) {
+ return t.extract(tile_coord(i,j,k))[l];
+ } else if constexpr(is_matrix<typename Tile::Type>::value) {
+ return t.extract(tile_coord(i,j)).data[k][l];
+ } else {
+ return t.extract(tile_coord(i,j,k,l));
+ }
+ }
+ template<typename Tile>
+ auto tile_extract(Tile& t, int i, int j, int k, int l, int m) {
+ if constexpr(is_vector<typename Tile::Type>::value) {
+ return t.extract(tile_coord(i,j,k,l))[m];
+ } else if constexpr(is_matrix<typename Tile::Type>::value) {
+ return t.extract(tile_coord(i,j,k)).data[l][m];
+ } else {
+ static_assert(always_false<Tile>::value,
+ "tile_extract with 5 indices requires a tile of vectors (4D tile) or matrices (3D tile)");
+ }
+ }
+ template<typename Tile>
+ auto tile_extract(Tile& t, int i, int j, int k, int l, int m, int n) {
+ if constexpr(is_matrix<typename Tile::Type>::value) {
+ return t.extract(tile_coord(i,j,k,l)).data[m][n];
+ } else {
+ static_assert(always_false<Tile>::value,
+ "tile_extract with 6 indices requires a tile of matrices (4D tile)");
+ }
+ }
 
  template<typename Tile, typename AdjTile>
- void adj_tile_extract(Tile& t, int i, AdjTile& adj_t, int adj_i, typename Tile::Type adj_ret) { adj_t.adj_extract(tile_coord(i), adj_ret); }
- template<typename Tile, typename AdjTile>
- void adj_tile_extract(Tile& t, int i, int j, AdjTile& adj_t, int adj_i, int adj_j, typename Tile::Type adj_ret) { adj_t.adj_extract(tile_coord(i, j), adj_ret); }
- template<typename Tile, typename AdjTile>
- void adj_tile_extract(Tile& t, int i, int j, int k, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, typename Tile::Type adj_ret) { adj_t.adj_extract(tile_coord(i, j, k), adj_ret); }
- template<typename Tile, typename AdjTile>
- void adj_tile_extract(Tile& t, int i, int j, int k, int l, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, int adj_l, typename Tile::Type adj_ret) { adj_t.adj_extract(tile_coord(i, j, k, l), adj_ret); }
+ void adj_tile_extract(Tile& t, int i, AdjTile& adj_t, int adj_i, typename Tile::Type adj_ret) {
+ adj_t.adj_extract(tile_coord(i), adj_ret);
+ }
+ template<typename Tile, typename AdjTile, typename AdjType>
+ void adj_tile_extract(Tile& t, int i, int j, AdjTile& adj_t, int adj_i, int adj_j, AdjType adj_ret) {
+ if constexpr(is_vector<typename Tile::Type>::value) {
+ typename Tile::Type vector_adj{};
+ vector_adj[j] = adj_ret;
+ adj_t.adj_extract(tile_coord(i), vector_adj);
+ } else {
+ adj_t.adj_extract(tile_coord(i, j), adj_ret);
+ }
+ }
+ template<typename Tile, typename AdjTile, typename AdjType>
+ void adj_tile_extract(Tile& t, int i, int j, int k, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, AdjType adj_ret) {
+ if constexpr(is_vector<typename Tile::Type>::value) {
+ typename Tile::Type vector_adj{};
+ vector_adj[k] = adj_ret;
+ adj_t.adj_extract(tile_coord(i, j), vector_adj);
+ } else if constexpr(is_matrix<typename Tile::Type>::value) {
+ typename Tile::Type matrix_adj{};
+ matrix_adj.data[j][k] = adj_ret;
+ adj_t.adj_extract(tile_coord(i), matrix_adj);
+ } else {
+ adj_t.adj_extract(tile_coord(i, j, k), adj_ret);
+ }
+ }
+ template<typename Tile, typename AdjTile, typename AdjType>
+ void adj_tile_extract(Tile& t, int i, int j, int k, int l, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, int adj_l, AdjType adj_ret) {
+ if constexpr(is_vector<typename Tile::Type>::value) {
+ typename Tile::Type vector_adj{};
+ vector_adj[l] = adj_ret;
+ adj_t.adj_extract(tile_coord(i, j, k), vector_adj);
+ } else if constexpr(is_matrix<typename Tile::Type>::value) {
+ typename Tile::Type matrix_adj{};
+ matrix_adj.data[k][l] = adj_ret;
+ adj_t.adj_extract(tile_coord(i, j), matrix_adj);
+ } else {
+ adj_t.adj_extract(tile_coord(i, j, k, l), adj_ret);
+ }
+ }
+ template<typename Tile, typename AdjTile, typename AdjType>
+ void adj_tile_extract(Tile& t, int i, int j, int k, int l, int m, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, int adj_l, int adj_m, AdjType adj_ret) {
+ if constexpr(is_vector<typename Tile::Type>::value) {
+ typename Tile::Type vector_adj{};
+ vector_adj[m] = adj_ret;
+ adj_t.adj_extract(tile_coord(i, j, k, l), vector_adj);
+ } else if constexpr(is_matrix<typename Tile::Type>::value) {
+ typename Tile::Type matrix_adj{};
+ matrix_adj.data[l][m] = adj_ret;
+ adj_t.adj_extract(tile_coord(i, j, k), matrix_adj);
+ } else {
+ static_assert(always_false<Tile>::value,
+ "adj_tile_extract with 5 indices requires a tile of vectors (4D tile) or matrices (3D tile)");
+ }
+ }
+ template<typename Tile, typename AdjTile, typename AdjType>
+ void adj_tile_extract(Tile& t, int i, int j, int k, int l, int m, int n, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, int adj_l, int adj_m, int adj_n, AdjType adj_ret) {
+ if constexpr(is_matrix<typename Tile::Type>::value) {
+ typename Tile::Type matrix_adj{};
+ matrix_adj.data[m][n] = adj_ret;
+ adj_t.adj_extract(tile_coord(i, j, k, l), matrix_adj);
+ } else {
+ static_assert(always_false<Tile>::value,
+ "adj_tile_extract with 6 indices requires a tile of matrices (4D tile)");
+ }
+ }
 
 
  template<typename Tile>
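With the overloads above, trailing indices beyond the tile's rank now address components inside vector- or matrix-valued elements, and the matching adj_tile_extract variants scatter the incoming adjoint into a zero-initialized vector or matrix before forwarding it to adj_extract. A minimal sketch (not from the diff; a 2D tile of 3-component vectors is assumed, and the wrapper name is hypothetical):

// illustration only: read component k of the vector stored at tile element (i, j)
template <typename Vec3Tile>
inline CUDA_CALLABLE auto vector_component(Vec3Tile& t, int i, int j, int k)
{
    // Tile::Type is a vector type here, so the third index selects a component
    return tile_extract(t, i, j, k);
}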
@@ -2595,6 +3044,33 @@ void tile_sub_inplace(Tile& t, int i, int j, int k, typename Tile::Type value) {
  template<typename Tile>
  void tile_sub_inplace(Tile& t, int i, int j, int k, int l, typename Tile::Type value) { t.sub_inplace(tile_coord(i,j,k,l), value); }
 
+ template<typename Tile>
+ void tile_bit_and_inplace(Tile& t, int i, typename Tile::Type value) { t.bit_and_inplace(tile_coord(i), value); }
+ template<typename Tile>
+ void tile_bit_and_inplace(Tile& t, int i, int j, typename Tile::Type value) { t.bit_and_inplace(tile_coord(i,j), value); }
+ template<typename Tile>
+ void tile_bit_and_inplace(Tile& t, int i, int j, int k, typename Tile::Type value) { t.bit_and_inplace(tile_coord(i,j,k), value); }
+ template<typename Tile>
+ void tile_bit_and_inplace(Tile& t, int i, int j, int k, int l, typename Tile::Type value) { t.bit_and_inplace(tile_coord(i,j,k,l), value); }
+
+ template<typename Tile>
+ void tile_bit_or_inplace(Tile& t, int i, typename Tile::Type value) { t.bit_or_inplace(tile_coord(i), value); }
+ template<typename Tile>
+ void tile_bit_or_inplace(Tile& t, int i, int j, typename Tile::Type value) { t.bit_or_inplace(tile_coord(i,j), value); }
+ template<typename Tile>
+ void tile_bit_or_inplace(Tile& t, int i, int j, int k, typename Tile::Type value) { t.bit_or_inplace(tile_coord(i,j,k), value); }
+ template<typename Tile>
+ void tile_bit_or_inplace(Tile& t, int i, int j, int k, int l, typename Tile::Type value) { t.bit_or_inplace(tile_coord(i,j,k,l), value); }
+
+ template<typename Tile>
+ void tile_bit_xor_inplace(Tile& t, int i, typename Tile::Type value) { t.bit_xor_inplace(tile_coord(i), value); }
+ template<typename Tile>
+ void tile_bit_xor_inplace(Tile& t, int i, int j, typename Tile::Type value) { t.bit_xor_inplace(tile_coord(i,j), value); }
+ template<typename Tile>
+ void tile_bit_xor_inplace(Tile& t, int i, int j, int k, typename Tile::Type value) { t.bit_xor_inplace(tile_coord(i,j,k), value); }
+ template<typename Tile>
+ void tile_bit_xor_inplace(Tile& t, int i, int j, int k, int l, typename Tile::Type value) { t.bit_xor_inplace(tile_coord(i,j,k,l), value); }
+
  template<typename Tile, typename AdjTile>
  void adj_tile_add_inplace(Tile& t, int i, typename Tile::Type value, AdjTile& adj_t, int adj_i, typename Tile::Type& adj_value) { adj_t.adj_add_inplace(tile_coord(i), adj_value); }
  template<typename Tile, typename AdjTile>
@@ -2613,6 +3089,33 @@ void adj_tile_sub_inplace(Tile& t, int i, int j, int k, typename Tile::Type valu
  template<typename Tile, typename AdjTile>
  void adj_tile_sub_inplace(Tile& t, int i, int j, int k, int l, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, int adj_l, typename Tile::Type& adj_value) { adj_t.adj_sub_inplace(tile_coord(i, j, k, l), adj_value); }
 
+ template<typename Tile, typename AdjTile>
+ void adj_tile_bit_and_inplace(Tile& t, int i, typename Tile::Type value, AdjTile& adj_t, int adj_i, typename Tile::Type& adj_value) {}
+ template<typename Tile, typename AdjTile>
+ void adj_tile_bit_and_inplace(Tile& t, int i, int j, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, typename Tile::Type& adj_value) {}
+ template<typename Tile, typename AdjTile>
+ void adj_tile_bit_and_inplace(Tile& t, int i, int j, int k, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, typename Tile::Type& adj_value) {}
+ template<typename Tile, typename AdjTile>
+ void adj_tile_bit_and_inplace(Tile& t, int i, int j, int k, int l, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, int adj_l, typename Tile::Type& adj_value) {}
+
+ template<typename Tile, typename AdjTile>
+ void adj_tile_bit_or_inplace(Tile& t, int i, typename Tile::Type value, AdjTile& adj_t, int adj_i, typename Tile::Type& adj_value) {}
+ template<typename Tile, typename AdjTile>
+ void adj_tile_bit_or_inplace(Tile& t, int i, int j, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, typename Tile::Type& adj_value) {}
+ template<typename Tile, typename AdjTile>
+ void adj_tile_bit_or_inplace(Tile& t, int i, int j, int k, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, typename Tile::Type& adj_value) {}
+ template<typename Tile, typename AdjTile>
+ void adj_tile_bit_or_inplace(Tile& t, int i, int j, int k, int l, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, int adj_l, typename Tile::Type& adj_value) {}
+
+ template<typename Tile, typename AdjTile>
+ void adj_tile_bit_xor_inplace(Tile& t, int i, typename Tile::Type value, AdjTile& adj_t, int adj_i, typename Tile::Type& adj_value) {}
+ template<typename Tile, typename AdjTile>
+ void adj_tile_bit_xor_inplace(Tile& t, int i, int j, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, typename Tile::Type& adj_value) {}
+ template<typename Tile, typename AdjTile>
+ void adj_tile_bit_xor_inplace(Tile& t, int i, int j, int k, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, typename Tile::Type& adj_value) {}
+ template<typename Tile, typename AdjTile>
+ void adj_tile_bit_xor_inplace(Tile& t, int i, int j, int k, int l, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, int adj_l, typename Tile::Type& adj_value) {}
+
  namespace partitioned_gemm
  {
 
@@ -3000,7 +3503,7 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B,
  #define tile_fft(function_name, dtype, shared_memory_size, batch_size, ept, Xinout) \
  do { \
  void function_name(dtype*, char*); \
- char* buffer = (char*)wp::tile_alloc_shared(shared_memory_size); \
+ char* buffer = (char*)wp::tile_shared_storage_t::alloc(shared_memory_size); \
  __align__(16) dtype data[ept]; \
  for(int b = 0; b < (int)batch_size; b++) { \
  dtype* inout = Xinout.data + (int)b * (int)ept; \
@@ -3009,7 +3512,7 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B,
  memcpy(inout, data, sizeof(dtype) * ept); \
  WP_TILE_SYNC(); \
  } \
- wp::tile_alloc_shared(-shared_memory_size); \
+ wp::tile_shared_storage_t::alloc(-shared_memory_size); \
  } while (0)
 
  #define tile_ifft tile_fft
@@ -3053,7 +3556,7 @@ TileL& tile_cholesky(Fwd fun_forward, TileA& A, TileL& L)
  #else
 
  // TODO: for batched Cholesky, need one info per batch
- WP_TILE_SHARED int info[1];
+ __shared__ int info[1];
 
  if (WP_TILE_THREAD_IDX == 0) {
  info[0] = 0;
@@ -3385,21 +3888,62 @@ inline CUDA_CALLABLE void assign(TileA& dest, int i, const Scalar& src)
  template <typename TileA, typename Scalar>
  inline CUDA_CALLABLE void assign(TileA& dest, int i, int j, const Scalar& src)
  {
- dest.data(tile_coord(i, j)) = src;
+ if constexpr(is_vector<typename TileA::Type>::value) {
+ dest.data(tile_coord(i))[j] = src;
+ } else {
+ dest.data(tile_coord(i, j)) = src;
+ }
  WP_TILE_SYNC();
  }
  template <typename TileA, typename Scalar>
  inline CUDA_CALLABLE void assign(TileA& dest, int i, int j, int k, const Scalar& src)
  {
- dest.data(tile_coord(i, j, k)) = src;
+ if constexpr(is_vector<typename TileA::Type>::value) {
+ dest.data(tile_coord(i, j))[k] = src;
+ } else if constexpr(is_matrix<typename TileA::Type>::value) {
+ dest.data(tile_coord(i)).data[j][k] = src;
+ } else {
+ dest.data(tile_coord(i, j, k)) = src;
+ }
  WP_TILE_SYNC();
  }
  template <typename TileA, typename Scalar>
  inline CUDA_CALLABLE void assign(TileA& dest, int i, int j, int k, int l, const Scalar& src)
  {
- dest.data(tile_coord(i, j, k, l)) = src;
+ if constexpr(is_vector<typename TileA::Type>::value) {
+ dest.data(tile_coord(i, j, k))[l] = src;
+ } else if constexpr(is_matrix<typename TileA::Type>::value) {
+ dest.data(tile_coord(i, j)).data[k][l] = src;
+ } else {
+ dest.data(tile_coord(i, j, k, l)) = src;
+ }
+ WP_TILE_SYNC();
+ }
+ template <typename TileA, typename Scalar>
+ inline CUDA_CALLABLE void assign(TileA& dest, int i, int j, int k, int l, int m, const Scalar& src)
+ {
+ if constexpr(is_vector<typename TileA::Type>::value) {
+ dest.data(tile_coord(i, j, k, l))[m] = src;
+ } else if constexpr(is_matrix<typename TileA::Type>::value) {
+ dest.data(tile_coord(i, j, k)).data[l][m] = src;
+ } else {
+ static_assert(always_false<TileA>::value,
+ "assign with 5 indices requires a tile of vectors (4D tile) or matrices (3D tile)");
+ }
  WP_TILE_SYNC();
  }
+ template <typename TileA, typename Scalar>
+ inline CUDA_CALLABLE void assign(TileA& dest, int i, int j, int k, int l, int m, int n, const Scalar& src)
+ {
+ if constexpr(is_matrix<typename TileA::Type>::value) {
+ dest.data(tile_coord(i, j, k, l)).data[m][n] = src;
+ } else {
+ static_assert(always_false<TileA>::value,
+ "assign with 6 indices requires a tile of matrices (4D tile)");
+ }
+ WP_TILE_SYNC();
+ }
+
 
  template <typename TileA, typename AdjTileA, typename Scalar>
  inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, const Scalar& src, AdjTileA& adj_dest, int adj_i, Scalar& adj_src)
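The same component addressing applies to stores: one index beyond the tile's rank targets a vector component and two target a matrix entry, with each overload ending in a block-wide sync. A short sketch (not from the diff; a 2D tile of 2x2 matrices is assumed, and the wrapper name is hypothetical):

// illustration only: write entry (r, c) of the matrix stored at tile element (i, j)
template <typename Mat22Tile>
inline CUDA_CALLABLE void set_matrix_entry(Mat22Tile& t, int i, int j, int r, int c, float value)
{
    // Tile::Type is a matrix type here, so the last two indices select row and column
    assign(t, i, j, r, c, value);
}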
@@ -3419,7 +3963,11 @@ inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, const Scalar& sr
  return;
  }
 
- adj_src += dest.grad(tile_coord(i, j));
+ if constexpr(is_vector<typename TileA::Type>::value) {
+ adj_src += dest.grad(tile_coord(i))[j];
+ } else {
+ adj_src += dest.grad(tile_coord(i, j));
+ }
  }
  template <typename TileA, typename AdjTileA, typename Scalar>
  inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, int k, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, int adj_k, Scalar& adj_src)
@@ -3429,7 +3977,13 @@ inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, int k, const Sca
  return;
  }
 
- adj_src += dest.grad(tile_coord(i, j, k));
+ if constexpr(is_vector<typename TileA::Type>::value) {
+ adj_src += dest.grad(tile_coord(i, j))[k];
+ } else if constexpr(is_matrix<typename TileA::Type>::value) {
+ adj_src += dest.grad(tile_coord(i)).data[j][k];
+ } else {
+ adj_src += dest.grad(tile_coord(i, j, k));
+ }
  }
  template <typename TileA, typename AdjTileA, typename Scalar>
  inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, int k, int l, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, int adj_k, int adj_l, Scalar& adj_src)
@@ -3439,7 +3993,45 @@ inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, int k, int l, co
  return;
  }
 
- adj_src += dest.grad(tile_coord(i, j, k, l));
+ if constexpr(is_vector<typename TileA::Type>::value) {
+ adj_src += dest.grad(tile_coord(i, j, k))[l];
+ } else if constexpr(is_matrix<typename TileA::Type>::value) {
+ adj_src += dest.grad(tile_coord(i, j)).data[k][l];
+ } else {
+ adj_src += dest.grad(tile_coord(i, j, k, l));
+ }
+ }
+ template <typename TileA, typename AdjTileA, typename Scalar>
+ inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, int k, int l, int m, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, int adj_k, int adj_l, int adj_m, Scalar& adj_src)
+ {
+ if (dest.grad.ptr == nullptr)
+ {
+ return;
+ }
+
+ if constexpr(is_vector<typename TileA::Type>::value) {
+ adj_src += dest.grad(tile_coord(i, j, k, l))[m];
+ } else if constexpr(is_matrix<typename TileA::Type>::value) {
+ adj_src += dest.grad(tile_coord(i, j, k)).data[l][m];
+ } else {
+ static_assert(always_false<TileA>::value,
+ "adj_assign with 5 indices requires a tile of vectors (4D tile) or matrices (3D tile)");
+ }
+ }
+ template <typename TileA, typename AdjTileA, typename Scalar>
+ inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, int k, int l, int m, int n, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, int adj_k, int adj_l, int adj_m, int adj_n, Scalar& adj_src)
+ {
+ if (dest.grad.ptr == nullptr)
+ {
+ return;
+ }
+
+ if constexpr(is_matrix<typename TileA::Type>::value) {
+ adj_src += dest.grad(tile_coord(i, j, k, l)).data[m][n];
+ } else {
+ static_assert(always_false<TileA>::value,
+ "adj_assign with 6 indices requires a tile of matrices (4D tile)");
+ }
  }
 
  template <typename TileA, typename TileB, typename Coord>