PyPI - warp-lang - Versions diffs - 1.6.1__py3-none-win_amd64.whl → 1.7.0__py3-none-win_amd64.whl - Mend

warp-lang 1.6.1__py3-none-win_amd64.whl → 1.7.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (401)

warp/__init__.py +21 -7
warp/autograd.py +14 -6
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +424 -6
warp/build_dll.py +20 -20
warp/builtins.py +467 -368
warp/codegen.py +193 -125
warp/config.py +56 -12
warp/constants.py +14 -6
warp/context.py +524 -277
warp/dlpack.py +22 -12
warp/examples/__init__.py +14 -6
warp/examples/assets/nonuniform.usd +0 -0
warp/examples/assets/nvidia_logo.png +0 -0
warp/examples/benchmarks/benchmark_api.py +14 -6
warp/examples/benchmarks/benchmark_cloth.py +14 -6
warp/examples/benchmarks/benchmark_cloth_cupy.py +14 -6
warp/examples/benchmarks/benchmark_cloth_jax.py +14 -6
warp/examples/benchmarks/benchmark_cloth_numba.py +15 -0
warp/examples/benchmarks/benchmark_cloth_numpy.py +14 -6
warp/examples/benchmarks/benchmark_cloth_paddle.py +14 -6
warp/examples/benchmarks/benchmark_cloth_pytorch.py +14 -6
warp/examples/benchmarks/benchmark_cloth_taichi.py +14 -6
warp/examples/benchmarks/benchmark_cloth_warp.py +14 -6
warp/examples/benchmarks/benchmark_gemm.py +82 -48
warp/examples/benchmarks/benchmark_interop_paddle.py +14 -6
warp/examples/benchmarks/benchmark_interop_torch.py +14 -6
warp/examples/benchmarks/benchmark_launches.py +14 -6
warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
warp/examples/browse.py +14 -6
warp/examples/core/example_cupy.py +14 -6
warp/examples/core/example_dem.py +14 -6
warp/examples/core/example_fluid.py +14 -6
warp/examples/core/example_graph_capture.py +14 -6
warp/examples/core/example_marching_cubes.py +14 -6
warp/examples/core/example_mesh.py +14 -6
warp/examples/core/example_mesh_intersect.py +14 -6
warp/examples/core/example_nvdb.py +14 -6
warp/examples/core/example_raycast.py +14 -6
warp/examples/core/example_raymarch.py +14 -6
warp/examples/core/example_render_opengl.py +14 -6
warp/examples/core/example_sample_mesh.py +300 -0
warp/examples/core/example_sph.py +14 -6
warp/examples/core/example_torch.py +14 -6
warp/examples/core/example_wave.py +14 -6
warp/examples/fem/example_adaptive_grid.py +14 -6
warp/examples/fem/example_apic_fluid.py +15 -7
warp/examples/fem/example_burgers.py +16 -8
warp/examples/fem/example_convection_diffusion.py +14 -6
warp/examples/fem/example_convection_diffusion_dg.py +14 -6
warp/examples/fem/example_deformed_geometry.py +15 -7
warp/examples/fem/example_diffusion.py +14 -6
warp/examples/fem/example_diffusion_3d.py +14 -6
warp/examples/fem/example_diffusion_mgpu.py +14 -6
warp/examples/fem/example_distortion_energy.py +15 -7
warp/examples/fem/example_magnetostatics.py +20 -12
warp/examples/fem/example_mixed_elasticity.py +14 -6
warp/examples/fem/example_navier_stokes.py +14 -6
warp/examples/fem/example_nonconforming_contact.py +14 -6
warp/examples/fem/example_stokes.py +14 -6
warp/examples/fem/example_stokes_transfer.py +14 -6
warp/examples/fem/example_streamlines.py +14 -6
warp/examples/fem/utils.py +24 -3
warp/examples/interop/example_jax_callable.py +116 -0
warp/examples/interop/example_jax_ffi_callback.py +132 -0
warp/examples/interop/example_jax_kernel.py +205 -0
warp/examples/optim/example_bounce.py +14 -6
warp/examples/optim/example_cloth_throw.py +14 -6
warp/examples/optim/example_diffray.py +14 -6
warp/examples/optim/example_drone.py +14 -6
warp/examples/optim/example_fluid_checkpoint.py +497 -0
warp/examples/optim/example_inverse_kinematics.py +14 -6
warp/examples/optim/example_inverse_kinematics_torch.py +14 -6
warp/examples/optim/example_softbody_properties.py +14 -6
warp/examples/optim/example_spring_cage.py +14 -6
warp/examples/optim/example_trajectory.py +14 -6
warp/examples/sim/example_cartpole.py +14 -6
warp/examples/sim/example_cloth.py +14 -6
warp/examples/sim/example_cloth_self_contact.py +14 -6
warp/examples/sim/example_granular.py +14 -6
warp/examples/sim/example_granular_collision_sdf.py +14 -6
warp/examples/sim/example_jacobian_ik.py +14 -6
warp/examples/sim/example_particle_chain.py +14 -6
warp/examples/sim/example_quadruped.py +14 -6
warp/examples/sim/example_rigid_chain.py +14 -6
warp/examples/sim/example_rigid_contact.py +14 -6
warp/examples/sim/example_rigid_force.py +14 -6
warp/examples/sim/example_rigid_gyroscopic.py +14 -6
warp/examples/sim/example_rigid_soft_contact.py +14 -6
warp/examples/sim/example_soft_body.py +14 -6
warp/examples/tile/example_tile_cholesky.py +14 -6
warp/examples/tile/example_tile_convolution.py +14 -6
warp/examples/tile/example_tile_fft.py +14 -6
warp/examples/tile/example_tile_filtering.py +14 -6
warp/examples/tile/example_tile_matmul.py +16 -10
warp/examples/tile/example_tile_mlp.py +14 -6
warp/examples/tile/example_tile_nbody.py +14 -6
warp/examples/tile/example_tile_walker.py +14 -6
warp/fabric.py +15 -0
warp/fem/__init__.py +26 -1
warp/fem/adaptivity.py +19 -4
warp/fem/cache.py +15 -0
warp/fem/dirichlet.py +15 -0
warp/fem/domain.py +15 -0
warp/fem/field/__init__.py +15 -0
warp/fem/field/field.py +15 -0
warp/fem/field/nodal_field.py +37 -68
warp/fem/field/restriction.py +15 -0
warp/fem/field/virtual.py +77 -23
warp/fem/geometry/__init__.py +15 -0
warp/fem/geometry/adaptive_nanogrid.py +24 -10
warp/fem/geometry/closest_point.py +16 -1
warp/fem/geometry/deformed_geometry.py +20 -2
warp/fem/geometry/element.py +15 -0
warp/fem/geometry/geometry.py +20 -0
warp/fem/geometry/grid_2d.py +27 -12
warp/fem/geometry/grid_3d.py +27 -15
warp/fem/geometry/hexmesh.py +20 -7
warp/fem/geometry/nanogrid.py +24 -11
warp/fem/geometry/partition.py +15 -0
warp/fem/geometry/quadmesh.py +28 -13
warp/fem/geometry/tetmesh.py +18 -4
warp/fem/geometry/trimesh.py +18 -8
warp/fem/integrate.py +277 -93
warp/fem/linalg.py +20 -5
warp/fem/operator.py +15 -0
warp/fem/polynomial.py +15 -0
warp/fem/quadrature/__init__.py +15 -0
warp/fem/quadrature/pic_quadrature.py +52 -22
warp/fem/quadrature/quadrature.py +209 -25
warp/fem/space/__init__.py +16 -1
warp/fem/space/basis_function_space.py +19 -2
warp/fem/space/basis_space.py +40 -18
warp/fem/space/dof_mapper.py +15 -0
warp/fem/space/function_space.py +15 -0
warp/fem/space/grid_2d_function_space.py +15 -0
warp/fem/space/grid_3d_function_space.py +15 -0
warp/fem/space/hexmesh_function_space.py +17 -2
warp/fem/space/nanogrid_function_space.py +15 -0
warp/fem/space/partition.py +21 -2
warp/fem/space/quadmesh_function_space.py +23 -8
warp/fem/space/restriction.py +15 -0
warp/fem/space/shape/__init__.py +15 -0
warp/fem/space/shape/cube_shape_function.py +38 -23
warp/fem/space/shape/shape_function.py +15 -0
warp/fem/space/shape/square_shape_function.py +27 -12
warp/fem/space/shape/tet_shape_function.py +15 -0
warp/fem/space/shape/triangle_shape_function.py +16 -1
warp/fem/space/tetmesh_function_space.py +18 -3
warp/fem/space/topology.py +15 -0
warp/fem/space/trimesh_function_space.py +17 -2
warp/fem/types.py +15 -0
warp/fem/utils.py +27 -6
warp/jax.py +28 -7
warp/jax_experimental/__init__.py +16 -0
warp/{jax_experimental.py → jax_experimental/custom_call.py} +28 -33
warp/jax_experimental/ffi.py +698 -0
warp/jax_experimental/xla_ffi.py +602 -0
warp/math.py +103 -6
warp/native/array.h +28 -6
warp/native/builtin.h +44 -9
warp/native/bvh.cpp +18 -7
warp/native/bvh.cu +57 -20
warp/native/bvh.h +17 -7
warp/native/clang/clang.cpp +45 -9
warp/native/coloring.cpp +15 -6
warp/native/crt.cpp +15 -6
warp/native/crt.h +15 -6
warp/native/cuda_crt.h +15 -6
warp/native/cuda_util.cpp +29 -6
warp/native/cuda_util.h +17 -6
warp/native/error.cpp +15 -6
warp/native/error.h +15 -6
warp/native/exports.h +85 -63
warp/native/fabric.h +15 -6
warp/native/hashgrid.cpp +15 -6
warp/native/hashgrid.cu +15 -6
warp/native/hashgrid.h +15 -6
warp/native/initializer_array.h +15 -6
warp/native/intersect.h +41 -32
warp/native/intersect_adj.h +48 -39
warp/native/intersect_tri.h +17 -0
warp/native/marching.cpp +16 -0
warp/native/marching.cu +16 -7
warp/native/marching.h +17 -0
warp/native/mat.h +528 -15
warp/native/mathdx.cpp +15 -6
warp/native/matnn.h +15 -6
warp/native/mesh.cpp +15 -6
warp/native/mesh.cu +15 -6
warp/native/mesh.h +25 -16
warp/native/noise.h +15 -6
warp/native/quat.h +114 -17
warp/native/rand.h +21 -6
warp/native/range.h +15 -6
warp/native/reduce.cpp +15 -6
warp/native/reduce.cu +15 -6
warp/native/runlength_encode.cpp +15 -6
warp/native/runlength_encode.cu +15 -6
warp/native/scan.cpp +15 -6
warp/native/scan.cu +15 -6
warp/native/scan.h +15 -6
warp/native/solid_angle.h +17 -0
warp/native/sort.cpp +137 -65
warp/native/sort.cu +167 -21
warp/native/sort.h +23 -7
warp/native/sparse.cpp +58 -28
warp/native/sparse.cu +67 -23
warp/native/spatial.h +15 -6
warp/native/svd.h +131 -6
warp/native/temp_buffer.h +15 -6
warp/native/tile.h +316 -111
warp/native/tile_reduce.h +61 -9
warp/native/vec.h +83 -13
warp/native/volume.cpp +100 -119
warp/native/volume.cu +15 -6
warp/native/volume.h +15 -6
warp/native/volume_builder.cu +40 -16
warp/native/volume_builder.h +21 -6
warp/native/volume_impl.h +15 -6
warp/native/warp.cpp +20 -12
warp/native/warp.cu +114 -16
warp/native/warp.h +34 -16
warp/optim/__init__.py +14 -6
warp/optim/adam.py +14 -6
warp/optim/linear.py +25 -10
warp/optim/sgd.py +14 -6
warp/paddle.py +14 -6
warp/render/__init__.py +14 -6
warp/render/render_opengl.py +14 -6
warp/render/render_usd.py +14 -6
warp/render/utils.py +14 -6
warp/sim/__init__.py +14 -7
warp/sim/articulation.py +18 -10
warp/sim/collide.py +35 -16
warp/sim/graph_coloring.py +14 -6
warp/sim/import_mjcf.py +463 -162
warp/sim/import_snu.py +14 -7
warp/sim/import_urdf.py +46 -18
warp/sim/import_usd.py +14 -7
warp/sim/inertia.py +14 -6
warp/sim/integrator.py +14 -6
warp/sim/integrator_euler.py +19 -11
warp/sim/integrator_featherstone.py +17 -16
warp/sim/integrator_vbd.py +222 -8
warp/sim/integrator_xpbd.py +19 -11
warp/sim/model.py +56 -19
warp/sim/particles.py +14 -6
warp/sim/render.py +14 -6
warp/sim/utils.py +17 -2
warp/sparse.py +657 -555
warp/stubs.py +231 -19
warp/tape.py +14 -6
warp/tests/aux_test_class_kernel.py +14 -6
warp/tests/aux_test_compile_consts_dummy.py +14 -6
warp/tests/aux_test_conditional_unequal_types_kernels.py +14 -6
warp/tests/aux_test_dependent.py +14 -6
warp/tests/aux_test_grad_customs.py +14 -6
warp/tests/aux_test_instancing_gc.py +14 -6
warp/tests/aux_test_module_unload.py +14 -6
warp/tests/aux_test_name_clash1.py +14 -6
warp/tests/aux_test_name_clash2.py +14 -6
warp/tests/aux_test_unresolved_func.py +14 -6
warp/tests/aux_test_unresolved_symbol.py +14 -6
warp/tests/cuda/__init__.py +0 -0
warp/tests/{test_async.py → cuda/test_async.py} +14 -6
warp/tests/{test_ipc.py → cuda/test_ipc.py} +14 -6
warp/tests/{test_mempool.py → cuda/test_mempool.py} +53 -6
warp/tests/{test_multigpu.py → cuda/test_multigpu.py} +14 -6
warp/tests/{test_peer.py → cuda/test_peer.py} +14 -6
warp/tests/{test_pinned.py → cuda/test_pinned.py} +14 -6
warp/tests/{test_streams.py → cuda/test_streams.py} +85 -6
warp/tests/geometry/__init__.py +0 -0
warp/tests/{test_bvh.py → geometry/test_bvh.py} +14 -6
warp/tests/{test_hash_grid.py → geometry/test_hash_grid.py} +14 -6
warp/tests/{test_marching_cubes.py → geometry/test_marching_cubes.py} +14 -6
warp/tests/{test_mesh.py → geometry/test_mesh.py} +14 -6
warp/tests/{test_mesh_query_aabb.py → geometry/test_mesh_query_aabb.py} +14 -6
warp/tests/{test_mesh_query_point.py → geometry/test_mesh_query_point.py} +80 -69
warp/tests/{test_mesh_query_ray.py → geometry/test_mesh_query_ray.py} +15 -7
warp/tests/{test_volume.py → geometry/test_volume.py} +55 -12
warp/tests/{test_volume_write.py → geometry/test_volume_write.py} +14 -6
warp/tests/interop/__init__.py +0 -0
warp/tests/{test_dlpack.py → interop/test_dlpack.py} +42 -11
warp/tests/{test_jax.py → interop/test_jax.py} +14 -6
warp/tests/{test_paddle.py → interop/test_paddle.py} +14 -6
warp/tests/{test_torch.py → interop/test_torch.py} +14 -6
warp/tests/run_coverage_serial.py +14 -6
warp/tests/sim/__init__.py +0 -0
warp/tests/{disabled_kinematics.py → sim/disabled_kinematics.py} +23 -16
warp/tests/{flaky_test_sim_grad.py → sim/flaky_test_sim_grad.py} +14 -6
warp/tests/{test_collision.py → sim/test_collision.py} +16 -8
warp/tests/{test_coloring.py → sim/test_coloring.py} +14 -7
warp/tests/{test_model.py → sim/test_model.py} +55 -7
warp/tests/{test_sim_grad_bounce_linear.py → sim/test_sim_grad_bounce_linear.py} +14 -6
warp/tests/{test_sim_kinematics.py → sim/test_sim_kinematics.py} +16 -7
warp/tests/sim/test_vbd.py +597 -0
warp/tests/test_adam.py +14 -6
warp/tests/test_arithmetic.py +14 -6
warp/tests/test_array.py +14 -6
warp/tests/test_array_reduce.py +14 -6
warp/tests/test_assert.py +14 -6
warp/tests/test_atomic.py +14 -6
warp/tests/test_bool.py +15 -7
warp/tests/test_builtins_resolution.py +14 -6
warp/tests/test_closest_point_edge_edge.py +14 -6
warp/tests/test_codegen.py +14 -6
warp/tests/test_codegen_instancing.py +14 -6
warp/tests/test_compile_consts.py +14 -6
warp/tests/test_conditional.py +14 -6
warp/tests/test_context.py +14 -6
warp/tests/test_copy.py +14 -6
warp/tests/test_ctypes.py +14 -6
warp/tests/test_dense.py +14 -6
warp/tests/test_devices.py +14 -6
warp/tests/test_examples.py +42 -42
warp/tests/test_fabricarray.py +14 -6
warp/tests/test_fast_math.py +14 -6
warp/tests/test_fem.py +37 -10
warp/tests/test_fp16.py +14 -6
warp/tests/test_func.py +14 -6
warp/tests/test_future_annotations.py +14 -6
warp/tests/test_generics.py +14 -6
warp/tests/test_grad.py +14 -6
warp/tests/test_grad_customs.py +14 -6
warp/tests/test_grad_debug.py +14 -6
warp/tests/test_implicit_init.py +14 -6
warp/tests/test_import.py +14 -6
warp/tests/test_indexedarray.py +14 -6
warp/tests/test_intersect.py +14 -6
warp/tests/test_iter.py +14 -6
warp/tests/test_large.py +14 -6
warp/tests/test_launch.py +14 -6
warp/tests/test_lerp.py +14 -6
warp/tests/test_linear_solvers.py +15 -11
warp/tests/test_lvalue.py +14 -6
warp/tests/test_mat.py +247 -85
warp/tests/test_mat_lite.py +14 -6
warp/tests/test_mat_scalar_ops.py +18 -10
warp/tests/test_math.py +14 -6
warp/tests/test_mlp.py +14 -6
warp/tests/test_module_hashing.py +14 -6
warp/tests/test_modules_lite.py +14 -6
warp/tests/test_noise.py +14 -6
warp/tests/test_operators.py +14 -6
warp/tests/test_options.py +14 -6
warp/tests/test_overwrite.py +15 -60
warp/tests/test_print.py +14 -6
warp/tests/test_quat.py +81 -52
warp/tests/test_rand.py +58 -43
warp/tests/test_reload.py +14 -6
warp/tests/test_rounding.py +14 -6
warp/tests/test_runlength_encode.py +14 -6
warp/tests/test_scalar_ops.py +14 -6
warp/tests/test_smoothstep.py +14 -6
warp/tests/test_snippet.py +15 -0
warp/tests/test_sparse.py +61 -12
warp/tests/test_spatial.py +89 -6
warp/tests/test_special_values.py +14 -6
warp/tests/test_static.py +15 -7
warp/tests/test_struct.py +14 -6
warp/tests/test_tape.py +14 -6
warp/tests/test_transient_module.py +14 -6
warp/tests/test_triangle_closest_point.py +14 -6
warp/tests/test_types.py +14 -6
warp/tests/test_utils.py +98 -10
warp/tests/test_vec.py +60 -40
warp/tests/test_vec_lite.py +14 -6
warp/tests/test_vec_scalar_ops.py +14 -6
warp/tests/test_verify_fp.py +14 -6
warp/tests/tile/__init__.py +0 -0
warp/tests/{test_tile.py → tile/test_tile.py} +150 -57
warp/tests/{test_tile_load.py → tile/test_tile_load.py} +15 -7
warp/tests/{test_tile_mathdx.py → tile/test_tile_mathdx.py} +23 -12
warp/tests/{test_tile_mlp.py → tile/test_tile_mlp.py} +39 -20
warp/tests/{test_tile_reduce.py → tile/test_tile_reduce.py} +74 -7
warp/tests/{test_tile_shared_memory.py → tile/test_tile_shared_memory.py} +14 -6
warp/tests/{test_tile_view.py → tile/test_tile_view.py} +15 -7
warp/tests/unittest_serial.py +15 -6
warp/tests/unittest_suites.py +59 -65
warp/tests/unittest_utils.py +16 -7
warp/tests/walkthrough_debug.py +14 -6
warp/thirdparty/unittest_parallel.py +15 -8
warp/torch.py +14 -6
warp/types.py +124 -664
warp/utils.py +151 -78
{warp_lang-1.6.1.dist-info → warp_lang-1.7.0.dist-info}/METADATA +39 -12
warp_lang-1.7.0.dist-info/RECORD +429 -0
{warp_lang-1.6.1.dist-info → warp_lang-1.7.0.dist-info}/WHEEL +1 -1
warp_lang-1.7.0.dist-info/licenses/LICENSE.md +202 -0
warp/examples/optim/example_walker.py +0 -309
warp/native/cutlass_gemm.cpp +0 -34
warp/native/cutlass_gemm.cu +0 -373
warp/tests/test_matmul.py +0 -503
warp/tests/test_matmul_lite.py +0 -403
warp/tests/test_vbd.py +0 -378
warp/tests/unused_test_misc.py +0 -69
warp_lang-1.6.1.dist-info/LICENSE.md +0 -126
warp_lang-1.6.1.dist-info/RECORD +0 -419
{warp_lang-1.6.1.dist-info → warp_lang-1.7.0.dist-info}/top_level.txt +0 -0

warp/native/tile.h CHANGED

@@ -1,18 +1,57 @@
-/** Copyright (c) 2024 NVIDIA CORPORATION.  All rights reserved.
- * NVIDIA CORPORATION and its licensors retain all intellectual property
- * and proprietary rights in and to this software, related documentation
- * and any modifications thereto.  Any use, reproduction, disclosure or
- * distribution of this software and related documentation without an express
- * license agreement from NVIDIA CORPORATION is strictly prohibited.
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 #pragma once
 #include "builtin.h"
+#ifdef __clang__
+// disable warnings related to C++17 extensions on CPU JIT builds
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++17-extensions"
+#endif // __clang__
+// Check if the CUDA toolkit is available
+#if WP_ENABLE_CUDA || defined(__CUDACC_RTC__)
+    // If NVRTC is being used, do not include extra headers (NVRTC has built-in float4)
+    #ifdef __CUDACC_RTC__
+        // NVRTC: Use built-in float4 (no need for extra definitions)
+    #else
+        // NVCC: Include vector_types.h to get float4
+        #include <cuda_runtime.h>
+    #endif
+#else
+    // If CUDA is not available (e.g., macOS build), manually define float4
+    struct alignas(16) float4 {
+        float x, y, z, w;
+    };
+#endif
+// only used while building the warp core library
+#ifndef WP_TILE_BLOCK_DIM
+#define WP_TILE_BLOCK_DIM 256
+#endif
 #if !defined(__CUDA_ARCH__)
 #define WP_TILE_SHARED static
 #define WP_TILE_SYNC void
 #else
 #define WP_TILE_SHARED __shared__
 #define WP_TILE_SYNC __syncthreads
@@ -37,6 +76,14 @@
 #define WP_USE_ASYNC_PIPELINE 0
 #define WP_USE_REGISTER_GEMM 0
+#if defined(__CUDACC_RTC__)
+#define WP_TILE_THREAD_IDX threadIdx.x
+#else
+#define WP_TILE_THREAD_IDX 0
+#endif //
 /* Tile Expressions
 [ ] Tiles
@@ -208,14 +255,14 @@ constexpr tile_coord_t<sizeof...(Ints)> tile_coord(Ints... idxs)
 }
 // helpers to construct a coord from a set of indices
-auto tile_coord(int i)
+inline auto tile_coord(int i)
 {
     auto c = tile_coord_t<1>();
     c.indices[0] = i;
     return c;
 }
-auto tile_coord(int i, int j)
+inline auto tile_coord(int i, int j)
 {
     auto c = tile_coord_t<2>();
     c.indices[0] = i;
@@ -223,7 +270,7 @@ auto tile_coord(int i, int j)
     return c;
 }
-auto tile_coord(int i, int j, int k)
+inline auto tile_coord(int i, int j, int k)
 {
     auto c = tile_coord_t<3>();
     c.indices[0] = i;
@@ -232,7 +279,7 @@ auto tile_coord(int i, int j, int k)
     return c;
 }
-auto tile_coord(int i, int j, int k, int l)
+inline auto tile_coord(int i, int j, int k, int l)
 {
     auto c = tile_coord_t<4>();
     c.indices[0] = i;
@@ -247,7 +294,7 @@ template <int... V>
 struct tile_tuple_t
 {
     static constexpr int N = sizeof...(V);
-    static_assert(N > 0);
+    static_assert(N > 0, "Expected N > 0");
     static constexpr int data[N] = { V... };
@@ -400,7 +447,7 @@ struct tile_layout_register_t
     static inline CUDA_CALLABLE int linear_from_register(int reg)
     {
-        return threadIdx.x + reg*WP_TILE_BLOCK_DIM;
+        return WP_TILE_THREAD_IDX + reg*WP_TILE_BLOCK_DIM;
     }
     static inline CUDA_CALLABLE int linear_from_coord(Coord c)
@@ -500,15 +547,6 @@ struct tile_register_t
         return data[reg];
     }
-    // Returns the number of valid registers for this tile
-    // i.e.: how many registers map to a valid coordinate.
-    // When a tile's size is not aligned to the block dimension
-    // some of the trailing registers may lie outside the valid range
-    inline CUDA_CALLABLE int valid() const
-    {
-        return (int)floor(float(Size - threadIdx.x - 1)/WP_TILE_BLOCK_DIM) + 1;
-    }
     inline CUDA_CALLABLE void assign(const tile_register_t<T, Layout>& tile)
     {
         for (int i=0; i < Layout::NumRegs; ++i)
@@ -535,7 +573,7 @@ struct tile_register_t
         // ensure any previously scheduled threads have finished reading from scratch
         WP_TILE_SYNC();
-        if (threadIdx.x == thread)
+        if (WP_TILE_THREAD_IDX == thread)
         {
             scratch = data[reg];
         }
@@ -556,7 +594,7 @@ struct tile_register_t
         const int thread = Layout::thread_from_linear(linear);
         const int reg = Layout::register_from_linear(linear);
-        if (threadIdx.x == thread)
+        if (WP_TILE_THREAD_IDX == thread)
         {
             data[reg] += adj_ret;
         }
@@ -659,7 +697,7 @@ struct tile_register_t
 // users can either specify a template explicitly or
 // pass in another concrete instance
 template<typename Tile>
-auto tile_register_like(Tile* t=NULL)
+auto tile_register_like(Tile* t=nullptr)
 {
     using T = typename Tile::Type;
     using L = typename Tile::Layout;
@@ -685,26 +723,39 @@ inline CUDA_CALLABLE int tile_align(int num_bytes)
     return sign * ((num_bytes_abs + alignment - 1) / alignment) * alignment;
 }
-inline CUDA_CALLABLE void* tile_alloc_shared(int num_bytes, bool init=false)
+inline CUDA_CALLABLE void* tile_alloc_shared(int num_bytes, bool init=false, bool check=false)
 {
     // we maintain a per-thread offset into dynamic
     // shared memory that allows us to keep track of
     // current use across dynamic function calls
-    __shared__ int smem_base[WP_TILE_BLOCK_DIM];
+    WP_TILE_SHARED int smem_base[WP_TILE_BLOCK_DIM];
     if (init)
     {
-        smem_base[threadIdx.x] = 0;
-        return NULL;
+        smem_base[WP_TILE_THREAD_IDX] = 0;
+        return nullptr;
+    }
+    else if (check)
+    {
+        assert(smem_base[WP_TILE_THREAD_IDX] == 0);
+        return nullptr;
     }
     else
     {
-        const int offset = smem_base[threadIdx.x];
+        const int offset = smem_base[WP_TILE_THREAD_IDX];
         // one entry per-thread so no need for synchronization
-        smem_base[threadIdx.x] += tile_align(num_bytes);
+        smem_base[WP_TILE_THREAD_IDX] += tile_align(num_bytes);
+#ifdef __CUDA_ARCH__
         extern __shared__ char dynamic_smem_base[];
+#else
+        // on CPU allocate a fixed 256k block to use for shared allocs
+        static const int max_cpu_shared = 256*1024;
+        static char dynamic_smem_base[max_cpu_shared];
+        assert(smem_base[WP_TILE_THREAD_IDX] <= max_cpu_shared);
+#endif
         return &(dynamic_smem_base[offset]);
     }
 }
@@ -838,12 +889,12 @@ struct tile_shared_t
     bool initialized;
     // default initialization (non-initialized)
-    inline CUDA_CALLABLE tile_shared_t() : data(NULL), grad(NULL), initialized(false)
+    inline CUDA_CALLABLE tile_shared_t() : data(nullptr), grad(nullptr), initialized(false)
     {
     }
     // initialize from an existing tile's memory
-    inline CUDA_CALLABLE tile_shared_t(T* data, T* grad=NULL, bool initialized=true) : data(data), grad(grad), initialized(initialized)
+    inline CUDA_CALLABLE tile_shared_t(T* data, T* grad=nullptr, bool initialized=true) : data(data), grad(grad), initialized(initialized)
     {
     }
@@ -869,6 +920,7 @@ struct tile_shared_t
     }
+/*
     // construct from another shared tile, this constructor
     // is invoked for reshape operations like `wp.tile_transpose()`
     template <typename OtherT, typename OtherLayout>
@@ -877,7 +929,7 @@ struct tile_shared_t
         using OtherTile = tile_shared_t<OtherT, OtherLayout>;
         // check dimensions are compatible
-        static_assert(Size == OtherTile::Size);
+        static_assert(Size == OtherTile::Size, "Expected Size == OtherTile::Size");
         // alias tile directly
         data = rhs.data;
@@ -886,6 +938,7 @@ struct tile_shared_t
         return *this;
     }
+*/
     // assign from a global tile (load)
     inline CUDA_CALLABLE auto& operator=(const tile_global_t<T, typename Layout::Shape>& t)
@@ -903,7 +956,7 @@ struct tile_shared_t
         if (initialized)
             WP_TILE_SYNC();
-        for (int i=threadIdx.x; i < Layout::Size; i+= WP_TILE_BLOCK_DIM)
+        for (int i=WP_TILE_THREAD_IDX; i < Layout::Size; i+= WP_TILE_BLOCK_DIM)
             data(i) = x;
         initialized = true;
@@ -914,7 +967,7 @@ struct tile_shared_t
     // in-place zero
     inline CUDA_CALLABLE void zero()
     {
-        for (int i=threadIdx.x; i < Layout::Size; i+= WP_TILE_BLOCK_DIM)
+        for (int i=WP_TILE_THREAD_IDX; i < Layout::Size; i+= WP_TILE_BLOCK_DIM)
             data(i) = T(0);
         WP_TILE_SYNC();
@@ -964,7 +1017,7 @@ struct tile_shared_t
     // in-place gradient zero
     inline CUDA_CALLABLE void grad_zero()
     {
-        for (int i=threadIdx.x; i < Layout::Size; i+= WP_TILE_BLOCK_DIM)
+        for (int i=WP_TILE_THREAD_IDX; i < Layout::Size; i+= WP_TILE_BLOCK_DIM)
             grad(i) = T(0);
         WP_TILE_SYNC();
@@ -1004,7 +1057,7 @@ struct tile_shared_t
     CUDA_CALLABLE void grad_add(const tile_global_t<T, typename Layout::Shape>& global)
     {
         WP_PRAGMA_UNROLL
-        for (int i=threadIdx.x; i < Layout::Size; i += WP_TILE_BLOCK_DIM)
+        for (int i=WP_TILE_THREAD_IDX; i < Layout::Size; i += WP_TILE_BLOCK_DIM)
         {
             auto c = Layout::coord_from_linear(i);
             T g = global.load_grad(c);
@@ -1072,6 +1125,8 @@ struct tile_shared_t
     template <typename Global>
     inline CUDA_CALLABLE void copy_to_global(const Global& dest)
     {
+#if defined(__CUDA_ARCH__)
         // vectorized loads for specific input/output shapes
         if constexpr (Layout::Shape::N == 2)
         {
@@ -1100,7 +1155,7 @@ struct tile_shared_t
                 const int stride_j = 1;
                 WP_PRAGMA_UNROLL
-                for (int i=threadIdx.x; i < SrcLayout::Size; i += WP_TILE_BLOCK_DIM)
+                for (int i=WP_TILE_THREAD_IDX; i < SrcLayout::Size; i += WP_TILE_BLOCK_DIM)
                 {
                     auto c = SrcLayout::coord_from_linear(i);
@@ -1111,17 +1166,18 @@ struct tile_shared_t
             }
         }
+#endif //defined(__CUDA_ARCH__)
         // scalar bounds checked path
         WP_PRAGMA_UNROLL
-        for (int i=threadIdx.x; i < Layout::Size; i += WP_TILE_BLOCK_DIM)
+        for (int i=WP_TILE_THREAD_IDX; i < Layout::Size; i += WP_TILE_BLOCK_DIM)
         {
             auto c = Layout::coord_from_linear(i);
             dest.store(c, data(i));
         }
     }
-    __device__ __forceinline__
-    void cp_async_global_to_shared_128(float4* shared_dest, const float4* global_src)
+    inline CUDA_CALLABLE void cp_async_global_to_shared_128(float4* shared_dest, const float4* global_src)
     {
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
@@ -1143,8 +1199,7 @@ struct tile_shared_t
     #endif
     }
-    __device__ __forceinline__
-    void cp_async_commit_and_wait_all_128()
+    inline CUDA_CALLABLE void cp_async_commit_and_wait_all_128()
     {
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
         asm volatile(
@@ -1159,6 +1214,8 @@ struct tile_shared_t
         if (initialized)
             WP_TILE_SYNC();
+#if defined(__CUDA_ARCH__)
         // vectorized loads for specific input/output shapes
         if constexpr (Layout::Shape::N == 2)
         {
@@ -1187,7 +1244,7 @@ struct tile_shared_t
                 const int stride_j = 1;
                 WP_PRAGMA_UNROLL
-                for (int i=threadIdx.x; i < DestLayout::Size; i += WP_TILE_BLOCK_DIM)
+                for (int i=WP_TILE_THREAD_IDX; i < DestLayout::Size; i += WP_TILE_BLOCK_DIM)
                 {
                     auto c = DestLayout::coord_from_linear(i);
@@ -1208,9 +1265,11 @@ struct tile_shared_t
             }
         }
+#endif //defined(__CUDA_ARCH__)
         // scalar bounds checked path
         WP_PRAGMA_UNROLL
-        for (int i=threadIdx.x; i < Layout::Size; i += WP_TILE_BLOCK_DIM)
+        for (int i=WP_TILE_THREAD_IDX; i < Layout::Size; i += WP_TILE_BLOCK_DIM)
         {
             auto c = Layout::coord_from_linear(i);
             data(i) = src.load(c);
@@ -1323,7 +1382,7 @@ struct tile_shared_t
     inline CUDA_CALLABLE void print(bool reverse=false) const
     {
-        if (threadIdx.x != 0)
+        if (WP_TILE_THREAD_IDX != 0)
             return;
         if (reverse)
@@ -1350,13 +1409,13 @@ void tile_register_t<T, L>::print() const
     // create a temporary shared tile so that
     // we can print it deterministically
     WP_TILE_SHARED T smem[L::Size];
-    tile_shared_t<T, tile_layout_strided_t<typename L::Shape>> scratch(smem, NULL);
+    tile_shared_t<T, tile_layout_strided_t<typename L::Shape>, false> scratch(smem, nullptr);
     scratch.assign(*this);
     WP_TILE_SYNC();
-    if (threadIdx.x == 0)
+    if (WP_TILE_THREAD_IDX == 0)
     {
         scratch.print_values(scratch.data, 0);
@@ -1383,7 +1442,7 @@ inline CUDA_CALLABLE void print(const tile_shared_t<T, L, Owner>& t) { t.print()
 template <typename T, typename L, bool O>
 inline CUDA_CALLABLE int len(const tile_shared_t<T, L, O>& t)
 {
-    return Tile::Layout::Shape::dim(0);
+    return L::Shape::dim(0);
 }
 template <typename T, typename L, bool O, typename AdjTile>
@@ -1394,7 +1453,7 @@ inline CUDA_CALLABLE void adj_len(const tile_shared_t<T,L,O>& t, const AdjTile&
 template <typename T, typename L>
 inline CUDA_CALLABLE int len(const tile_register_t<T, L>& t)
 {
-    return Tile::Layout::Shape::dim(0);
+    return L::Shape::dim(0);
 }
 template <typename T, typename L, typename AdjTile>
@@ -1416,12 +1475,16 @@ inline CUDA_CALLABLE auto tile_alloc_empty()
 {   constexpr int size = Shape::size();
     T* data = (T*)tile_alloc_shared(size*sizeof(T));
-    T* grad = NULL;
+    T* grad = nullptr;
 #if FP_CHECK
-    for (int i=threadIdx.x; i < size; i+= WP_TILE_BLOCK_DIM)
-        data[i] = T(nanf(""));
+    // initialize tile to quiet nan
+    uint32_t qnanbits = 0x7FC00000;
+    float qnan = *(float*)(&qnanbits);
+    for (int i=WP_TILE_THREAD_IDX; i < size; i+= WP_TILE_BLOCK_DIM)
+        data[i] = T(qnan);
     WP_TILE_SYNC();
@@ -1432,7 +1495,7 @@ inline CUDA_CALLABLE auto tile_alloc_empty()
     {
         grad = (T*)tile_alloc_shared(size*sizeof(T));
-        for (int i=threadIdx.x; i < size; i+= WP_TILE_BLOCK_DIM)
+        for (int i=WP_TILE_THREAD_IDX; i < size; i+= WP_TILE_BLOCK_DIM)
             grad[i] = T(0);
         WP_TILE_SYNC();
@@ -1441,30 +1504,6 @@ inline CUDA_CALLABLE auto tile_alloc_empty()
     return tile_shared_t<T, tile_layout_strided_t<Shape>>(data, grad);
 }
-template <typename T, int M, int N, bool RequiresGrad>
-inline CUDA_CALLABLE auto tile_alloc_zeros()
-{
-    // compute the total storage required for the tile (may be different from M*N) for broadcast tiles
-    constexpr int Len = M*N;
-    T* data = (T*)tile_alloc_shared(Len*sizeof(T));
-    T* grad = NULL;
-    for (int i=threadIdx.x; i < Len; i+= WP_TILE_BLOCK_DIM)
-        data[i] = T(0);
-    if (RequiresGrad)
-    {
-        grad = (T*)tile_alloc_shared(Len*sizeof(T));
-        for (int i=threadIdx.x; i < Len; i+= WP_TILE_BLOCK_DIM)
-            grad[i] = T(0);
-    }
-    WP_TILE_SYNC();
-    return tile_shared_t<T, tile_layout_strided_t<tile_shape_t<M, N>>(data, grad);
-}
 //-----------------------------------------------------------------------------------------------------
 // High level entry points for each op (correspond to one Warp builtin)
@@ -1476,7 +1515,7 @@ inline CUDA_CALLABLE auto tile(const T& x)
     tile_register_t<T, tile_layout_register_t<tile_shape_t<WP_TILE_BLOCK_DIM>>> result;
     using Layout = typename decltype(result)::Layout;
-    static_assert(Layout::NumRegs == 1);
+    static_assert(Layout::NumRegs == 1, "Expected Layout::NumRegs == 1");
     result.data[0] = x;
     return result;
@@ -1489,7 +1528,7 @@ inline CUDA_CALLABLE auto tile(const wp::vec_t<Length, T>& x)
     tile_register_t<T, tile_layout_register_t<tile_shape_t<Length, WP_TILE_BLOCK_DIM>>> result;
     using Layout = typename decltype(result)::Layout;
-    static_assert(Layout::NumRegs == Length);
+    static_assert(Layout::NumRegs == Length, "Expected Layout::NumRegs == Length");
     for (int i=0; i < Length; ++i)
         result.data[i] = x[i];
@@ -1501,8 +1540,8 @@ inline CUDA_CALLABLE auto tile(const wp::vec_t<Length, T>& x)
 template <typename T, typename AdjTile>
 inline CUDA_CALLABLE void adj_tile(const T& x, T& adj_x, AdjTile& adj_ret)
 {
-    static_assert(AdjTile::Layout::Shape::N == 1);
-    static_assert(AdjTile::Layout::Shape::dim(0) == WP_TILE_BLOCK_DIM);
+    static_assert(AdjTile::Layout::Shape::N == 1, "Expected AdjTile::Layout::Shape::N == 1");
+    static_assert(AdjTile::Layout::Shape::dim(0) == WP_TILE_BLOCK_DIM, "Expected AdjTile::Layout::Shape::dim(0) == WP_TILE_BLOCK_DIM");
     auto adj_reg = adj_ret.copy_to_register();
@@ -1512,9 +1551,9 @@ inline CUDA_CALLABLE void adj_tile(const T& x, T& adj_x, AdjTile& adj_ret)
 template <typename T, unsigned Length, typename AdjTile>
 inline CUDA_CALLABLE void adj_tile(const wp::vec_t<Length, T>& x, wp::vec_t<Length, T>& adj_x, AdjTile& adj_ret)
 {
-    static_assert(AdjTile::Layout::Shape::N == 2);
-    static_assert(AdjTile::Layout::Shape::dim(0) == Length);
-    static_assert(AdjTile::Layout::Shape::dim(1) == WP_TILE_BLOCK_DIM);
+    static_assert(AdjTile::Layout::Shape::N == 2, "Expected AdjTile::Layout::Shape::N == 2");
+    static_assert(AdjTile::Layout::Shape::dim(0) == Length, "Expected AdjTile::Layout::Shape::dim(0) == Length");
+    static_assert(AdjTile::Layout::Shape::dim(1) == WP_TILE_BLOCK_DIM, "Expected AdjTile::Layout::Shape::dim(1) == WP_TILE_BLOCK_DIM");
     auto adj_reg = adj_ret.copy_to_register();
@@ -1692,7 +1731,7 @@ inline CUDA_CALLABLE void adj_tile_store(array_t<T>& dest, Coord c, Tile& t, arr
     if (adj_dest.data)
         src.data.grad = adj_dest.data;
-    if (src.data.grad == NULL)
+    if (src.data.grad == nullptr)
         return;
     adj_t.grad_add(src);
@@ -1927,7 +1966,6 @@ void adj_tile_extract(Tile& t, int i, int j, int k, AdjTile& adj_t, int adj_i, i
 // Adjoint of the 4-index tile extract: forwards the incoming adjoint value adj_ret to
 // adj_t.adj_extract() at coordinate (i, j, k, l).
 // t and the index adjoints (adj_i, adj_j, adj_k, adj_l) are accepted but not used here.
 template<typename Tile, typename AdjTile>
 void adj_tile_extract(Tile& t, int i, int j, int k, int l, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, int adj_l, typename Tile::Type adj_ret) { adj_t.adj_extract(tile_coord(i, j, k, l), adj_ret); }
-#if WP_USE_REGISTER_GEMM
 namespace partitioned_gemm
 {
@@ -2033,9 +2071,11 @@ inline CUDA_CALLABLE void matmul(TileA& A, TileB& B, TileC& out)
     auto B_tile = partition_t<TILE_K, TILE_N, TileB>(B);
     auto C_tile = partition_t<TILE_M, TILE_N, TileC>(out);
+    //static_assert(is_same<typename TileA::Type, typename TileB::Type>::value);
     const int length = partition_size(C_tile);
-    for (int t=threadIdx.x; t < length; t += blockDim.x)
+    for (int t=WP_TILE_THREAD_IDX; t < length; t += WP_TILE_BLOCK_DIM)
     {
         int i, j;
         partition_coord(C_tile, t, i, j);
@@ -2055,10 +2095,102 @@ inline CUDA_CALLABLE void matmul(TileA& A, TileB& B, TileC& out)
         partition_store(C_tile, i, j, sum);
     }
 }
-} // namespace partition_gemm
-#endif // WP_USE_REGISTER_GEMM
+// Scalar fallback GEMM: for every element of C computes C = C*scale + A*B.
+//
+// Output elements are visited by linear index, starting at WP_TILE_THREAD_IDX and
+// advancing by WP_TILE_BLOCK_DIM, and mapped to a coordinate through LayoutC.
+// The inner (reduction) extent is dim(1) of LayoutA's shape. LayoutB is part of the
+// template signature but is not consulted by the body.
+//
+// A, B: storage objects indexable with tile_coord(row, col) (read only)
+// C:    storage object that is both read (scaled by `scale`) and overwritten
+template <typename LayoutA, typename LayoutB, typename LayoutC, typename StorageA, typename StorageB, typename StorageC, typename T>
+inline CUDA_CALLABLE void scalar_matmul(const StorageA& A, const StorageB& B, StorageC& C, T scale)
+{
+    // length of the dot product performed for each output element
+    constexpr int inner = LayoutA::Shape::dim(1);
+
+    for (int linear=WP_TILE_THREAD_IDX; linear < LayoutC::Size; linear += WP_TILE_BLOCK_DIM)
+    {
+        auto out_coord = LayoutC::coord_from_linear(linear);
+        const int row = out_coord[0];
+        const int col = out_coord[1];
+
+        // seed the accumulator with the scaled existing output value
+        auto acc = C(out_coord)*scale;
+
+        WP_PRAGMA_UNROLL
+        for (int k=0; k < inner; k++)
+        {
+            const auto lhs = A(tile_coord(row, k));
+            const auto rhs = B(tile_coord(k, col));
+
+            acc = muladd<decltype(acc)>(lhs, rhs, acc);
+        }
+
+        C(out_coord) = acc;
+    }
+}
+// Serial Cholesky factorization: fills L with the lower-triangular factor of A (A = L L^T).
+//
+// Proceeds one column at a time: the diagonal entry is computed first, then the entries
+// below it, and finally the entries to the right of the diagonal in that row are zeroed.
+// Only elements of A on or below the diagonal are read; the size n is dim(1) of A's shape.
+// The value passed to wp::sqrt is not checked for being positive.
+template <typename TileA, typename TileL>
+inline CUDA_CALLABLE void scalar_cholesky(TileA& A, TileL& L)
+{
+    using T = typename TileA::Type;
+    constexpr int n = TileA::Layout::Shape::dim(1);
+
+    for (int col=0; col < n; ++col)
+    {
+        // diagonal entry: sqrt(A(col,col) - sum_k L(col,k)^2)
+        T pivot = A.data(tile_coord(col, col));
+        for (int k=0; k < col; ++k)
+        {
+            T l_ck = L.data(tile_coord(col, k));
+            pivot -= l_ck * l_ck;
+        }
+        pivot = wp::sqrt(pivot);
+        L.data(tile_coord(col, col)) = pivot;
+
+        // reciprocal of the diagonal, reused for every row below it
+        T inv_pivot = 1.0 / pivot;
+
+        // entries below the diagonal in this column
+        for (int row=col+1; row < n; ++row)
+        {
+            T acc = A.data(tile_coord(row, col));
+            for (int k=0; k < col; ++k)
+            {
+                acc -= L.data(tile_coord(row, k)) * L.data(tile_coord(col, k));
+            }
+            L.data(tile_coord(row, col)) = acc * inv_pivot;
+        }
+
+        // clear the strictly upper-triangular part of this row
+        for (int k=col+1; k < n; ++k)
+        {
+            L.data(tile_coord(col, k)) = T(0.0);
+        }
+    }
+}
+// Serial Cholesky solve: given the lower-triangular factor L, solves (L L^T) y = x.
+//
+// L: n x n tile; only entries on or below the diagonal are read (n = dim(1) of L's shape)
+// X: right-hand side of length n; read only, left unmodified
+// Y: output of length n; receives the solution
+//
+// The solution is written to Y because the caller (tile_cholesky_solve) returns Y.
+// Previously the result was stored into X, so Y kept the unsolved right-hand side
+// and the caller's input was overwritten.
+// The right-hand side is read from X, so Y does not need to be pre-initialized.
+// NOTE(review): the loops are serial; if several threads execute this on the same tiles
+// they all write the same elements - confirm the intended synchronization at the call site.
+template <typename TileL, typename TileX, typename TileY>
+inline CUDA_CALLABLE void scalar_cholesky_solve(TileL& L, TileX& X, TileY& Y)
+{
+    using T = typename TileL::Type;
+    constexpr int n = TileL::Layout::Shape::dim(1);
+
+    // forward substitution: solve L z = x, storing z in Y
+    for (int i=0; i < n; ++i)
+    {
+        T s = X.data(tile_coord(i));
+        for (int j=0; j < i; ++j)
+            s -= L.data(tile_coord(i,j)) * Y.data(tile_coord(j));
+        Y.data(tile_coord(i)) = s / L.data(tile_coord(i, i));
+    }
+
+    // back substitution: solve L^T y = z in place in Y
+    for (int i=n-1; i >= 0; --i)
+    {
+        T s = Y.data(tile_coord(i));
+        for (int j=i+1; j < n; ++j)
+            s -= L.data(tile_coord(j, i)) * Y.data(tile_coord(j));
+        Y.data(tile_coord(i)) = s / L.data(tile_coord(i, i));
+    }
+}
+} // namespace partitioned_gemm
 template <int Add, typename Fwd, typename AdjA, typename AdjB, typename TileA, typename TileB, typename TileC>
@@ -2068,19 +2200,19 @@ TileC& tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, Ti
     using ShapeB = typename TileB::Layout::Shape;
     using ShapeC = typename TileC::Layout::Shape;
-    static_assert(ShapeA::N == 2);
-    static_assert(ShapeB::N == 2);
-    static_assert(ShapeC::N == 2);
+    static_assert(ShapeA::N == 2, "Expected ShapeA::N == 2");
+    static_assert(ShapeB::N == 2, "Expected ShapeB::N == 2");
+    static_assert(ShapeC::N == 2, "Expected ShapeC::N == 2");
-    static_assert(ShapeA::dim(1) == ShapeB::dim(0));
-    static_assert(ShapeC::dim(0) == ShapeA::dim(0));
-    static_assert(ShapeC::dim(1) == ShapeB::dim(1));
+    static_assert(ShapeA::dim(1) == ShapeB::dim(0), "Expected ShapeA::dim(1) == ShapeB::dim(0)");
+    static_assert(ShapeC::dim(0) == ShapeA::dim(0), "Expected ShapeC::dim(0) == ShapeA::dim(0)");
+    static_assert(ShapeC::dim(1) == ShapeB::dim(1), "Expected ShapeC::dim(1) == ShapeB::dim(1)");
     using T = typename TileA::Type;
-#if WP_USE_REGISTER_GEMM
-    partitioned_gemm::matmul(A, B, C);
+#if !defined(__CUDA_ARCH__) || WP_ENABLE_MATHDX == 0
+    partitioned_gemm::scalar_matmul<typename TileA::Layout, typename TileB::Layout, typename TileC::Layout>(A.data, B.data, C.data, T(Add));
 #else
     fun_forward(T(1.0), A.data.ptr, B.data.ptr, T(Add), C.data.ptr);
 #endif
@@ -2090,6 +2222,7 @@ TileC& tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, Ti
     return C;
 }
 // backward for the wp.tile_matmul(a, b, out) syntax
 template <typename Fwd, typename AdjA, typename AdjB, typename TileA, typename TileB, typename TileC>
 void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, TileA& A, TileB& B, TileC& C,
@@ -2097,8 +2230,17 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B,
 {
     using T = typename TileA::Type;
+#if !defined(__CUDA_ARCH__) || WP_ENABLE_MATHDX == 0
+    auto At = tile_transpose(A);
+    auto Bt = tile_transpose(B);
+    partitioned_gemm::scalar_matmul<typename TileC::Layout, typename decltype(Bt)::Layout, typename TileA::Layout>(adj_C.grad, Bt.data, adj_A.grad, T(1.0));
+    partitioned_gemm::scalar_matmul<typename decltype(At)::Layout, typename TileC::Layout, typename TileB::Layout>(At.data, adj_C.grad, adj_B.grad, T(1.0));
+#else
     fun_backward_A(T(1.0), adj_C.grad.ptr, B.data.ptr, T(1.0), adj_A.grad.ptr);
     fun_backward_B(T(1.0), A.data.ptr, adj_C.grad.ptr, T(1.0), adj_B.grad.ptr);
+#endif
     WP_TILE_SYNC();
 }
@@ -2109,11 +2251,30 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B,
 {
     using T = typename TileA::Type;
+#if !defined(__CUDA_ARCH__) || WP_ENABLE_MATHDX == 0
+    auto At = tile_transpose(A);
+    auto Bt = tile_transpose(B);
+    partitioned_gemm::scalar_matmul<typename TileC::Layout, typename decltype(Bt)::Layout, typename TileA::Layout>(adj_C.grad, Bt.data, adj_A.grad, T(1.0));
+    partitioned_gemm::scalar_matmul<typename decltype(At)::Layout, typename TileC::Layout, typename TileB::Layout>(At.data, adj_C.grad, adj_B.grad, T(1.0));
+#else
     fun_backward_A(T(1.0), adj_C.grad.ptr, B.data.ptr, T(1.0), adj_A.grad.ptr);
     fun_backward_B(T(1.0), A.data.ptr, adj_C.grad.ptr, T(1.0), adj_B.grad.ptr);
+#endif
     WP_TILE_SYNC();
 }
+#if !defined(__CUDA_ARCH__) || WP_ENABLE_MATHDX == 0
+#define tile_fft()
+#define tile_ifft()
+#define adj_tile_fft()
+#define adj_tile_ifft()
+#else
 // TODO(lcambier): use a properly overaligned complex type that matches cuFFTDx's expectation
 // and remove the need for __align__(16) dtypes data[...]
 #define tile_fft(function_name, dtype, shared_memory_size, batch_size, ept, Xinout) \
@@ -2149,12 +2310,21 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B,
         tile_fft(function_name, dtype, shared_memory_size, batch_size, ept, adj_Xinout); \
     } while (0)
+#endif // !defined(__CUDA_ARCH__) || WP_ENABLE_MATHDX == 0
 template <typename Fwd, typename TileA, typename TileL>
 TileL& tile_cholesky(Fwd fun_forward, TileA& A, TileL& L)
 {
     // Copy to L
     L = A;
+#if !defined(__CUDA_ARCH__) || WP_ENABLE_MATHDX == 0
+    partitioned_gemm::scalar_cholesky(A, L);
+#else
     // Call cholesky on L
     WP_TILE_SYNC();
@@ -2165,7 +2335,7 @@ TileL& tile_cholesky(Fwd fun_forward, TileA& A, TileL& L)
     // Zero-out the upper triangular part of L
     WP_PRAGMA_UNROLL
-    for (int i=threadIdx.x; i < TileL::Layout::Size; i += WP_TILE_BLOCK_DIM)
+    for (int i=WP_TILE_THREAD_IDX; i < TileL::Layout::Size; i += WP_TILE_BLOCK_DIM)
     {
         auto c = TileL::Layout::coord_from_linear(i);
@@ -2174,7 +2344,9 @@ TileL& tile_cholesky(Fwd fun_forward, TileA& A, TileL& L)
     }
     WP_TILE_SYNC();
+#endif
     return L;
 }
@@ -2191,6 +2363,12 @@ TileY& tile_cholesky_solve(Fwd fun_forward, TileL& L, TileX& X, TileY& Y)
     Y = X;
+#if !defined(__CUDA_ARCH__) || WP_ENABLE_MATHDX == 0
+    partitioned_gemm::scalar_cholesky_solve(L, X, Y);
+#else
     // Call cholesky solve on L & y
     WP_TILE_SYNC();
@@ -2199,6 +2377,8 @@ TileY& tile_cholesky_solve(Fwd fun_forward, TileL& L, TileX& X, TileY& Y)
     WP_TILE_SYNC();
+#endif
     return Y;
 }
@@ -2211,7 +2391,7 @@ TileY& tile_cholesky_solve(Fwd fun_forward, TileL& L, TileX& X, TileY& Y)
 template <typename Tile>
 inline CUDA_CALLABLE auto tile_transpose(Tile& t)
 {
-    static_assert(Tile::Layout::Shape::N == 2);
+    static_assert(Tile::Layout::Shape::N == 2, "Expected Tile::Layout::Shape::N == 2");
     // alias incoming tile
     constexpr int M = Tile::Layout::Shape::dim(0);
@@ -2232,13 +2412,34 @@ inline CUDA_CALLABLE void adj_tile_transpose(Tile& t, Tile& adj_t, AdjTile& adj_
     adj_t.assign(tile_add(a,b));
 }
+// 1D overload: returns a tile of shape (N) with stride (StrideN) that aliases the
+// data and gradient pointers of `t`. Nothing is copied; the returned tile is
+// declared with Owner = false.
+template <int N, int StrideN, typename Tile>
+inline CUDA_CALLABLE auto tile_broadcast(Tile& t)
+{
+    using Shape = tile_shape_t<N>;
+    using Stride = tile_stride_t<StrideN>;
+    using View = tile_shared_t<typename Tile::Type, tile_layout_strided_t<Shape, Stride>, false>;
+
+    // same storage as the incoming tile, seen through the new shape/strides
+    return View(t.data.ptr, t.grad.ptr);
+}
 template <int M, int N, int StrideM, int StrideN, typename Tile>
 inline CUDA_CALLABLE auto tile_broadcast(Tile& t)
-{
+{
     // alias incoming tile with new strides
     return tile_shared_t<typename Tile::Type, tile_layout_strided_t<tile_shape_t<M, N>, tile_stride_t<StrideM, StrideN>>, false>(t.data.ptr, t.grad.ptr);
 }
+// 3D overload: returns a tile of shape (M, N, O) with strides (StrideM, StrideN, StrideO)
+// that aliases the data and gradient pointers of `t`. Nothing is copied; the returned
+// tile is declared with Owner = false.
+template <int M, int N, int O, int StrideM, int StrideN, int StrideO, typename Tile>
+inline CUDA_CALLABLE auto tile_broadcast(Tile& t)
+{
+    using Shape = tile_shape_t<M, N, O>;
+    using Stride = tile_stride_t<StrideM, StrideN, StrideO>;
+    using View = tile_shared_t<typename Tile::Type, tile_layout_strided_t<Shape, Stride>, false>;
+
+    // same storage as the incoming tile, seen through the new shape/strides
+    return View(t.data.ptr, t.grad.ptr);
+}
+// 4D overload: returns a tile of shape (M, N, O, P) with strides
+// (StrideM, StrideN, StrideO, StrideP) that aliases the data and gradient pointers
+// of `t`. Nothing is copied; the returned tile is declared with Owner = false.
+template <int M, int N, int O, int P, int StrideM, int StrideN, int StrideO, int StrideP, typename Tile>
+inline CUDA_CALLABLE auto tile_broadcast(Tile& t)
+{
+    using Shape = tile_shape_t<M, N, O, P>;
+    using Stride = tile_stride_t<StrideM, StrideN, StrideO, StrideP>;
+    using View = tile_shared_t<typename Tile::Type, tile_layout_strided_t<Shape, Stride>, false>;
+
+    // same storage as the incoming tile, seen through the new shape/strides
+    return View(t.data.ptr, t.grad.ptr);
+}
 template <typename Tile, typename AdjTile>
 inline CUDA_CALLABLE void adj_tile_broadcast(Tile& t, Tile& adj_t, AdjTile& adj_ret)
 {
@@ -2252,7 +2453,7 @@ inline CUDA_CALLABLE auto tile_view(Tile& t, Indices... indices)
     // return new tile with same strides
     typename Tile::Type* data_ptr = &t.data(c);
-    typename Tile::Type* grad_ptr = NULL;
+    typename Tile::Type* grad_ptr = nullptr;
     if (t.grad.ptr)
         grad_ptr = &t.grad(c);
@@ -2297,7 +2498,7 @@ inline CUDA_CALLABLE void tile_assign(TileA& dest, TileB& src, const Coord& offs
 {
     using Layout = typename TileB::Layout;
-    for (int t=threadIdx.x; t < Layout::Size; t += WP_TILE_BLOCK_DIM)
+    for (int t=WP_TILE_THREAD_IDX; t < Layout::Size; t += WP_TILE_BLOCK_DIM)
     {
         auto c = Layout::coord_from_linear(t);
         dest.data(c + offset) = src.data(c);
@@ -2312,7 +2513,7 @@ inline CUDA_CALLABLE void adj_tile_assign(TileA& dest, TileB& src, Coord offset,
 {
     using Layout = typename TileB::Layout;
-    for (int t=threadIdx.x; t < Layout::Size; t += WP_TILE_BLOCK_DIM)
+    for (int t=WP_TILE_THREAD_IDX; t < Layout::Size; t += WP_TILE_BLOCK_DIM)
     {
         auto c = Layout::coord_from_linear(t);
         src.grad(c) += dest.grad(c + offset);
@@ -2351,14 +2552,14 @@ inline CUDA_CALLABLE TileC& tile_diag_add(TileA& a, TileB& b, TileC& c)
     using ShapeB = typename TileB::Layout::Shape;
     using ShapeC = typename TileC::Layout::Shape;
-    static_assert(ShapeA::dim(0) == ShapeA::dim(1));
-    static_assert(ShapeB::dim(0) == ShapeA::dim(0));
-    static_assert(ShapeC::dim(0) == ShapeA::dim(0));
-    static_assert(ShapeC::dim(0) == ShapeC::dim(1));
+    static_assert(ShapeA::dim(0) == ShapeA::dim(1), "Expected ShapeA::dim(0) == ShapeA::dim(1)");
+    static_assert(ShapeB::dim(0) == ShapeA::dim(0), "Expected ShapeB::dim(0) == ShapeA::dim(0)");
+    static_assert(ShapeC::dim(0) == ShapeA::dim(0), "Expected ShapeC::dim(0) == ShapeA::dim(0)");
+    static_assert(ShapeC::dim(0) == ShapeC::dim(1), "Expected ShapeC::dim(0) == ShapeC::dim(1)");
     c = a;
-    for (int t=threadIdx.x; t < ShapeA::dim(0); t += WP_TILE_BLOCK_DIM)
+    for (int t=WP_TILE_THREAD_IDX; t < ShapeA::dim(0); t += WP_TILE_BLOCK_DIM)
     {
         c.data(tile_coord(t, t)) += b.data(tile_coord(t));
     }
@@ -2377,3 +2578,7 @@ inline CUDA_CALLABLE void adj_tile_diag_add(TileA& a, TileB& b, TileC& c, AdjTil
 } // namespace wp
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif