PyPI - warp-lang - Versions diffs - 1.7.2rc1__py3-none-win_amd64.whl → 1.8.1__py3-none-win_amd64.whl - Mend

warp-lang 1.7.2rc1__py3-none-win_amd64.whl → 1.8.1__py3-none-win_amd64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (193)

warp/__init__.py +3 -1
warp/__init__.pyi +3489 -1
warp/autograd.py +45 -122
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +241 -252
warp/build_dll.py +130 -26
warp/builtins.py +1907 -384
warp/codegen.py +272 -104
warp/config.py +12 -1
warp/constants.py +1 -1
warp/context.py +770 -238
warp/dlpack.py +1 -1
warp/examples/benchmarks/benchmark_cloth.py +2 -2
warp/examples/benchmarks/benchmark_tile_sort.py +155 -0
warp/examples/core/example_sample_mesh.py +1 -1
warp/examples/core/example_spin_lock.py +93 -0
warp/examples/core/example_work_queue.py +118 -0
warp/examples/fem/example_adaptive_grid.py +5 -5
warp/examples/fem/example_apic_fluid.py +1 -1
warp/examples/fem/example_burgers.py +1 -1
warp/examples/fem/example_convection_diffusion.py +9 -6
warp/examples/fem/example_darcy_ls_optimization.py +489 -0
warp/examples/fem/example_deformed_geometry.py +1 -1
warp/examples/fem/example_diffusion.py +2 -2
warp/examples/fem/example_diffusion_3d.py +1 -1
warp/examples/fem/example_distortion_energy.py +1 -1
warp/examples/fem/example_elastic_shape_optimization.py +387 -0
warp/examples/fem/example_magnetostatics.py +5 -3
warp/examples/fem/example_mixed_elasticity.py +5 -3
warp/examples/fem/example_navier_stokes.py +11 -9
warp/examples/fem/example_nonconforming_contact.py +5 -3
warp/examples/fem/example_streamlines.py +8 -3
warp/examples/fem/utils.py +9 -8
warp/examples/interop/example_jax_callable.py +34 -4
warp/examples/interop/example_jax_ffi_callback.py +2 -2
warp/examples/interop/example_jax_kernel.py +27 -1
warp/examples/optim/example_drone.py +1 -1
warp/examples/sim/example_cloth.py +1 -1
warp/examples/sim/example_cloth_self_contact.py +48 -54
warp/examples/tile/example_tile_block_cholesky.py +502 -0
warp/examples/tile/example_tile_cholesky.py +2 -1
warp/examples/tile/example_tile_convolution.py +1 -1
warp/examples/tile/example_tile_filtering.py +1 -1
warp/examples/tile/example_tile_matmul.py +1 -1
warp/examples/tile/example_tile_mlp.py +2 -0
warp/fabric.py +7 -7
warp/fem/__init__.py +5 -0
warp/fem/adaptivity.py +1 -1
warp/fem/cache.py +152 -63
warp/fem/dirichlet.py +2 -2
warp/fem/domain.py +136 -6
warp/fem/field/field.py +141 -99
warp/fem/field/nodal_field.py +85 -39
warp/fem/field/virtual.py +99 -52
warp/fem/geometry/adaptive_nanogrid.py +91 -86
warp/fem/geometry/closest_point.py +13 -0
warp/fem/geometry/deformed_geometry.py +102 -40
warp/fem/geometry/element.py +56 -2
warp/fem/geometry/geometry.py +323 -22
warp/fem/geometry/grid_2d.py +157 -62
warp/fem/geometry/grid_3d.py +116 -20
warp/fem/geometry/hexmesh.py +86 -20
warp/fem/geometry/nanogrid.py +166 -86
warp/fem/geometry/partition.py +59 -25
warp/fem/geometry/quadmesh.py +86 -135
warp/fem/geometry/tetmesh.py +47 -119
warp/fem/geometry/trimesh.py +77 -270
warp/fem/integrate.py +181 -95
warp/fem/linalg.py +25 -58
warp/fem/operator.py +124 -27
warp/fem/quadrature/pic_quadrature.py +36 -14
warp/fem/quadrature/quadrature.py +40 -16
warp/fem/space/__init__.py +1 -1
warp/fem/space/basis_function_space.py +66 -46
warp/fem/space/basis_space.py +17 -4
warp/fem/space/dof_mapper.py +1 -1
warp/fem/space/function_space.py +2 -2
warp/fem/space/grid_2d_function_space.py +4 -1
warp/fem/space/hexmesh_function_space.py +4 -2
warp/fem/space/nanogrid_function_space.py +3 -1
warp/fem/space/partition.py +11 -2
warp/fem/space/quadmesh_function_space.py +4 -1
warp/fem/space/restriction.py +5 -2
warp/fem/space/shape/__init__.py +10 -8
warp/fem/space/tetmesh_function_space.py +4 -1
warp/fem/space/topology.py +52 -21
warp/fem/space/trimesh_function_space.py +4 -1
warp/fem/utils.py +53 -8
warp/jax.py +1 -2
warp/jax_experimental/ffi.py +210 -67
warp/jax_experimental/xla_ffi.py +37 -24
warp/math.py +171 -1
warp/native/array.h +103 -4
warp/native/builtin.h +182 -35
warp/native/coloring.cpp +6 -2
warp/native/cuda_util.cpp +1 -1
warp/native/exports.h +118 -63
warp/native/intersect.h +5 -5
warp/native/mat.h +8 -13
warp/native/mathdx.cpp +11 -5
warp/native/matnn.h +1 -123
warp/native/mesh.h +1 -1
warp/native/quat.h +34 -6
warp/native/rand.h +7 -7
warp/native/sparse.cpp +121 -258
warp/native/sparse.cu +181 -274
warp/native/spatial.h +305 -17
warp/native/svd.h +23 -8
warp/native/tile.h +603 -73
warp/native/tile_radix_sort.h +1112 -0
warp/native/tile_reduce.h +239 -13
warp/native/tile_scan.h +240 -0
warp/native/tuple.h +189 -0
warp/native/vec.h +10 -20
warp/native/warp.cpp +36 -4
warp/native/warp.cu +588 -52
warp/native/warp.h +47 -74
warp/optim/linear.py +5 -1
warp/paddle.py +7 -8
warp/py.typed +0 -0
warp/render/render_opengl.py +110 -80
warp/render/render_usd.py +124 -62
warp/sim/__init__.py +9 -0
warp/sim/collide.py +253 -80
warp/sim/graph_coloring.py +8 -1
warp/sim/import_mjcf.py +4 -3
warp/sim/import_usd.py +11 -7
warp/sim/integrator.py +5 -2
warp/sim/integrator_euler.py +1 -1
warp/sim/integrator_featherstone.py +1 -1
warp/sim/integrator_vbd.py +761 -322
warp/sim/integrator_xpbd.py +1 -1
warp/sim/model.py +265 -260
warp/sim/utils.py +10 -7
warp/sparse.py +303 -166
warp/tape.py +54 -51
warp/tests/cuda/test_conditional_captures.py +1046 -0
warp/tests/cuda/test_streams.py +1 -1
warp/tests/geometry/test_volume.py +2 -2
warp/tests/interop/test_dlpack.py +9 -9
warp/tests/interop/test_jax.py +0 -1
warp/tests/run_coverage_serial.py +1 -1
warp/tests/sim/disabled_kinematics.py +2 -2
warp/tests/sim/{test_vbd.py → test_cloth.py} +378 -112
warp/tests/sim/test_collision.py +159 -51
warp/tests/sim/test_coloring.py +91 -2
warp/tests/test_array.py +254 -2
warp/tests/test_array_reduce.py +2 -2
warp/tests/test_assert.py +53 -0
warp/tests/test_atomic_cas.py +312 -0
warp/tests/test_codegen.py +142 -19
warp/tests/test_conditional.py +47 -1
warp/tests/test_ctypes.py +0 -20
warp/tests/test_devices.py +8 -0
warp/tests/test_fabricarray.py +4 -2
warp/tests/test_fem.py +58 -25
warp/tests/test_func.py +42 -1
warp/tests/test_grad.py +1 -1
warp/tests/test_lerp.py +1 -3
warp/tests/test_map.py +481 -0
warp/tests/test_mat.py +23 -24
warp/tests/test_quat.py +28 -15
warp/tests/test_rounding.py +10 -38
warp/tests/test_runlength_encode.py +7 -7
warp/tests/test_smoothstep.py +1 -1
warp/tests/test_sparse.py +83 -2
warp/tests/test_spatial.py +507 -1
warp/tests/test_static.py +48 -0
warp/tests/test_struct.py +2 -2
warp/tests/test_tape.py +38 -0
warp/tests/test_tuple.py +265 -0
warp/tests/test_types.py +2 -2
warp/tests/test_utils.py +24 -18
warp/tests/test_vec.py +38 -408
warp/tests/test_vec_constructors.py +325 -0
warp/tests/tile/test_tile.py +438 -131
warp/tests/tile/test_tile_mathdx.py +518 -14
warp/tests/tile/test_tile_matmul.py +179 -0
warp/tests/tile/test_tile_reduce.py +307 -5
warp/tests/tile/test_tile_shared_memory.py +136 -7
warp/tests/tile/test_tile_sort.py +121 -0
warp/tests/unittest_suites.py +14 -6
warp/types.py +462 -308
warp/utils.py +647 -86
{warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/METADATA +20 -6
{warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/RECORD +190 -176
warp/stubs.py +0 -3381
warp/tests/sim/test_xpbd.py +0 -399
warp/tests/test_mlp.py +0 -282
{warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/WHEEL +0 -0
{warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/licenses/LICENSE.md +0 -0
{warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/top_level.txt +0 -0

warp/native/tile.h CHANGED Viewed

@@ -803,7 +803,7 @@ struct tile_layout_strided_t
     }
     // checks whether a strided layout is unique, i.e.: if memory locations are only
-    // every referred to by one element in the tile, this is a basic test that only
+    // ever referred to by one element in the tile, this is a basic test that only
     // checks for broadcast dimensions, it would be possible to do the full check
     // using sorted shape/strides in Python and add it as a template parameter to the type
     static constexpr bool is_unique()
@@ -912,33 +912,27 @@ struct tile_shared_t
     }
     // assign from a register tile
-    template <typename Tile>
-    inline CUDA_CALLABLE auto& operator=(const Tile& t)
+    inline CUDA_CALLABLE auto& operator=(const tile_register_t<Type, tile_layout_register_t<typename Layout::Shape>>& t)
     {
         assign(t);
         return *this;
     }
-/*
     // construct from another shared tile, this constructor
     // is invoked for reshape operations like `wp.tile_transpose()`
-    template <typename OtherT, typename OtherLayout>
-    inline CUDA_CALLABLE auto& operator=(const tile_shared_t<OtherT, OtherLayout>& rhs)
+    template <typename OtherT, typename OtherLayout, bool OtherOwner>
+    inline CUDA_CALLABLE auto& operator=(const tile_shared_t<OtherT, OtherLayout, OtherOwner>& rhs)
     {
-        using OtherTile = tile_shared_t<OtherT, OtherLayout>;
         // check dimensions are compatible
-        static_assert(Size == OtherTile::Size, "Expected Size == OtherTile::Size");
+        static_assert(Layout::Size == OtherLayout::Size, "Expected Size == OtherLayout::Size");
         // alias tile directly
-        data = rhs.data;
-        grad = rhs.grad;
+        data.ptr = rhs.data.ptr;
+        grad.ptr = rhs.grad.ptr;
         initialized = rhs.initialized;
         return *this;
     }
-*/
     // assign from a global tile (load)
     inline CUDA_CALLABLE auto& operator=(const tile_global_t<T, typename Layout::Shape>& t)
@@ -989,6 +983,37 @@ struct tile_shared_t
         WP_TILE_SYNC();
     }
+    // add scalar value onto a single tile element
+    inline CUDA_CALLABLE void add_inplace(const typename Layout::Coord& c, const Type& x)
+    {
+        // since multiple threads may add to the same element
+        // we need to accumulate using atomic operations
+        wp::atomic_add(&data(c), x);
+        WP_TILE_SYNC();
+    }
+    // backward of inplace scalar addition
+    inline CUDA_CALLABLE void adj_add_inplace(const typename Layout::Coord& c, Type& adj_x)
+    {
+        adj_x += grad(c);
+    }
+    // subtract scalar value from a single tile element
+    inline CUDA_CALLABLE void sub_inplace(const typename Layout::Coord& c, const Type& x)
+    {
+        // since multiple threads may add to the same element
+        // we need to accumulate using atomic operations
+        wp::atomic_add(&data(c), -x);
+        WP_TILE_SYNC();
+    }
+    // backward of inplace scalar subtraction
+    inline CUDA_CALLABLE void adj_sub_inplace(const typename Layout::Coord& c, Type& adj_x)
+    {
+        adj_x -= grad(c);
+    }
     // copy register tile to shared
     template <typename Tile>
@@ -1472,10 +1497,10 @@ inline CUDA_CALLABLE void adj_print(const tile_shared_t<T, L, Owner>& t, const t
 // helpers to allocate shared tiles
-template <typename T, typename Shape, bool RequiresGrad>
+template <typename T, typename Shape, typename Strides, bool RequiresGrad>
 inline CUDA_CALLABLE auto tile_alloc_empty()
-{   constexpr int size = Shape::size();
+{
+    constexpr int size = Shape::size();
     T* data = (T*)tile_alloc_shared(size*sizeof(T));
     T* grad = nullptr;
@@ -1503,7 +1528,7 @@ inline CUDA_CALLABLE auto tile_alloc_empty()
         WP_TILE_SYNC();
     }
-    return tile_shared_t<T, tile_layout_strided_t<Shape>>(data, grad);
+    return tile_shared_t<T, tile_layout_strided_t<Shape, Strides>>(data, grad);
 }
@@ -1532,37 +1557,56 @@ inline CUDA_CALLABLE auto tile(const wp::vec_t<Length, T>& x)
     using Layout = typename decltype(result)::Layout;
     static_assert(Layout::NumRegs == Length, "Expected Layout::NumRegs == Length");
-    for (int i=0; i < Length; ++i)
+    for (unsigned i=0; i < Length; ++i)
         result.data[i] = x[i];
     return result;
 }
-// construct a tile from a local SIMT value (one per-thread)
-template <typename T, typename AdjTile>
-inline CUDA_CALLABLE void adj_tile(const T& x, T& adj_x, AdjTile& adj_ret)
+// overload for constructing a tile from a per-thread matrix
+template <unsigned Rows, unsigned Cols, typename T>
+inline CUDA_CALLABLE auto tile(const wp::mat_t<Rows, Cols, T>& x)
 {
-    static_assert(AdjTile::Layout::Shape::N == 1, "Expected AdjTile::Layout::Shape::N == 1");
-    static_assert(AdjTile::Layout::Shape::dim(0) == WP_TILE_BLOCK_DIM, "Expected AdjTile::Layout::Shape::dim(0) == WP_TILE_BLOCK_DIM");
+    tile_register_t<T, tile_layout_register_t<tile_shape_t<Rows, Cols, WP_TILE_BLOCK_DIM>>> result;
-    auto adj_reg = adj_ret.copy_to_register();
+    using Layout = typename decltype(result)::Layout;
+    static_assert(Layout::NumRegs == Rows*Cols, "Expected Layout::NumRegs == Rows*Cols");
+    for (unsigned i=0; i < Rows; ++i)
+        for (unsigned j=0; j < Cols; ++j)
+            result.data[i*Cols + j] = x.data[i][j];
-    adj_x += adj_reg.data[0];
+    return result;
 }
-template <typename T, unsigned Length, typename AdjTile>
-inline CUDA_CALLABLE void adj_tile(const wp::vec_t<Length, T>& x, wp::vec_t<Length, T>& adj_x, AdjTile& adj_ret)
+// it is sufficient to use a single adjoint for all tile overload funcs
+// it is also necessary, because we don't provide a dispatch_func for adjoint calls
+// so the compiler will default to choosing based on argument types
+template <typename T, typename AdjTile>
+inline CUDA_CALLABLE void adj_tile(const T& x, T& adj_x, AdjTile& adj_ret)
 {
-    static_assert(AdjTile::Layout::Shape::N == 2, "Expected AdjTile::Layout::Shape::N == 2");
-    static_assert(AdjTile::Layout::Shape::dim(0) == Length, "Expected AdjTile::Layout::Shape::dim(0) == Length");
-    static_assert(AdjTile::Layout::Shape::dim(1) == WP_TILE_BLOCK_DIM, "Expected AdjTile::Layout::Shape::dim(1) == WP_TILE_BLOCK_DIM");
+    static_assert(AdjTile::Layout::Shape::dim(AdjTile::Layout::Shape::N - 1) == WP_TILE_BLOCK_DIM, "Expected AdjTile::Layout::Shape::dim(AdjTile::Layout::Shape::N - 1) == WP_TILE_BLOCK_DIM");
     auto adj_reg = adj_ret.copy_to_register();
-    for (int i=0; i < Length; ++i)
-        adj_x[i] += adj_reg.data[i];
+    if constexpr (AdjTile::Layout::Shape::N == 1)
+    {
+        adj_x += adj_reg.data[0];
+    }
+    else if constexpr (AdjTile::Layout::Shape::N == 2)
+    {
+        for (unsigned i=0; i < AdjTile::Layout::Shape::dim(0); ++i)
+            adj_x[i] += adj_reg.data[i];
+    }
+    else if constexpr (AdjTile::Layout::Shape::N == 3)
+    {
+        for (unsigned i=0; i < AdjTile::Layout::Shape::dim(0); ++i)
+            for (unsigned j=0; j < AdjTile::Layout::Shape::dim(1); ++j)
+                adj_x.data[i][j] += adj_reg.data[i*AdjTile::Layout::Shape::dim(1) + j];
+    }
 }
 template <typename Tile>
 inline CUDA_CALLABLE auto untile(Tile& tile)
 {
@@ -1589,6 +1633,19 @@ inline CUDA_CALLABLE auto untile(Tile& tile)
         return v;
     }
+    // matrix case
+    if constexpr(N == 3)
+    {
+        constexpr int Rows = Tile::Layout::Shape::dim(0);
+        constexpr int Cols = Tile::Layout::Shape::dim(1);
+        wp::mat_t<Rows, Cols, typename Tile::Type> m;
+        for (int i=0; i < Rows; ++i)
+            for (int j=0; j < Cols; ++j)
+                m.data[i][j] = reg.data[i*Cols + j];
+        return m;
+    }
 }
 template <typename Tile, typename Value>
@@ -1612,6 +1669,16 @@ inline CUDA_CALLABLE void adj_untile(Tile& tile, Tile& adj_tile, Value& adj_ret)
             adj.data[i] += adj_ret[i];
     }
+    // matrix case
+    if constexpr(N == 3)
+    {
+        constexpr int Rows = Tile::Layout::Shape::dim(0);
+        constexpr int Cols = Tile::Layout::Shape::dim(1);
+        for (int i=0; i < Rows; ++i)
+            for (int j=0; j < Cols; ++j)
+                adj.data[i*Cols + j] += adj_ret.data[i][j];
+    }
     adj_tile.assign(adj);
 }
@@ -1893,6 +1960,27 @@ inline CUDA_CALLABLE auto tile_add(TileA& a, TileB& b)
     return tile_binary_map(add, a, b);
 }
+// add overloads get called in user function adjoints generated by codegen (adj_tile += adj_ret)
+template <typename T, typename L>
+inline CUDA_CALLABLE auto add(tile_register_t<T, L>& a, const tile_register_t<T, L>& b) {
+    return tile_add(a, b);
+}
+template <typename T, typename L, bool Owner>
+inline CUDA_CALLABLE auto add(tile_shared_t<T, L, Owner>& a, const tile_shared_t<T, L, Owner>& b) {
+    return tile_add(a, b);
+}
+template <typename T, typename L, bool Owner>
+inline CUDA_CALLABLE auto add(tile_register_t<T, L>& a, const tile_shared_t<T, L, Owner>& b) {
+    return tile_add(a, b);
+}
+template <typename T, typename L, bool Owner>
+inline CUDA_CALLABLE auto add(tile_shared_t<T, L, Owner>& a, const tile_register_t<T, L>& b) {
+    return tile_add(a, b);
+}
 template <typename TileA, typename TileB, typename AdjTileA, typename AdjTileB, typename AdjTile>
 inline CUDA_CALLABLE void adj_tile_add(TileA& a, TileB& b, AdjTileA& adj_a, AdjTileB& adj_b, AdjTile& adj_c)
 {
@@ -1961,6 +2049,126 @@ inline CUDA_CALLABLE void adj_tile_mul(const typename Tile::Type& s, Tile& a,
 }
+template <typename TileA, typename TileB>
+inline CUDA_CALLABLE void tile_add_inplace(TileA& a, TileB& b)
+{
+    using ShapeA = typename TileA::Layout::Shape;
+    using ShapeB = typename TileB::Layout::Shape;
+    // verify shapes and sizes are compatible
+    static_assert(ShapeA::N == ShapeB::N, "Tile shapes must match for inplace addition");
+    static_assert(ShapeA::size() == ShapeB::size(), "Tile sizes must match for inplace addition");
+    auto a_reg = a.copy_to_register();
+    auto b_reg = b.copy_to_register();
+    using Layout = typename decltype(b_reg)::Layout;
+    WP_PRAGMA_UNROLL
+    for (int i=0; i < Layout::NumRegs; ++i)
+    {
+        const int linear = Layout::linear_from_register(i);
+        if(!Layout::valid(linear))
+            break;
+        a_reg.data[i] += b_reg.data[i];
+    }
+    a.assign(a_reg);
+}
+template <typename TileA, typename TileB, typename AdjTileA, typename AdjTileB>
+inline CUDA_CALLABLE void adj_tile_add_inplace(TileA& a, TileB& b, AdjTileA& adj_a, AdjTileB& adj_b)
+{
+    using ShapeA = typename TileA::Layout::Shape;
+    using ShapeB = typename TileB::Layout::Shape;
+    // verify shapes and sizes are compatible
+    static_assert(ShapeA::N == ShapeB::N, "Tile shapes must match for inplace addition");
+    static_assert(ShapeA::size() == ShapeB::size(), "Tile sizes must match for inplace addition");
+    // allocate storage for adjoints
+    auto adj_a_reg = adj_a.grad_to_register();
+    auto adj_b_reg = tile_register_like<TileB>();
+    using Layout = typename decltype(adj_a_reg)::Layout;
+    WP_PRAGMA_UNROLL
+    for (int i=0; i < Layout::NumRegs; ++i)
+    {
+        const int linear = Layout::linear_from_register(i);
+        if(!Layout::valid(linear))
+            break;
+        adj_b_reg.data[i] += adj_a_reg.data[i];
+    }
+    adj_b.grad_add(adj_b_reg);
+}
+template <typename TileA, typename TileB>
+inline CUDA_CALLABLE void tile_sub_inplace(TileA& a, TileB& b)
+{
+    using ShapeA = typename TileA::Layout::Shape;
+    using ShapeB = typename TileB::Layout::Shape;
+    // verify shapes and sizes are compatible
+    static_assert(ShapeA::N == ShapeB::N, "Tile shapes must match for inplace subtraction");
+    static_assert(ShapeA::size() == ShapeB::size(), "Tile sizes must match for inplace subtraction");
+    // work with register tiles for inplace operations, regardless of the storage type of the input tiles
+    auto a_reg = a.copy_to_register();
+    auto b_reg = b.copy_to_register();
+    using Layout = typename decltype(a_reg)::Layout;
+    WP_PRAGMA_UNROLL
+    for (int i=0; i < Layout::NumRegs; ++i)
+    {
+        const int linear = Layout::linear_from_register(i);
+        if(!Layout::valid(linear))
+            break;
+        a_reg.data[i] -= b_reg.data[i];
+    }
+    a.assign(a_reg);
+}
+template <typename TileA, typename TileB, typename AdjTileA, typename AdjTileB>
+inline CUDA_CALLABLE void adj_tile_sub_inplace(TileA& a, TileB& b, AdjTileA& adj_a, AdjTileB& adj_b)
+{
+    using ShapeA = typename TileA::Layout::Shape;
+    using ShapeB = typename TileB::Layout::Shape;
+    // verify shapes and sizes are compatible
+    static_assert(ShapeA::N == ShapeB::N, "Tile shapes must match for inplace subtraction");
+    static_assert(ShapeA::size() == ShapeB::size(), "Tile sizes must match for inplace subtraction");
+    // allocate storage for adjoints
+    auto adj_a_reg = adj_a.grad_to_register();
+    auto adj_b_reg = tile_register_like<TileB>();
+    using Layout = typename decltype(adj_a_reg)::Layout;
+    WP_PRAGMA_UNROLL
+    for (int i=0; i < Layout::NumRegs; ++i)
+    {
+        const int linear = Layout::linear_from_register(i);
+        if(!Layout::valid(linear))
+            break;
+        adj_b_reg.data[i] -= adj_a_reg.data[i];
+    }
+    adj_b.grad_add(adj_b_reg);
+}
 template<typename Tile>
 typename Tile::Type tile_extract(Tile& t, int i) { return t.extract(tile_coord(i)); }
 template<typename Tile>
@@ -1970,7 +2178,6 @@ typename Tile::Type tile_extract(Tile& t, int i, int j, int k) { return t.extrac
 template<typename Tile>
 typename Tile::Type tile_extract(Tile& t, int i, int j, int k, int l) { return t.extract(tile_coord(i,j,k,l)); }
 template<typename Tile, typename AdjTile>
 void adj_tile_extract(Tile& t, int i, AdjTile& adj_t, int adj_i, typename Tile::Type adj_ret) { adj_t.adj_extract(tile_coord(i), adj_ret); }
 template<typename Tile, typename AdjTile>
@@ -1981,6 +2188,42 @@ template<typename Tile, typename AdjTile>
 void adj_tile_extract(Tile& t, int i, int j, int k, int l, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, int adj_l, typename Tile::Type adj_ret) { adj_t.adj_extract(tile_coord(i, j, k, l), adj_ret); }
+template<typename Tile>
+void tile_add_inplace(Tile& t, int i, typename Tile::Type value) { t.add_inplace(tile_coord(i), value); }
+template<typename Tile>
+void tile_add_inplace(Tile& t, int i, int j, typename Tile::Type value) { t.add_inplace(tile_coord(i,j), value); }
+template<typename Tile>
+void tile_add_inplace(Tile& t, int i, int j, int k, typename Tile::Type value) { t.add_inplace(tile_coord(i,j,k), value); }
+template<typename Tile>
+void tile_add_inplace(Tile& t, int i, int j, int k, int l, typename Tile::Type value) { t.add_inplace(tile_coord(i,j,k,l), value); }
+template<typename Tile>
+void tile_sub_inplace(Tile& t, int i, typename Tile::Type value) { t.sub_inplace(tile_coord(i), value); }
+template<typename Tile>
+void tile_sub_inplace(Tile& t, int i, int j, typename Tile::Type value) { t.sub_inplace(tile_coord(i,j), value); }
+template<typename Tile>
+void tile_sub_inplace(Tile& t, int i, int j, int k, typename Tile::Type value) { t.sub_inplace(tile_coord(i,j,k), value); }
+template<typename Tile>
+void tile_sub_inplace(Tile& t, int i, int j, int k, int l, typename Tile::Type value) { t.sub_inplace(tile_coord(i,j,k,l), value); }
+template<typename Tile, typename AdjTile>
+void adj_tile_add_inplace(Tile& t, int i, typename Tile::Type value, AdjTile& adj_t, int adj_i, typename Tile::Type& adj_value) { adj_t.adj_add_inplace(tile_coord(i), adj_value); }
+template<typename Tile, typename AdjTile>
+void adj_tile_add_inplace(Tile& t, int i, int j, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, typename Tile::Type& adj_value) { adj_t.adj_add_inplace(tile_coord(i, j), adj_value); }
+template<typename Tile, typename AdjTile>
+void adj_tile_add_inplace(Tile& t, int i, int j, int k, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, typename Tile::Type& adj_value) { adj_t.adj_add_inplace(tile_coord(i, j, k), adj_value); }
+template<typename Tile, typename AdjTile>
+void adj_tile_add_inplace(Tile& t, int i, int j, int k, int l, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, int adj_l, typename Tile::Type& adj_value) { adj_t.adj_add_inplace(tile_coord(i, j, k, l), adj_value); }
+template<typename Tile, typename AdjTile>
+void adj_tile_sub_inplace(Tile& t, int i, typename Tile::Type value, AdjTile& adj_t, int adj_i, typename Tile::Type& adj_value) { adj_t.adj_sub_inplace(tile_coord(i), adj_value); }
+template<typename Tile, typename AdjTile>
+void adj_tile_sub_inplace(Tile& t, int i, int j, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, typename Tile::Type& adj_value) { adj_t.adj_sub_inplace(tile_coord(i, j), adj_value); }
+template<typename Tile, typename AdjTile>
+void adj_tile_sub_inplace(Tile& t, int i, int j, int k, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, typename Tile::Type& adj_value) { adj_t.adj_sub_inplace(tile_coord(i, j, k), adj_value); }
+template<typename Tile, typename AdjTile>
+void adj_tile_sub_inplace(Tile& t, int i, int j, int k, int l, typename Tile::Type value, AdjTile& adj_t, int adj_i, int adj_j, int adj_k, int adj_l, typename Tile::Type& adj_value) { adj_t.adj_sub_inplace(tile_coord(i, j, k, l), adj_value); }
 namespace partitioned_gemm
 {
@@ -2177,33 +2420,98 @@ inline CUDA_CALLABLE void scalar_cholesky(TileA& A, TileL& L)
     }
 }
+// Writes into X
 template <typename TileL, typename TileX, typename TileY>
-inline CUDA_CALLABLE void scalar_cholesky_solve(TileL& L, TileX& X, TileY& Y)
+inline CUDA_CALLABLE void scalar_cholesky_forward_substitution(TileL& L, TileX& X, TileY& Y)
 {
-    using T = typename TileL::Type;
-    constexpr int n = TileL::Layout::Shape::dim(1);
+    using T = typename TileL::Type;
-    for (int i=0; i < n; ++i)
+    if constexpr (TileY::Layout::Shape::N == 1)
     {
-        T s = Y.data(tile_coord(i));
+        constexpr int n = TileL::Layout::Shape::dim(1);
+        for (int i=0; i < n; ++i)
+        {
+            T s = Y.data(tile_coord(i));
-        for (int j=0; j < i; ++j)
-            s -= L.data(tile_coord(i,j)) * X.data(tile_coord(j));
+            for (int j=0; j < i; ++j)
+                s -= L.data(tile_coord(i,j)) * X.data(tile_coord(j));
-        X.data(tile_coord(i)) = s / L.data(tile_coord(i, i));
+            T diag = L.data(tile_coord(i, i));
+            X.data(tile_coord(i)) = (diag != T(0.0f)) ? s / diag : s;
+        }
     }
+    else if constexpr (TileY::Layout::Shape::N == 2)
+    {
+        constexpr int n = TileL::Layout::Shape::dim(1);
+        constexpr int m = TileY::Layout::Shape::dim(1);
+        for (int k=0; k < m; ++k)
+        {
+            for (int i=0; i < n; ++i)
+            {
+                T s = Y.data(tile_coord(i,k));
+                for (int j=0; j < i; ++j)
+                    s -= L.data(tile_coord(i,j)) * X.data(tile_coord(j,k));
+                T diag = L.data(tile_coord(i, i));
+                X.data(tile_coord(i,k)) = (diag != T(0.0f)) ? s / diag : s;
+            }
+        }
+    }
+}
+// Reads and writes X
+template <typename TileL, typename TileX>
+inline CUDA_CALLABLE void scalar_cholesky_back_substitution(TileL& L, TileX& X)
+{
+    using T = typename TileL::Type;
+    if constexpr (TileX::Layout::Shape::N == 1)
+    {
+        constexpr int n = TileL::Layout::Shape::dim(1);
+        for (int i=n-1; i >= 0; --i)
+        {
+            T s = X.data(tile_coord(i));
-    for (int i=n-1; i >= 0; --i)
+            for (int j=i+1; j < n; ++j)
+                s -= L.data(tile_coord(j, i)) * X.data(tile_coord(j));
+            T diag = L.data(tile_coord(i, i));
+            X.data(tile_coord(i)) = (diag != T(0.0f)) ? s / diag : s;
+        }
+    }
+    else if constexpr (TileX::Layout::Shape::N == 2)
     {
-        T s = X.data(tile_coord(i));
+        constexpr int n = TileL::Layout::Shape::dim(1);
+        constexpr int m = TileX::Layout::Shape::dim(1);
-        for (int j=i+1; j < n; ++j)
-            s -= L.data(tile_coord(j, i)) * X.data(tile_coord(j));
+        for (int k=0; k < m; ++k)
+        {
+            for (int i=n-1; i >= 0; --i)
+            {
+                T s = X.data(tile_coord(i,k));
+                for (int j=i+1; j < n; ++j)
+                    s -= L.data(tile_coord(j, i)) * X.data(tile_coord(j,k));
-        X.data(tile_coord(i)) = s / L.data(tile_coord(i, i));
+                T diag = L.data(tile_coord(i, i));
+                X.data(tile_coord(i,k)) = (diag != T(0.0f)) ? s / diag : s;
+            }
+        }
     }
 }
+template <typename TileL, typename TileX, typename TileY>
+inline CUDA_CALLABLE void scalar_cholesky_solve(TileL& L, TileX& X, TileY& Y)
+{
+    scalar_cholesky_forward_substitution(L, X, Y);
+    scalar_cholesky_back_substitution(L, X);
+}
 } // namespace partition_gemm
@@ -2223,12 +2531,14 @@ TileC& tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, Ti
     static_assert(ShapeC::dim(1) == ShapeB::dim(1), "Expected ShapeC::dim(1) == ShapeB::dim(1)");
-    using T = typename TileA::Type;
+    using T = typename TileC::Type;
 #if !defined(__CUDA_ARCH__) || WP_ENABLE_MATHDX == 0
     partitioned_gemm::scalar_matmul<typename TileA::Layout, typename TileB::Layout, typename TileC::Layout>(A.data, B.data, C.data, T(Add));
 #else
-    fun_forward(T(1.0), A.data.ptr, B.data.ptr, T(Add), C.data.ptr);
+    T alpha = T(1.0);
+    T beta = T(Add);
+    fun_forward(&alpha, A.data.ptr, B.data.ptr, &beta, C.data.ptr);
 #endif
     WP_TILE_SYNC();
@@ -2242,17 +2552,22 @@ template <typename Fwd, typename AdjA, typename AdjB, typename TileA, typename T
 void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, TileA& A, TileB& B, TileC& C,
                    Fwd adj_fun_forward, AdjA adj_fun_backward_A, AdjB adj_fun_backward_B, TileA& adj_A, TileB& adj_B, TileC& adj_C)
 {
-    using T = typename TileA::Type;
+    using T_A = typename TileA::Type;
+    using T_B = typename TileB::Type;
 #if !defined(__CUDA_ARCH__) || WP_ENABLE_MATHDX == 0
     auto At = tile_transpose(A);
     auto Bt = tile_transpose(B);
-    partitioned_gemm::scalar_matmul<typename TileC::Layout, typename decltype(Bt)::Layout, typename TileA::Layout>(adj_C.grad, Bt.data, adj_A.grad, T(1.0));
-    partitioned_gemm::scalar_matmul<typename decltype(At)::Layout, typename TileC::Layout, typename TileB::Layout>(At.data, adj_C.grad, adj_B.grad, T(1.0));
+    partitioned_gemm::scalar_matmul<typename TileC::Layout, typename decltype(Bt)::Layout, typename TileA::Layout>(adj_C.grad, Bt.data, adj_A.grad, T_A(1.0));
+    partitioned_gemm::scalar_matmul<typename decltype(At)::Layout, typename TileC::Layout, typename TileB::Layout>(At.data, adj_C.grad, adj_B.grad, T_B(1.0));
 #else
-    fun_backward_A(T(1.0), adj_C.grad.ptr, B.data.ptr, T(1.0), adj_A.grad.ptr);
-    fun_backward_B(T(1.0), A.data.ptr, adj_C.grad.ptr, T(1.0), adj_B.grad.ptr);
+    T_A alpha_A = T_A(1.0);
+    T_A beta_A = T_A(1.0);
+    fun_backward_A(&alpha_A, adj_C.grad.ptr, B.data.ptr, &beta_A, adj_A.grad.ptr);
+    T_B alpha_B = T_B(1.0);
+    T_B beta_B = T_B(1.0);
+    fun_backward_B(&alpha_B, A.data.ptr, adj_C.grad.ptr, &beta_B, adj_B.grad.ptr);
 #endif
     WP_TILE_SYNC();
@@ -2263,7 +2578,7 @@ template <typename Fwd, typename AdjA, typename AdjB, typename TileA, typename T
 void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, TileA& A, TileB& B, TileC& C,
                    Fwd adj_fun_forward, AdjA adj_fun_backward_A, AdjB adj_fun_backward_B, TileA& adj_A, TileB& adj_B, TileC& adj_C, TileC& adj_ret)
 {
-    using T = typename TileA::Type;
+    using T = typename TileC::Type;
 #if !defined(__CUDA_ARCH__) || WP_ENABLE_MATHDX == 0
     auto At = tile_transpose(A);
@@ -2272,8 +2587,10 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B,
     partitioned_gemm::scalar_matmul<typename TileC::Layout, typename decltype(Bt)::Layout, typename TileA::Layout>(adj_C.grad, Bt.data, adj_A.grad, T(1.0));
     partitioned_gemm::scalar_matmul<typename decltype(At)::Layout, typename TileC::Layout, typename TileB::Layout>(At.data, adj_C.grad, adj_B.grad, T(1.0));
 #else
-    fun_backward_A(T(1.0), adj_C.grad.ptr, B.data.ptr, T(1.0), adj_A.grad.ptr);
-    fun_backward_B(T(1.0), A.data.ptr, adj_C.grad.ptr, T(1.0), adj_B.grad.ptr);
+    T alpha = T(1.0);
+    T beta = T(1.0);
+    fun_backward_A(&alpha, adj_C.grad.ptr, B.data.ptr, &beta, adj_A.grad.ptr);
+    fun_backward_B(&alpha, A.data.ptr, adj_C.grad.ptr, &beta, adj_B.grad.ptr);
 #endif
     WP_TILE_SYNC();
@@ -2293,13 +2610,13 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B,
 // and remove the need for __align__(16) dtypes data[...]
 #define tile_fft(function_name, dtype, shared_memory_size, batch_size, ept, Xinout) \
     do { \
-        void function_name(dtype*, dtype*); \
+        void function_name(dtype*, char*); \
         char* buffer = (char*)wp::tile_alloc_shared(shared_memory_size); \
         __align__(16) dtype data[ept]; \
         for(int b = 0; b < (int)batch_size; b++) { \
             dtype* inout = Xinout.data + (int)b * (int)ept; \
             memcpy(data, inout, sizeof(dtype) * ept); \
-            function_name(data, (dtype*)buffer); \
+            function_name(data, buffer); \
             memcpy(inout, data, sizeof(dtype) * ept); \
             WP_TILE_SYNC(); \
         } \
@@ -2328,7 +2645,15 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B,
 template <typename Fwd, typename TileA, typename TileL>
 TileL& tile_cholesky(Fwd fun_forward, TileA& A, TileL& L)
-{
+{
+    static_assert(TileA::Layout::Shape::N == 2, "Expected TileA::Layout::Shape::N == 2");
+    static_assert(TileL::Layout::Shape::N == 2, "Expected TileL::Layout::Shape::N == 2");
+    static_assert(TileA::Layout::Shape::dim(0) == TileA::Layout::Shape::dim(1), "Expected TileA to be square");
+    static_assert(TileL::Layout::Shape::dim(0) == TileL::Layout::Shape::dim(1), "Expected TileL to be square");
+    static_assert(TileA::Layout::Shape::dim(0) == TileL::Layout::Shape::dim(0), "Expected A and L to have the same number of rows");
+    static_assert(TileA::Layout::Shape::dim(1) == TileL::Layout::Shape::dim(1), "Expected A and L to have the same number of columns");
     // Copy to L
     L = A;
@@ -2338,14 +2663,27 @@ TileL& tile_cholesky(Fwd fun_forward, TileA& A, TileL& L)
 #else
+    // TODO: for batched Cholesky, need one info per batch
+    WP_TILE_SHARED int info[1];
+    if (WP_TILE_THREAD_IDX == 0) {
+        info[0] = 0;
+    }
     // Call cholesky on L
     WP_TILE_SYNC();
-    fun_forward(L.data.ptr, TileL::Layout::Shape::dim(0));
+    fun_forward(L.data.ptr, info);
     WP_TILE_SYNC();
+    // TODO: for batched Cholesky, check all batches
+#if defined(_DEBUG)
+    if (WP_TILE_THREAD_IDX == 0 && info[0] != 0) {
+        printf("Non-zero status in Cholesky factorization, got %d\n", info[0]);
+    }
+#endif
     // Zero-out the upper triangular part of L
     WP_PRAGMA_UNROLL
@@ -2371,11 +2709,11 @@ TileL& tile_cholesky(Fwd fun_forward, TileA& A, TileL& L)
     } while (0)
 template <typename Fwd, typename TileL, typename TileX, typename TileY>
-TileY& tile_cholesky_solve(Fwd fun_forward, TileL& L, TileX& X, TileY& Y)
+TileY& tile_cholesky_solve(Fwd fun_forward, TileL& L, TileX& Y, TileY& X)
 {
-    // Copy x to y
+    // Copy y to x
-    Y = X;
+    X = Y;
 #if !defined(__CUDA_ARCH__) || WP_ENABLE_MATHDX == 0
@@ -2383,24 +2721,99 @@ TileY& tile_cholesky_solve(Fwd fun_forward, TileL& L, TileX& X, TileY& Y)
 #else
-    // Call cholesky solve on L & y
+    // Call cholesky solve on L & x
+    WP_TILE_SYNC();
+    fun_forward(L.data.ptr, X.data.ptr); \
+    WP_TILE_SYNC();
+#endif
+    return X;
+}
+/* Adjoint of tile_cholesky_solve is not implemented: every macro argument is ignored and */
+/* the expansion is a single assert(false) statement. */
+#define adj_tile_cholesky_solve(function_name, L, Y, X, \
+                                adj_function_name, adj_L, adj_Y, adj_X, adj_ret) \
+    do { assert(false); } while (0)
+template <typename Fwd, typename TileL, typename TileY, typename TileZ>
+TileZ& tile_lower_solve(Fwd fun_forward, TileL& L, TileY& y, TileZ& z)  // NOTE(review): presumably solves L*z = y by forward substitution — confirm
+{
+    // Copy y into z; z is the tile handed to the solver below and the one returned
+    z = y;
+#if !defined(__CUDA_ARCH__) || WP_ENABLE_MATHDX == 0  // not compiling device code, or MathDx disabled
+    partitioned_gemm::scalar_cholesky_forward_substitution(L, z, y);
+#else
+    // Device build with MathDx: call the supplied solver on the raw L and z buffers, with a tile sync before and after
     WP_TILE_SYNC();
-    fun_forward(L.data.ptr, Y.data.ptr); \
+    fun_forward(L.data.ptr, z.data.ptr);
     WP_TILE_SYNC();
 #endif
-    return Y;
+    return z;
 }
-#define adj_tile_cholesky_solve(function_name, L, X, Y, \
-                                adj_function_name, adj_L, adj_X, adj_Y, adj_ret) \
+#define adj_tile_lower_solve(function_name, L, y, z, \
+                             adj_function_name, adj_L, adj_y, adj_z, adj_ret) /* adjoint not implemented: expands to assert(false) */ \
     do { \
         assert(false); \
     } while (0)
+// Overwrites x with z, runs the upper-triangular solve step against U on x, and returns x.
+template <typename Fwd, typename TileU, typename TileZ, typename TileX>
+TileX& tile_upper_solve(Fwd fun_forward, TileU& U, TileZ& z, TileX& x)
+{
+    x = z;  // x starts out as a copy of z
+#if defined(__CUDA_ARCH__) && WP_ENABLE_MATHDX != 0  // device build with MathDx enabled
+    WP_TILE_SYNC();
+    fun_forward(U.data.ptr, x.data.ptr);
+    WP_TILE_SYNC();
+#else
+    // scalar fallback: back substitution is given the transpose of U
+    auto Ut = tile_transpose(U);
+    partitioned_gemm::scalar_cholesky_back_substitution(Ut, x);
+#endif
+    return x;
+}
+/* Adjoint of tile_upper_solve is not implemented: every macro argument is ignored and */
+/* the expansion is a single assert(false) statement. */
+#define adj_tile_upper_solve(function_name, U, z, x, \
+                             adj_function_name, adj_U, adj_z, adj_x, adj_ret) \
+    do { assert(false); } while (0)
 template <typename Tile>
 inline CUDA_CALLABLE auto tile_transpose(Tile& t)
@@ -2457,10 +2870,11 @@ inline CUDA_CALLABLE auto tile_broadcast(Tile& t)
 template <typename Tile, typename AdjTile>
 inline CUDA_CALLABLE void adj_tile_broadcast(Tile& t, Tile& adj_t, AdjTile& adj_ret)
 {
-    // nop, since memory is aliased grads already accumulated
+    // no-op: the broadcast tile aliases the source memory, so its gradients have already been accumulated
 }
-template <typename ReturnType, typename Tile, typename... Indices>
+template <typename ReturnTile, typename Tile, typename... Indices>
 inline CUDA_CALLABLE auto tile_view(Tile& t, Indices... indices)
 {
     auto c = tile_coord(indices...);
@@ -2472,7 +2886,104 @@ inline CUDA_CALLABLE auto tile_view(Tile& t, Indices... indices)
     if (t.grad.ptr)
         grad_ptr = &t.grad(c);
-    return ReturnType(data_ptr, grad_ptr);
+    return ReturnTile(data_ptr, grad_ptr);
+}
+// Wraps the data/grad pointers of t in a ReturnTile; the ReturnTile layout is set in builtins.py.
+template <typename ReturnTile, typename Tile>
+inline CUDA_CALLABLE auto tile_squeeze(Tile& t)
+{
+    using T = typename Tile::Type;
+    T* values = t.data.ptr;
+    // a null gradient pointer on t is passed on as nullptr
+    T* grads = t.grad.ptr ? t.grad.ptr : nullptr;
+    return ReturnTile(values, grads);
+}
+template <typename Tile, typename AdjTile, typename AdjReturnTile>
+inline CUDA_CALLABLE void adj_tile_squeeze(Tile& t, AdjTile& adj_t, AdjReturnTile& adj_ret)
+{
+    // no-op: the squeezed tile aliases t's memory, so its gradients have already been accumulated
+}
+// Builds a ReturnTile from the data/grad pointers of t; the ReturnTile layout is set in builtins.py.
+template <typename ReturnTile, typename Tile>
+inline CUDA_CALLABLE auto tile_reshape(Tile& t)
+{
+    using Elem = typename Tile::Type;
+    Elem* data_ptr = t.data.ptr;
+    Elem* grad_ptr = nullptr;
+    if (t.grad.ptr != nullptr) grad_ptr = t.grad.ptr;  // forwarded only when t has a gradient pointer
+    return ReturnTile(data_ptr, grad_ptr);
+}
+template <typename Tile, typename AdjTile, typename AdjReturnTile>
+inline CUDA_CALLABLE void adj_tile_reshape(Tile& t, AdjTile& adj_t, AdjReturnTile& adj_ret)
+{
+    // no-op: the reshaped tile aliases t's memory, so its gradients have already been accumulated
+}
+// Casts the elements of t to ReturnTile::Type and returns them in a register tile.
+template <typename ReturnTile, typename Tile>
+inline CUDA_CALLABLE auto tile_astype(Tile& t)
+{
+    using SrcShape = typename Tile::Layout::Shape;
+    using DstShape = typename ReturnTile::Layout::Shape;
+    using DstT = typename ReturnTile::Type;
+    // compile-time guard: source and destination must agree in Shape::N and Shape::size()
+    static_assert(SrcShape::N == DstShape::N, "Tile shapes must match for data type casting");
+    static_assert(SrcShape::size() == DstShape::size(), "Tile sizes must match for data type casting");
+    auto src = t.copy_to_register();
+    auto dst = tile_register_like<ReturnTile>();
+    using RegLayout = typename decltype(dst)::Layout;
+    WP_PRAGMA_UNROLL
+    for (int r = 0; r < RegLayout::NumRegs; ++r)
+    {
+        if (!RegLayout::valid(RegLayout::linear_from_register(r)))
+            break;  // the first invalid register slot ends the conversion
+        dst.data[r] = static_cast<DstT>(src.data[r]);
+    }
+    return dst;
+}
+// Backward pass of tile_astype: casts adj_ret's gradient to AdjTile::Type and adds it to adj_t.
+template <typename Tile, typename AdjTile, typename AdjReturnTile>
+inline CUDA_CALLABLE void adj_tile_astype(Tile& t, AdjTile& adj_t, AdjReturnTile& adj_ret)
+{
+    using SrcT = typename AdjTile::Type;
+    using DstT = typename AdjReturnTile::Type;
+    constexpr bool src_is_float = is_same<SrcT, wp::float16>::value || is_same<SrcT, wp::float32>::value ||
+                                  is_same<SrcT, wp::float64>::value;
+    constexpr bool dst_is_float = is_same<DstT, wp::float16>::value || is_same<DstT, wp::float32>::value ||
+                                  is_same<DstT, wp::float64>::value;
+    // gradients are propagated only when both element types are floating point
+    if constexpr (src_is_float && dst_is_float)
+    {
+        auto upstream = adj_ret.grad_to_register();
+        auto contribution = tile_register_like<AdjTile>();
+        using RegLayout = typename decltype(contribution)::Layout;
+        WP_PRAGMA_UNROLL
+        for (int r = 0; r < RegLayout::NumRegs; ++r)
+            contribution.data[r] += static_cast<SrcT>(upstream.data[r]);
+        adj_t.grad_add(contribution);
+    }
 }
@@ -2504,21 +3015,41 @@ inline CUDA_CALLABLE void assign(TileA& dest, int i, int j, int k, int l, const
 template <typename TileA, typename AdjTileA, typename Scalar>
 inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, const Scalar& src, AdjTileA& adj_dest, int adj_i, Scalar& adj_src)
 {
+    if (dest.grad.ptr == nullptr)  // dest has a null gradient pointer, so there is no gradient to read at (i)
+    {
+        return;  // adj_src is left unchanged
+    }
     adj_src += dest.grad(tile_coord(i));
 }
 template <typename TileA, typename AdjTileA, typename Scalar>
 inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, Scalar& adj_src)
 {
+    if (dest.grad.ptr == nullptr)  // dest has a null gradient pointer, so there is no gradient to read at (i, j)
+    {
+        return;  // adj_src is left unchanged
+    }
     adj_src += dest.grad(tile_coord(i, j));
 }
 template <typename TileA, typename AdjTileA, typename Scalar>
 inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, int k, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, int adj_k, Scalar& adj_src)
 {
+    if (dest.grad.ptr == nullptr)  // dest has a null gradient pointer, so there is no gradient to read at (i, j, k)
+    {
+        return;  // adj_src is left unchanged
+    }
     adj_src += dest.grad(tile_coord(i, j, k));
 }
 template <typename TileA, typename AdjTileA, typename Scalar>
 inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, int k, int l, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, int adj_k, int adj_l, Scalar& adj_src)
 {
+    if (dest.grad.ptr == nullptr)  // dest has a null gradient pointer, so there is no gradient to read at (i, j, k, l)
+    {
+        return;  // adj_src is left unchanged
+    }
     adj_src += dest.grad(tile_coord(i, j, k, l));
 }
@@ -2601,7 +3132,6 @@ inline CUDA_CALLABLE TileC& tile_diag_add(TileA& a, TileB& b, TileC& c)
 template <typename TileA, typename TileB, typename TileC, typename AdjTileA, typename AdjTileB, typename AdjTileC>
 inline CUDA_CALLABLE void adj_tile_diag_add(TileA& a, TileB& b, TileC& c, AdjTileA& adj_a, AdjTileB& adj_b, AdjTileC& adj_c, AdjTileC& adj_ret)
 {  // NOTE(review): with the assert removed this adjoint is an empty no-op; confirm that no gradient is meant to reach adj_a / adj_b
-    assert(false);
 }