warp-lang 1.8.1-py3-none-win_amd64.whl → 1.9.1-py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic.

Files changed (141)
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +1904 -114
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +331 -101
  7. warp/builtins.py +1244 -160
  8. warp/codegen.py +317 -206
  9. warp/config.py +1 -1
  10. warp/context.py +1465 -789
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/examples/interop/example_jax_kernel.py +2 -1
  18. warp/fabric.py +1 -1
  19. warp/fem/cache.py +27 -19
  20. warp/fem/domain.py +2 -2
  21. warp/fem/field/nodal_field.py +2 -2
  22. warp/fem/field/virtual.py +264 -166
  23. warp/fem/geometry/geometry.py +5 -5
  24. warp/fem/integrate.py +129 -51
  25. warp/fem/space/restriction.py +4 -0
  26. warp/fem/space/shape/tet_shape_function.py +3 -10
  27. warp/jax_experimental/custom_call.py +25 -2
  28. warp/jax_experimental/ffi.py +22 -1
  29. warp/jax_experimental/xla_ffi.py +16 -7
  30. warp/marching_cubes.py +708 -0
  31. warp/native/array.h +99 -4
  32. warp/native/builtin.h +86 -9
  33. warp/native/bvh.cpp +64 -28
  34. warp/native/bvh.cu +58 -58
  35. warp/native/bvh.h +2 -2
  36. warp/native/clang/clang.cpp +7 -7
  37. warp/native/coloring.cpp +8 -2
  38. warp/native/crt.cpp +2 -2
  39. warp/native/crt.h +3 -5
  40. warp/native/cuda_util.cpp +41 -10
  41. warp/native/cuda_util.h +10 -4
  42. warp/native/exports.h +1842 -1908
  43. warp/native/fabric.h +2 -1
  44. warp/native/hashgrid.cpp +37 -37
  45. warp/native/hashgrid.cu +2 -2
  46. warp/native/initializer_array.h +1 -1
  47. warp/native/intersect.h +2 -2
  48. warp/native/mat.h +1910 -116
  49. warp/native/mathdx.cpp +43 -43
  50. warp/native/mesh.cpp +24 -24
  51. warp/native/mesh.cu +26 -26
  52. warp/native/mesh.h +4 -2
  53. warp/native/nanovdb/GridHandle.h +179 -12
  54. warp/native/nanovdb/HostBuffer.h +8 -7
  55. warp/native/nanovdb/NanoVDB.h +517 -895
  56. warp/native/nanovdb/NodeManager.h +323 -0
  57. warp/native/nanovdb/PNanoVDB.h +2 -2
  58. warp/native/quat.h +331 -14
  59. warp/native/range.h +7 -1
  60. warp/native/reduce.cpp +10 -10
  61. warp/native/reduce.cu +13 -14
  62. warp/native/runlength_encode.cpp +2 -2
  63. warp/native/runlength_encode.cu +5 -5
  64. warp/native/scan.cpp +3 -3
  65. warp/native/scan.cu +4 -4
  66. warp/native/sort.cpp +10 -10
  67. warp/native/sort.cu +40 -31
  68. warp/native/sort.h +2 -0
  69. warp/native/sparse.cpp +8 -8
  70. warp/native/sparse.cu +13 -13
  71. warp/native/spatial.h +366 -17
  72. warp/native/temp_buffer.h +2 -2
  73. warp/native/tile.h +471 -82
  74. warp/native/vec.h +328 -14
  75. warp/native/volume.cpp +54 -54
  76. warp/native/volume.cu +1 -1
  77. warp/native/volume.h +2 -1
  78. warp/native/volume_builder.cu +30 -37
  79. warp/native/warp.cpp +150 -149
  80. warp/native/warp.cu +377 -216
  81. warp/native/warp.h +227 -226
  82. warp/optim/linear.py +736 -271
  83. warp/render/imgui_manager.py +289 -0
  84. warp/render/render_opengl.py +99 -18
  85. warp/render/render_usd.py +1 -0
  86. warp/sim/graph_coloring.py +2 -2
  87. warp/sparse.py +558 -175
  88. warp/tests/aux_test_module_aot.py +7 -0
  89. warp/tests/cuda/test_async.py +3 -3
  90. warp/tests/cuda/test_conditional_captures.py +101 -0
  91. warp/tests/geometry/test_hash_grid.py +38 -0
  92. warp/tests/geometry/test_marching_cubes.py +233 -12
  93. warp/tests/interop/test_jax.py +608 -28
  94. warp/tests/sim/test_coloring.py +6 -6
  95. warp/tests/test_array.py +58 -5
  96. warp/tests/test_codegen.py +4 -3
  97. warp/tests/test_context.py +8 -15
  98. warp/tests/test_enum.py +136 -0
  99. warp/tests/test_examples.py +2 -2
  100. warp/tests/test_fem.py +49 -6
  101. warp/tests/test_fixedarray.py +229 -0
  102. warp/tests/test_func.py +18 -15
  103. warp/tests/test_future_annotations.py +7 -5
  104. warp/tests/test_linear_solvers.py +30 -0
  105. warp/tests/test_map.py +15 -1
  106. warp/tests/test_mat.py +1518 -378
  107. warp/tests/test_mat_assign_copy.py +178 -0
  108. warp/tests/test_mat_constructors.py +574 -0
  109. warp/tests/test_module_aot.py +287 -0
  110. warp/tests/test_print.py +69 -0
  111. warp/tests/test_quat.py +140 -34
  112. warp/tests/test_quat_assign_copy.py +145 -0
  113. warp/tests/test_reload.py +2 -1
  114. warp/tests/test_sparse.py +71 -0
  115. warp/tests/test_spatial.py +140 -34
  116. warp/tests/test_spatial_assign_copy.py +160 -0
  117. warp/tests/test_struct.py +43 -3
  118. warp/tests/test_tuple.py +96 -0
  119. warp/tests/test_types.py +61 -20
  120. warp/tests/test_vec.py +179 -34
  121. warp/tests/test_vec_assign_copy.py +143 -0
  122. warp/tests/tile/test_tile.py +245 -18
  123. warp/tests/tile/test_tile_cholesky.py +605 -0
  124. warp/tests/tile/test_tile_load.py +169 -0
  125. warp/tests/tile/test_tile_mathdx.py +2 -558
  126. warp/tests/tile/test_tile_matmul.py +1 -1
  127. warp/tests/tile/test_tile_mlp.py +1 -1
  128. warp/tests/tile/test_tile_shared_memory.py +5 -5
  129. warp/tests/unittest_suites.py +6 -0
  130. warp/tests/walkthrough_debug.py +1 -1
  131. warp/thirdparty/unittest_parallel.py +108 -9
  132. warp/types.py +571 -267
  133. warp/utils.py +68 -86
  134. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/METADATA +29 -69
  135. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/RECORD +138 -128
  136. warp/native/marching.cpp +0 -19
  137. warp/native/marching.cu +0 -514
  138. warp/native/marching.h +0 -19
  139. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/WHEEL +0 -0
  140. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/licenses/LICENSE.md +0 -0
  141. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/top_level.txt +0 -0
warp/native/tile.h CHANGED
@@ -230,7 +230,9 @@ struct tile_coord_t
             out.indices[i] = indices[i] + c.indices[i];
         }
         return out;
-    }
+    }
+
+    static constexpr int size() { return N; }
 };
 
 // This function deduces N = sizeof...(Ints)
@@ -338,7 +340,8 @@ using tile_stride_t = tile_tuple_t<V...>;
 
 // represents a tile stored in global memory with dynamic strides
 // used to represent the source and offset for tile loads to register/shared
-template <typename T, typename Shape_>
+// BoundsCheck: when true (default), validates array access bounds; when false, skips validation for performance
+template <typename T, typename Shape_, bool BoundsCheck=true>
 struct tile_global_t
 {
     using Type = T;
@@ -370,25 +373,33 @@ struct tile_global_t
 
     inline CUDA_CALLABLE bool index(const Coord& coord, int& out) const
     {
-        // element index
-        int index = 0;
-
-        WP_PRAGMA_UNROLL
-        for (int i=0; i < Shape::N; ++i)
+        if constexpr (BoundsCheck)
         {
-            // global = offset + coord
-            int c = offset[i] + coord[i];
-
-            // handle out of bounds case
-            if (c >= data.shape[i])
-                return false;
-            else
-                index += data.strides[i]*c;
-        }
-
-        // array strides are in bytes so we convert to elements
-        out = index / sizeof(T);
-        return true;
+            // element index
+            int index = 0;
+
+            WP_PRAGMA_UNROLL
+            for (int i=0; i < Shape::N; ++i)
+            {
+                // global = offset + coord
+                int c = offset[i] + coord[i];
+
+                // handle out of bounds case
+                if (c >= data.shape[i])
+                    return false;
+                else
+                    index += data.strides[i]*c;
+            }
+
+            // array strides are in bytes so we convert to elements
+            out = index / sizeof(T);
+            return true;
+        }
+        else
+        {
+            out = index_from_coord(coord);
+            return true;
+        }
     }
 
     inline CUDA_CALLABLE T load(const Coord& coord) const
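The change above turns bounds checking into a compile-time property of tile_global_t: BoundsCheck is dispatched with `if constexpr`, so when it is false the index computation compiles down to straight stride arithmetic with no runtime shape test. A minimal standalone sketch of the same dispatch pattern, using hypothetical names rather than Warp source:

    #include <cassert>

    // Toy analogue of tile_global_t's BoundsCheck parameter: the template
    // argument selects a checked or unchecked 1D indexing path, and the
    // untaken branch is discarded at compile time by `if constexpr`.
    template <bool BoundsCheck>
    bool index_1d(int extent, int stride, int coord, int& out)
    {
        if constexpr (BoundsCheck)
        {
            if (coord >= extent)
                return false;        // reject out-of-bounds access
            out = stride * coord;
            return true;
        }
        else
        {
            out = stride * coord;    // caller guarantees validity
            return true;
        }
    }

    int main()
    {
        int i = -1;
        assert(!index_1d<true>(8, 1, 9, i));            // checked: rejected
        assert(index_1d<false>(8, 1, 3, i) && i == 3);  // unchecked: straight math
        return 0;
    }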
@@ -435,6 +446,7 @@ struct tile_global_t
     }
 };
 
+
 template <typename Shape_>
 struct tile_layout_register_t
 {
@@ -521,7 +533,8 @@ struct tile_register_t
         data[i] = value;
     }
 
-    inline CUDA_CALLABLE auto& operator=(const tile_global_t<T, typename Layout::Shape>& t)
+    template <bool BoundsCheck>
+    inline CUDA_CALLABLE auto& operator=(const tile_global_t<T, typename Layout::Shape, BoundsCheck>& t)
     {
         copy_from_global(t);
         return *this;
@@ -529,7 +542,7 @@ struct tile_register_t
 
     // define the += operator which is used during backward pass codegen
     // when returning a register tile from a user defined function
-    inline CUDA_CALLABLE auto& operator += (tile_register_t<T, Layout>& rhs)
+    inline CUDA_CALLABLE auto& operator += (const tile_register_t<T, Layout>& rhs)
    {
         grad_add(rhs);
         return *this;
@@ -645,10 +658,9 @@ struct tile_register_t
         data[i] += tile.data[i];
     }
 
-    CUDA_CALLABLE void grad_add(const tile_global_t<T, typename Layout::Shape>& global)
+    inline CUDA_CALLABLE void grad_add(const tile_global_t<T, typename Layout::Shape>& global)
     {
-        apply([&](int reg, auto c) {data[reg] = global.load_grad(c);});
-
+        apply([&](int reg, auto c) {data[reg] += global.load_grad(c);});
     }
 
     inline CUDA_CALLABLE auto& grad_to_register()
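The grad_add fix above replaces an assignment with an accumulation, which is the correct reverse-mode behavior: a value consumed by several operations must sum the adjoints flowing back from each of them. A scalar sketch of why overwriting would be wrong:

    #include <cassert>

    // Scalar analogue of the grad_add change: if x feeds both f and g,
    // dL/dx is the sum of both backward contributions. Using `=` instead
    // of `+=` would silently drop whichever contribution arrived first.
    int main()
    {
        double adj_x = 0.0;
        double adj_from_f = 2.0;  // contribution from f's backward pass
        double adj_from_g = 3.0;  // contribution from g's backward pass

        adj_x += adj_from_f;      // accumulate
        adj_x += adj_from_g;      // accumulate

        assert(adj_x == 5.0);     // `=` would have produced 3.0
        return 0;
    }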
@@ -746,6 +758,7 @@ inline CUDA_CALLABLE void* tile_alloc_shared(int num_bytes, bool init=false, boo
 
     // one entry per-thread so no need for synchronization
     smem_base[WP_TILE_THREAD_IDX] += tile_align(num_bytes);
+    assert(smem_base[WP_TILE_THREAD_IDX] >= 0);
 
 #ifdef __CUDA_ARCH__
     extern __shared__ char dynamic_smem_base[];
@@ -893,6 +906,28 @@ struct tile_shared_t
     {
     }
 
+    // we delete the copy constructor because in the case the shared tile is owning,
+    // this leads to a double deallocation.
+    // this also forces one to handle copies explicitly
+    inline CUDA_CALLABLE tile_shared_t(const tile_shared_t& other) : data(other.data), grad(other.grad), initialized(other.initialized)
+    {
+        static_assert(!Owner, "Copy constructor is only supported for non-owning tiles.");
+    }
+
+    // move constructor
+    inline CUDA_CALLABLE tile_shared_t(tile_shared_t&& other) : data(other.data), grad(other.grad), initialized(other.initialized)
+    {
+        other.data.ptr = nullptr;
+        other.grad.ptr = nullptr;
+    }
+
+    template <typename OtherT, typename OtherLayout, bool OtherOwner>
+    inline CUDA_CALLABLE tile_shared_t(const tile_shared_t<OtherT, OtherLayout, OtherOwner>& other) : data(other.data.ptr), grad(other.grad.ptr), initialized(other.initialized)
+    {
+        static_assert(!Owner, "Copy constructor is only supported for non-owning tiles.");
+        static_assert(Layout::Size == OtherLayout::Size, "Expected Size == OtherLayout::Size");
+    }
+
     // initialize from an existing tile's memory
     inline CUDA_CALLABLE tile_shared_t(T* data, T* grad=nullptr, bool initialized=true) : data(data), grad(grad), initialized(initialized)
     {
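The new constructors above guard copying with a static_assert rather than `= delete`: because member functions of a class template are only instantiated when used, copying stays available for non-owning views while an owning tile, whose destructor releases its allocation, cannot be duplicated. A minimal sketch of the same ownership pattern, with hypothetical names:

    #include <cstdlib>
    #include <utility>

    // Toy analogue: Owner=true means the destructor frees the buffer, so a
    // copy would lead to a double free; the static_assert fires only if the
    // copy constructor is actually instantiated for an owning view.
    template <bool Owner>
    struct buffer_view
    {
        float* ptr;

        explicit buffer_view(float* p) : ptr(p) {}

        buffer_view(const buffer_view& other) : ptr(other.ptr)
        {
            static_assert(!Owner, "Copying is only supported for non-owning views.");
        }

        // moves transfer ownership and null the source
        buffer_view(buffer_view&& other) : ptr(other.ptr) { other.ptr = nullptr; }

        ~buffer_view() { if (Owner) free(ptr); }
    };

    int main()
    {
        buffer_view<true> owner((float*)malloc(16 * sizeof(float)));
        buffer_view<true> moved(std::move(owner));  // ok: ownership transferred
        buffer_view<false> view(moved.ptr);
        buffer_view<false> alias(view);             // ok: non-owning copy
        // buffer_view<true> dup(moved);            // compile error: double free risk
        return 0;
    }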
@@ -920,22 +955,52 @@ struct tile_shared_t
 
     // construct from another shared tile, this constructor
     // is invoked for reshape operations like `wp.tile_transpose()`
+    // or `wp::copy()`
     template <typename OtherT, typename OtherLayout, bool OtherOwner>
     inline CUDA_CALLABLE auto& operator=(const tile_shared_t<OtherT, OtherLayout, OtherOwner>& rhs)
     {
         // check dimensions are compatible
         static_assert(Layout::Size == OtherLayout::Size, "Expected Size == OtherLayout::Size");
 
-        // alias tile directly
-        data.ptr = rhs.data.ptr;
-        grad.ptr = rhs.grad.ptr;
-        initialized = rhs.initialized;
+
+        if (Owner)
+        {
+            // if the tile owns the data we need to copy
+            assign(rhs);
+        }
+        else
+        {
+            // alias tile directly
+            data.ptr = rhs.data.ptr;
+            grad.ptr = rhs.grad.ptr;
+            initialized = rhs.initialized;
+        }
 
         return *this;
-    }
+    }
+
+    inline CUDA_CALLABLE auto& operator=(const tile_shared_t& rhs)
+    {
+        if (Owner)
+        {
+            // if the tile owns the data we need to copy
+            assign(rhs);
+        }
+        else
+        {
+            // alias tile directly
+            data.ptr = rhs.data.ptr;
+            grad.ptr = rhs.grad.ptr;
+            initialized = rhs.initialized;
+        }
+
+        return *this;
+    }
 
     // assign from a global tile (load)
-    inline CUDA_CALLABLE auto& operator=(const tile_global_t<T, typename Layout::Shape>& t)
+
+    template <bool BoundsCheck>
+    inline CUDA_CALLABLE auto& operator=(const tile_global_t<T, typename Layout::Shape, BoundsCheck>& t)
     {
         copy_from_global(t);
         return *this;
@@ -958,6 +1023,21 @@ struct tile_shared_t
         return *this;
     }
 
+    // define the += operator which is used during backward pass codegen
+    // when returning a register tile from a user defined function
+    template<typename OtherLayout>
+    inline CUDA_CALLABLE auto& operator += (const tile_register_t<T, OtherLayout>& rhs)
+    {
+        grad_add(rhs);
+        return *this;
+    }
+
+    inline CUDA_CALLABLE auto& operator += (const tile_shared_t<T, Layout>& rhs)
+    {
+        grad_add(rhs);
+        return *this;
+    }
+
     // in-place zero
     inline CUDA_CALLABLE void zero()
     {
@@ -1039,6 +1119,27 @@ struct tile_shared_t
         WP_TILE_SYNC();
     }
 
+    // shared tile deep copy
+    template <typename OtherT, typename OtherLayout, bool OtherOwner>
+    inline CUDA_CALLABLE void assign(const tile_shared_t<OtherT, OtherLayout, OtherOwner>& tile)
+    {
+        // check dimensions are compatible
+        static_assert(Layout::Size == OtherLayout::Size, "Expected Size == OtherLayout::Size");
+
+        if (initialized)
+            WP_TILE_SYNC();
+
+        WP_PRAGMA_UNROLL
+        for (int i=WP_TILE_THREAD_IDX; i < Layout::Size; i += WP_TILE_BLOCK_DIM)
+        {
+            auto c = Layout::coord_from_linear(i);
+            data(c) = tile.data(c);
+        }
+
+        initialized = true;
+        WP_TILE_SYNC();
+    }
+
     // in-place gradient zero
     inline CUDA_CALLABLE void grad_zero()
     {
@@ -1078,8 +1179,21 @@ struct tile_shared_t
         WP_TILE_SYNC();
     }
 
+    // accumulate gradients onto this tile from another shared tile
+    inline CUDA_CALLABLE void grad_add(const tile_shared_t<T, Layout>& tile)
+    {
+        WP_PRAGMA_UNROLL
+        for (int i=WP_TILE_THREAD_IDX; i < Layout::Size; i += WP_TILE_BLOCK_DIM)
+        {
+            auto c = Layout::coord_from_linear(i);
+            grad(c) += tile.grad(c);
+        }
+
+        WP_TILE_SYNC();
+    }
+
     // accumulate gradient onto this tile from a global array
-    CUDA_CALLABLE void grad_add(const tile_global_t<T, typename Layout::Shape>& global)
+    inline CUDA_CALLABLE void grad_add(const tile_global_t<T, typename Layout::Shape>& global)
     {
         WP_PRAGMA_UNROLL
         for (int i=WP_TILE_THREAD_IDX; i < Layout::Size; i += WP_TILE_BLOCK_DIM)
@@ -1103,7 +1217,7 @@ struct tile_shared_t
         }
 
         WP_TILE_SYNC();
-    }
+    }
 
     // copy shared tile to register
     inline CUDA_CALLABLE auto grad_to_register()
@@ -1172,7 +1286,7 @@ struct tile_shared_t
     {
         // alias of shared tile with 128bit type
         using SrcLayout = tile_layout_strided_t<tile_shape_t<M, N>>;
-        tile_shared_t<float4, SrcLayout> src128((float4*)data.ptr);
+        tile_shared_t<float4, SrcLayout, false> src128((float4*)data.ptr);
 
         assert(((uint64_t)(data.ptr))%sizeof(float4) == 0);
         assert(((uint64_t)(dest128))%sizeof(float4) == 0);
@@ -1251,7 +1365,7 @@ struct tile_shared_t
         const int elements = min(Layout::Shape::dim(1), (src.data.shape[lastdim] - src.offset[lastdim]));
         const bool aligned_size = (elements*sizeof(T))%sizeof(float4) == 0;
         const bool aligned_stride = (src.data.strides[0]/sizeof(T))%Layout::Stride::dim(0) == 0;
-
+
         float4* src128 = (float4*)&src.data.data[src.index_from_coord(tile_coord(0,0))];
         const bool aligned_src = (uint64_t)(src128)%sizeof(float4) == 0;
 
@@ -1262,7 +1376,7 @@ struct tile_shared_t
         {
             // alias of shared tile with 128bit type
             using DestLayout = tile_layout_strided_t<tile_shape_t<M, N>>;
-            tile_shared_t<float4, DestLayout> dest128((float4*)data.ptr);
+            tile_shared_t<float4, DestLayout, false> dest128((float4*)data.ptr);
 
             assert(((uint64_t)(dest128.data.ptr))%sizeof(float4) == 0);
             assert(((uint64_t)(src128))%sizeof(float4) == 0);
@@ -1463,9 +1577,16 @@ void tile_register_t<T, L>::print() const
 // print entry points
 template <typename T, typename L>
 inline CUDA_CALLABLE void print(const tile_register_t<T, L>& t) { t.print(); }
+
+template <typename T, typename L>
+inline CUDA_CALLABLE void adj_print(const tile_register_t<T, L>& t, const tile_register_t<T, L>& a) { a.print(); }
+
 template <typename T, typename L, bool Owner>
 inline CUDA_CALLABLE void print(const tile_shared_t<T, L, Owner>& t) { t.print(); }
 
+template <typename T, typename L, bool Owner>
+inline CUDA_CALLABLE void adj_print(const tile_shared_t<T, L, Owner>& t, const tile_shared_t<T, L, Owner>& a) { a.print(true); }
+
 template <typename T, typename L, bool O>
 inline CUDA_CALLABLE int len(const tile_shared_t<T, L, O>& t)
 {
@@ -1488,13 +1609,81 @@ inline CUDA_CALLABLE void adj_len(const tile_register_t<T,L>& t, const AdjTile&
 {
 }
 
+// select specialization for shared tiles
+template <typename C, typename T, typename LRegister, typename LShared, bool Owner>
+inline CUDA_CALLABLE auto select(const C& cond, const tile_register_t<T, LRegister>& a, const tile_shared_t<T, LShared, Owner>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? b.copy_to_register() : a;
+}
 
-template <typename T, typename L>
-inline CUDA_CALLABLE void adj_print(const tile_register_t<T, L>& t, const tile_register_t<T, L>& a) { a.print(); }
-template <typename T, typename L, bool Owner>
-inline CUDA_CALLABLE void adj_print(const tile_shared_t<T, L, Owner>& t, const tile_shared_t<T, L, Owner>& a) { a.print(true); }
+template <typename C, typename T, typename LRegister, typename LShared, bool Owner>
+inline CUDA_CALLABLE auto select(const C& cond, const tile_shared_t<T, LShared, Owner>& a, const tile_register_t<T, LRegister>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? b : a.copy_to_register();
+}
+
+template <typename C, typename T, typename L, bool Owner>
+inline CUDA_CALLABLE auto select(const C& cond, const tile_shared_t<T, L, Owner>& a, const tile_shared_t<T, L, Owner>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? tile_shared_t<T, L, false>(b.data.ptr, b.grad.ptr) : tile_shared_t<T, L, false>(a.data.ptr, a.grad.ptr);
+}
 
+template <typename C, typename T, typename L, bool LOwner, bool ROwner>
+inline CUDA_CALLABLE auto select(const C& cond, const tile_shared_t<T, L, LOwner>& a, const tile_shared_t<T, L, ROwner>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? tile_shared_t<T, L, false>(b.data.ptr, b.grad.ptr) : tile_shared_t<T, L, false>(a.data.ptr, a.grad.ptr);
+}
 
+// adj_select same as in builtin.h
+
+// where specialization for register/shared tiles
+template <typename C, typename T, typename LRegister, typename LShared, bool Owner>
+inline CUDA_CALLABLE auto where(const C& cond, const tile_register_t<T, LRegister>& a, const tile_shared_t<T, LShared, Owner>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? a : b.copy_to_register();
+}
+
+template <typename C, typename T, typename LRegister, typename LShared, bool Owner>
+inline CUDA_CALLABLE auto where(const C& cond, const tile_shared_t<T, LShared, Owner>& a, const tile_register_t<T, LRegister>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? a.copy_to_register() : b;
+}
+
+template <typename C, typename T, typename L, bool Owner>
+inline CUDA_CALLABLE auto where(const C& cond, const tile_shared_t<T, L, Owner>& a, const tile_shared_t<T, L, Owner>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? tile_shared_t<T, L, false>(a.data.ptr, a.grad.ptr) : tile_shared_t<T, L, false>(b.data.ptr, b.grad.ptr);
+}
+
+template <typename C, typename T, typename L, bool LOwner, bool ROwner>
+inline CUDA_CALLABLE auto where(const C& cond, const tile_shared_t<T, L, LOwner>& a, const tile_shared_t<T, L, ROwner>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? tile_shared_t<T, L, false>(a.data.ptr, a.grad.ptr) : tile_shared_t<T, L, false>(b.data.ptr, b.grad.ptr);
+}
+
+// adj_where same as in builtin.h
+
+// copy specialization for shared tiles, the lvalue this gets assigned to is owning, thus, this invokes the copy assign path
+template <typename T, typename L, bool Owner>
+inline CUDA_CALLABLE auto copy(const tile_shared_t<T, L, Owner>& t)
+{
+    return tile_shared_t<T, L, false>(t.data.ptr, t.grad.ptr);
+}
+
+template <typename T, typename L, bool Owner>
+inline CUDA_CALLABLE void adj_copy(const tile_shared_t<T, L, Owner>& src, tile_shared_t<T, L, Owner>& adj_src, tile_shared_t<T, L, Owner>& adj_dest)
+{
+    adj_src += adj_dest;
+    adj_dest.grad_zero();
+}
 
 // helpers to allocate shared tiles
 template <typename T, typename Shape, typename Strides, bool RequiresGrad>
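In the select()/where() overloads above, both branches of the ternary must share one type, so mixed register/shared arguments are unified by copying to registers, and owning shared tiles decay to non-owning aliases; the double NOT converts any scalar condition to bool without truncation warnings. A reduced sketch of the decay trick, with hypothetical names:

    #include <cstdio>

    // Toy analogue: view_t<true> and view_t<false> are distinct types, so a
    // ternary over them would not compile; both sides are first decayed to a
    // common non-owning view_t<false>.
    template <bool Owner>
    struct view_t { float* ptr; };

    template <bool LOwner, bool ROwner>
    view_t<false> where(int cond, const view_t<LOwner>& a, const view_t<ROwner>& b)
    {
        // !! converts the scalar condition to bool without warnings
        return (!!cond) ? view_t<false>{a.ptr} : view_t<false>{b.ptr};
    }

    int main()
    {
        float x = 1.0f, y = 2.0f;
        view_t<true> a{&x};   // stands in for an owning tile
        view_t<false> b{&y};  // stands in for a non-owning alias
        printf("%g\n", *where(7, a, b).ptr);  // non-zero condition selects a: 1
        return 0;
    }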
@@ -1727,10 +1916,66 @@ inline CUDA_CALLABLE void adj_tile_arange(T start, T stop, T step,
     T& adj_start, T& adj_stop, T& adj_step, AdjTile& adj_ret) {}
 
 // entry point for load operations, these just return a reference to a global memory array + coordinate
-template <unsigned... Shape, typename... Indices, typename T>
-inline CUDA_CALLABLE auto tile_load(array_t<T>& src, Indices... offset)
+template <typename T, bool BoundsCheck, unsigned... Shape, typename... Offset>
+inline CUDA_CALLABLE auto tile_load(array_t<T>& src, Offset... offset)
+{
+    return tile_global_t<T, tile_shape_t<Shape...>, BoundsCheck>(src, tile_coord(offset...));
+}
+
+// used for indexed loads and stores
+template <typename T, int M, typename Coord>
+inline CUDA_CALLABLE bool compute_index(array_t<T>& src, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, Coord offset, Coord c, int& out)
 {
-    return tile_global_t<T, tile_shape_t<Shape...>>(src, tile_coord(offset...));
+    int index = 0;
+
+    WP_PRAGMA_UNROLL
+    for (int i = 0; i < Coord::size(); ++i)
+    {
+        if (i == axis)
+        {
+            // global = offset_coord + index_mapped_coord
+            int index_along_axis = offset[i] + indices.data(c[i]);
+
+            // handle out of bounds case
+            if (index_along_axis >= src.shape[i])
+                return false;
+            else
+                index += src.strides[i] * index_along_axis;
+        }
+        else
+        {
+            // global = offset_coord + coord
+            int g = offset[i] + c[i];
+
+            // handle out of bounds case
+            if (g >= src.shape[i])
+                return false;
+            else
+                index += src.strides[i] * g;
+        }
+    }
+
+    // array strides are in bytes so we convert to elements
+    out = index / sizeof(T);
+    return true;
+}
+
+
+template <unsigned... Shape, int M, typename T, typename... Offset>
+inline CUDA_CALLABLE auto tile_load_indexed(array_t<T>& src, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, Offset... offset)
+{
+    auto out = tile_register_t<T, tile_layout_register_t<tile_shape_t<Shape...>>>();
+    auto offset_coord = tile_coord(offset...);
+
+    out.apply([&](int reg, auto c) {
+        int i;
+        if (compute_index(src, indices, axis, offset_coord, c, i))
+            out.data[reg] = src.data[i];
+        else
+            out.data[reg] = T(0);
+    });
+
+    return out;
 }
 
 // // entry point for tile store operations
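compute_index() and tile_load_indexed() above implement a gather: along one chosen axis the tile coordinate is remapped through an index tile, other axes are offset directly, and any lane that falls outside the array shape is zero-filled. A host-side sketch of the same addressing for a 2D row gather (axis 0), with hypothetical names:

    #include <cstdio>

    int main()
    {
        // source array: 4 rows x 3 columns
        const int rows = 4, cols = 3;
        float src[4][3];
        for (int i = 0; i < rows; ++i)
            for (int j = 0; j < cols; ++j)
                src[i][j] = float(i * 10 + j);

        int indices[2] = {2, 9};  // second index is out of bounds
        float tile[2][3];

        for (int i = 0; i < 2; ++i)
            for (int j = 0; j < cols; ++j)
            {
                int row = indices[i];  // coordinate remapped along the gather axis
                tile[i][j] = (row < rows) ? src[row][j] : 0.0f;  // zero-fill OOB
            }

        printf("%g %g\n", tile[0][0], tile[1][0]);  // prints: 20 0
        return 0;
    }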
@@ -1741,38 +1986,90 @@ inline CUDA_CALLABLE auto tile_load(array_t<T>& src, Indices... offset)
 // }
 
 // entry point for tile store operations
-template <typename T, typename Tile>
-inline CUDA_CALLABLE void tile_store(array_t<T>& dest, int x, Tile& src) { src.copy_to_global(tile_global_t<T, typename Tile::Layout::Shape>(dest, tile_coord(x))); }
-template <typename T, typename Tile>
-inline CUDA_CALLABLE void tile_store(array_t<T>& dest, int x, int y, Tile& src) { src.copy_to_global(tile_global_t<T, typename Tile::Layout::Shape>(dest, tile_coord(x, y))); }
-template <typename T, typename Tile>
-inline CUDA_CALLABLE void tile_store(array_t<T>& dest, int x, int y, int z, Tile& src) { src.copy_to_global(tile_global_t<T, typename Tile::Layout::Shape>(dest, tile_coord(x, y, z))); }
-template <typename T, typename Tile>
-inline CUDA_CALLABLE void tile_store(array_t<T>& dest, int x, int y, int z, int w, Tile& src) { src.copy_to_global(tile_global_t<T, typename Tile::Layout::Shape>(dest, tile_coord(x, y, z, w))); }
+template <typename T, bool BoundsCheck, typename Tile>
+inline CUDA_CALLABLE void tile_store(array_t<T>& dest, int x, Tile& src) { src.copy_to_global(tile_global_t<T, typename Tile::Layout::Shape, BoundsCheck>(dest, tile_coord(x))); }
+template <typename T, bool BoundsCheck, typename Tile>
+inline CUDA_CALLABLE void tile_store(array_t<T>& dest, int x, int y, Tile& src) { src.copy_to_global(tile_global_t<T, typename Tile::Layout::Shape, BoundsCheck>(dest, tile_coord(x, y))); }
+template <typename T, bool BoundsCheck, typename Tile>
+inline CUDA_CALLABLE void tile_store(array_t<T>& dest, int x, int y, int z, Tile& src) { src.copy_to_global(tile_global_t<T, typename Tile::Layout::Shape, BoundsCheck>(dest, tile_coord(x, y, z))); }
+template <typename T, bool BoundsCheck, typename Tile>
+inline CUDA_CALLABLE void tile_store(array_t<T>& dest, int x, int y, int z, int w, Tile& src) { src.copy_to_global(tile_global_t<T, typename Tile::Layout::Shape, BoundsCheck>(dest, tile_coord(x, y, z, w))); }
+
+template <typename T, int M, typename Tile, typename Coord>
+inline CUDA_CALLABLE void tile_store_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, Coord offset, Tile& src)
+{
+    auto src_reg = src.copy_to_register();
+
+    src_reg.apply([&](int reg, auto c) {
+        int i;
+        if (compute_index(dest, indices, axis, offset, c, i))
+            dest.data[i] = src_reg.data[reg];
+    });
+}
+
+// entry point for tile index store operations
+template <typename T, int M, typename Tile>
+inline CUDA_CALLABLE void tile_store_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, Tile& src) { tile_store_indexed(dest, indices, axis, tile_coord(x), src); }
+template <typename T, int M, typename Tile>
+inline CUDA_CALLABLE void tile_store_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, int y, Tile& src) { tile_store_indexed(dest, indices, axis, tile_coord(x, y), src); }
+template <typename T, int M, typename Tile>
+inline CUDA_CALLABLE void tile_store_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, int y, int z, Tile& src) { tile_store_indexed(dest, indices, axis, tile_coord(x, y, z), src); }
+template <typename T, int M, typename Tile>
+inline CUDA_CALLABLE void tile_store_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, int y, int z, int w, Tile& src) { tile_store_indexed(dest, indices, axis, tile_coord(x, y, z, w), src); }
 
 
 // compiler struggles with these if they are one line
-template <typename T, typename Tile>
+template <typename T, bool BoundsCheck, typename Tile>
 inline CUDA_CALLABLE auto tile_atomic_add(array_t<T>& dest, int x, Tile& src) {
-    tile_global_t<T, typename Tile::Layout::Shape> global(dest, tile_coord(x));
+    tile_global_t<T, typename Tile::Layout::Shape, BoundsCheck> global(dest, tile_coord(x));
     return src.atomic_add(global);
 }
-template <typename T, typename Tile>
+template <typename T, bool BoundsCheck, typename Tile>
 inline CUDA_CALLABLE auto tile_atomic_add(array_t<T>& dest, int x, int y, Tile& src) {
-    tile_global_t<T, typename Tile::Layout::Shape> global(dest, tile_coord(x, y));
+    tile_global_t<T, typename Tile::Layout::Shape, BoundsCheck> global(dest, tile_coord(x, y));
     return src.atomic_add(global);
 }
-template <typename T, typename Tile>
+template <typename T, bool BoundsCheck, typename Tile>
 inline CUDA_CALLABLE auto tile_atomic_add(array_t<T>& dest, int x, int y, int z, Tile& src) {
-    tile_global_t<T, typename Tile::Layout::Shape> global(dest, tile_coord(x, y, z));
+    tile_global_t<T, typename Tile::Layout::Shape, BoundsCheck> global(dest, tile_coord(x, y, z));
     return src.atomic_add(global);
 }
-template <typename T, typename Tile>
+template <typename T, bool BoundsCheck, typename Tile>
 inline CUDA_CALLABLE auto tile_atomic_add(array_t<T>& dest, int x, int y, int z, int w, Tile& src) {
-    tile_global_t<T, typename Tile::Layout::Shape> global(dest, tile_coord(x, y, z, w));
+    tile_global_t<T, typename Tile::Layout::Shape, BoundsCheck> global(dest, tile_coord(x, y, z, w));
     return src.atomic_add(global);
 }
 
+template <typename T, int M, typename Tile, typename Coord>
+inline CUDA_CALLABLE auto tile_atomic_add_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, Coord offset, Tile& src)
+{
+    auto src_reg = src.copy_to_register();
+    auto ret_reg = tile_register_like<Tile>();
+
+    src_reg.apply([&](int reg, auto c) {
+        int i;
+        if (compute_index(dest, indices, axis, offset, c, i))
+            ret_reg.data[reg] = wp::atomic_add(&dest.data[i], src_reg.data[reg]);
+        else
+            ret_reg.data[reg] = T(0);
+    });
+
+    return ret_reg;
+}
+
+// entry point for tile index atomic add operations
+template <typename T, int M, typename Tile>
+inline CUDA_CALLABLE auto tile_atomic_add_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, Tile& src) { return tile_atomic_add_indexed(dest, indices, axis, tile_coord(x), src); }
+
+template <typename T, int M, typename Tile>
+inline CUDA_CALLABLE auto tile_atomic_add_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, int y, Tile& src) { return tile_atomic_add_indexed(dest, indices, axis, tile_coord(x, y), src); }
+
+template <typename T, int M, typename Tile>
+inline CUDA_CALLABLE auto tile_atomic_add_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, int y, int z, Tile& src) { return tile_atomic_add_indexed(dest, indices, axis, tile_coord(x, y, z), src); }
+
+template <typename T, int M, typename Tile>
+inline CUDA_CALLABLE auto tile_atomic_add_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, int y, int z, int w, Tile& src) { return tile_atomic_add_indexed(dest, indices, axis, tile_coord(x, y, z, w), src); }
+
 
 //-------------------------------------
 // Adjoints
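tile_store_indexed() and tile_atomic_add_indexed() above are the scatter counterparts of the indexed load: lanes whose remapped coordinate is out of bounds are simply skipped rather than zero-filled, and the atomic variant returns a register tile holding each destination's value before the add. A host-side sketch of that contract, with hypothetical names:

    #include <cstdio>

    int main()
    {
        float dest[4] = {0.0f, 10.0f, 20.0f, 30.0f};
        int indices[3] = {1, 3, 7};        // last index is out of bounds
        float src[3]  = {1.0f, 1.0f, 1.0f};
        float prev[3] = {0.0f, 0.0f, 0.0f};

        for (int i = 0; i < 3; ++i)
        {
            int d = indices[i];
            if (d < 4)                     // mirror compute_index's bounds test
            {
                prev[i] = dest[d];         // value before the add (atomic return)
                dest[d] += src[i];
            }                              // out-of-bounds lanes are skipped
        }

        printf("%g %g %g\n", dest[1], dest[3], prev[0]);  // prints: 11 31 10
        return 0;
    }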
@@ -1791,7 +2088,6 @@ inline CUDA_CALLABLE void adj_tile_load(array_t<T>& src, Coord c,
     adj_ret.atomic_add_grad(dest);
 }
 
-
 template <typename T, typename AdjTile>
 inline CUDA_CALLABLE void adj_tile_load(array_t<T>& src, int x, array_t<T>& adj_src, int adj_x, AdjTile& adj_ret) { adj_tile_load( src, tile_coord(x), adj_src, tile_coord(0), adj_ret); }
 template <typename T, typename AdjTile>
@@ -1801,7 +2097,44 @@ inline CUDA_CALLABLE void adj_tile_load(array_t<T>& src, int x, int y, int z, ar
 template <typename T, typename AdjTile>
 inline CUDA_CALLABLE void adj_tile_load(array_t<T>& src, int x, int y, int z, int w, array_t<T>& adj_src, int adj_x, int adj_y, int adj_z, int adj_w, AdjTile& adj_ret) { adj_tile_load( src, tile_coord(x, y, z, w), adj_src, tile_coord(0,0,0,0), adj_ret); }
 
+template <typename T, int M, typename AdjTile, typename Coord>
+inline CUDA_CALLABLE void adj_tile_load_indexed(array_t<T>& src, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, Coord offset,
+                                                array_t<T>& adj_src, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& adj_indices, int adj_axis, Coord adj_offset,
+                                                AdjTile& adj_ret)
+{
+    // we allow users to override grad of src
+    if (adj_src.data)
+        src.grad = adj_src.data;
+
+    auto adj_ret_reg = adj_ret.grad_to_register();
 
+    adj_ret_reg.apply([&](int reg, auto c) {
+        int i;
+        if (compute_index(src, indices, axis, offset, c, i))
+            wp::atomic_add(&src.grad[i], adj_ret_reg.data[reg]);
+    });
+}
+
+template <typename T, int M, typename AdjTile>
+inline CUDA_CALLABLE void adj_tile_load_indexed(array_t<T>& src, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, array_t<T>& adj_src, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& adj_indices, int adj_axis, int adj_x, AdjTile& adj_ret)
+{
+    adj_tile_load_indexed(src, indices, axis, tile_coord(x), adj_src, adj_indices, adj_axis, tile_coord(0), adj_ret);
+}
+template <typename T, int M, typename AdjTile>
+inline CUDA_CALLABLE void adj_tile_load_indexed(array_t<T>& src, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, int y, array_t<T>& adj_src, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& adj_indices, int adj_axis, int adj_x, int adj_y, AdjTile& adj_ret)
+{
+    adj_tile_load_indexed(src, indices, axis, tile_coord(x, y), adj_src, adj_indices, adj_axis, tile_coord(0, 0), adj_ret);
+}
+template <typename T, int M, typename AdjTile>
+inline CUDA_CALLABLE void adj_tile_load_indexed(array_t<T>& src, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, int y, int z, array_t<T>& adj_src, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& adj_indices, int adj_axis, int adj_x, int adj_y, int adj_z, AdjTile& adj_ret)
+{
+    adj_tile_load_indexed(src, indices, axis, tile_coord(x, y, z), adj_src, adj_indices, adj_axis, tile_coord(0, 0, 0), adj_ret);
+}
+template <typename T, int M, typename AdjTile>
+inline CUDA_CALLABLE void adj_tile_load_indexed(array_t<T>& src, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, int y, int z, int w, array_t<T>& adj_src, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& adj_indices, int adj_axis, int adj_x, int adj_y, int adj_z, int adj_w, AdjTile& adj_ret)
+{
+    adj_tile_load_indexed(src, indices, axis, tile_coord(x, y, z, w), adj_src, adj_indices, adj_axis, tile_coord(0, 0, 0, 0), adj_ret);
+}
 
 template <typename T, typename Tile, typename AdjTile, typename Coord>
 inline CUDA_CALLABLE void adj_tile_store(array_t<T>& dest, Coord c, Tile& t, array_t<T>& adj_dest, Coord adj_c, AdjTile& adj_t)
@@ -1827,7 +2160,33 @@ inline CUDA_CALLABLE void adj_tile_store(array_t<T>& dest, int x, int y, int z,
 template <typename T, typename Tile, typename AdjTile>
 inline CUDA_CALLABLE void adj_tile_store(array_t<T>& dest, int x, int y, int z, int w, Tile& t, array_t<T>& adj_dest, int adj_x, int adj_y, int adj_z, int adj_w, AdjTile& adj_t) { adj_tile_store(dest, tile_coord(x, y, z, w), t, adj_dest, tile_coord(0,0,0,0), adj_t); }
 
+template <typename T, int M, typename Tile, typename AdjTile, typename Coord>
+inline CUDA_CALLABLE void adj_tile_store_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, Coord offset, Tile& t, array_t<T>& adj_dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& adj_indices, int adj_axis, Coord adj_offset, AdjTile& adj_t)
+{
+    // we allow users to override grad of src
+    if (adj_dest.data)
+        dest.grad = adj_dest.data;
+
+    auto adj_t_reg = tile_register_like<Tile>();
 
+    adj_t_reg.apply([&](int reg, auto c) {
+        int i;
+        if (compute_index(dest, indices, axis, offset, c, i))
+            adj_t_reg.data[reg] += dest.grad[i];
+    });
+
+    // write adjoints back
+    adj_t.grad_add(adj_t_reg);
+}
+
+template <typename T, int M, typename Tile, typename AdjTile>
+inline CUDA_CALLABLE void adj_tile_store_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, Tile& t, array_t<T>& adj_dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& adj_indices, int adj_axis, int adj_x, AdjTile& adj_t) { adj_tile_store_indexed(dest, indices, axis, tile_coord(x), t, adj_dest, adj_indices, adj_axis, tile_coord(0), adj_t); }
+template <typename T, int M, typename Tile, typename AdjTile>
+inline CUDA_CALLABLE void adj_tile_store_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, int y, Tile& t, array_t<T>& adj_dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& adj_indices, int adj_axis, int adj_x, int adj_y, AdjTile& adj_t) { adj_tile_store_indexed(dest, indices, axis, tile_coord(x, y), t, adj_dest, adj_indices, adj_axis, tile_coord(0,0), adj_t); }
+template <typename T, int M, typename Tile, typename AdjTile>
+inline CUDA_CALLABLE void adj_tile_store_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, int y, int z, Tile& t, array_t<T>& adj_dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& adj_indices, int adj_axis, int adj_x, int adj_y, int adj_z, AdjTile& adj_t) { adj_tile_store_indexed(dest, indices, axis, tile_coord(x, y, z), t, adj_dest, adj_indices, adj_axis, tile_coord(0,0,0), adj_t); }
+template <typename T, int M, typename Tile, typename AdjTile>
+inline CUDA_CALLABLE void adj_tile_store_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, int y, int z, int w, Tile& t, array_t<T>& adj_dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& adj_indices, int adj_axis, int adj_x, int adj_y, int adj_z, int adj_w, AdjTile& adj_t) { adj_tile_store_indexed(dest, indices, axis, tile_coord(x, y, z, w), t, adj_dest, adj_indices, adj_axis, tile_coord(0,0,0,0), adj_t); }
 
 // adj_tile_atomic_add is an alias for adj_tile_store
 template <typename T, typename Tile, typename AdjTile, typename AdjRet>
@@ -1839,13 +2198,28 @@ inline CUDA_CALLABLE void adj_tile_atomic_add(array_t<T>& dest, int x, int y, in
 template <typename T, typename Tile, typename AdjTile, typename AdjRet>
 inline CUDA_CALLABLE void adj_tile_atomic_add(array_t<T>& dest, int x, int y, int z, int w, Tile& t, array_t<T>& adj_dest, int adj_x, int adj_y, int adj_z, int adj_w, AdjTile& adj_t, AdjRet& adj_ret) { adj_tile_store(dest, tile_coord(x, y, z, w), t, adj_dest, tile_coord(adj_x, adj_y, adj_z, adj_w), adj_t); }
 
+// adj_tile_atomic_add_indexed is an alias for adj_tile_store_indexed
+template <typename T, int M, typename Tile, typename AdjTile, typename AdjRet>
+inline CUDA_CALLABLE void adj_tile_atomic_add_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, Tile& t, array_t<T>& adj_dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& adj_indices, int adj_axis, int adj_x, AdjTile& adj_t, AdjRet& adj_ret) { adj_tile_store_indexed(dest, indices, axis, tile_coord(x), t, adj_dest, adj_indices, adj_axis, tile_coord(0), adj_t); }
+template <typename T, int M, typename Tile, typename AdjTile, typename AdjRet>
+inline CUDA_CALLABLE void adj_tile_atomic_add_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, int y, Tile& t, array_t<T>& adj_dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& adj_indices, int adj_axis, int adj_x, int adj_y, AdjTile& adj_t, AdjRet& adj_ret) { adj_tile_store_indexed(dest, indices, axis, tile_coord(x, y), t, adj_dest, adj_indices, adj_axis, tile_coord(0,0), adj_t); }
+template <typename T, int M, typename Tile, typename AdjTile, typename AdjRet>
+inline CUDA_CALLABLE void adj_tile_atomic_add_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, int y, int z, Tile& t, array_t<T>& adj_dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& adj_indices, int adj_axis, int adj_x, int adj_y, int adj_z, AdjTile& adj_t, AdjRet& adj_ret) { adj_tile_store_indexed(dest, indices, axis, tile_coord(x, y, z), t, adj_dest, adj_indices, adj_axis, tile_coord(0,0,0), adj_t); }
+template <typename T, int M, typename Tile, typename AdjTile, typename AdjRet>
+inline CUDA_CALLABLE void adj_tile_atomic_add_indexed(array_t<T>& dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& indices, int axis, int x, int y, int z, int w, Tile& t, array_t<T>& adj_dest, tile_shared_t<int, tile_layout_strided_t<tile_shape_t<M>>>& adj_indices, int adj_axis, int adj_x, int adj_y, int adj_z, int adj_w, AdjTile& adj_t, AdjRet& adj_ret) { adj_tile_store_indexed(dest, indices, axis, tile_coord(x, y, z, w), t, adj_dest, adj_indices, adj_axis, tile_coord(0,0,0,0), adj_t); }
 
 // unary map
-template <typename Tile, typename Fwd>
-inline CUDA_CALLABLE auto tile_map(Fwd op,
-    Tile &a)
+template <typename Tile, typename Fwd, typename ReturnTile>
+inline CUDA_CALLABLE auto tile_map(Fwd op, Tile &a, ReturnTile &r)
 {
-    auto out = tile_register_like<Tile>();
+    // verify shapes and sizes are compatible
+    using ShapeIn = typename Tile::Layout::Shape;
+    using ShapeOut = typename ReturnTile::Layout::Shape;
+
+    static_assert(ShapeIn::N == ShapeOut::N, "Number of tile dimensions must match for unary map");
+    static_assert(ShapeIn::size() == ShapeOut::size(), "Tile sizes must match for unary map");
+
+    auto out = tile_register_like<ReturnTile>();
     auto a_reg = a.copy_to_register();
 
     using Layout = typename decltype(out)::Layout;
@@ -1884,12 +2258,24 @@ inline CUDA_CALLABLE void adj_tile_map(Fwd op,
 }
 
 // binary map
-template <typename TileA, typename TileB, typename Fwd>
+template <typename TileA, typename TileB, typename Fwd, typename ReturnTile>
 inline CUDA_CALLABLE auto tile_map(Fwd op,
     TileA& a,
-    TileB& b)
+    TileB& b,
+    ReturnTile& r)
 {
-    auto out = tile_register_like<TileA>();
+    // verify shapes and sizes are compatible
+    using ShapeA = typename TileA::Layout::Shape;
+    using ShapeB = typename TileB::Layout::Shape;
+    using ShapeOut = typename ReturnTile::Layout::Shape;
+
+    static_assert(ShapeA::N == ShapeOut::N, "Number of tile dimensions must match for binary map");
+    static_assert(ShapeB::N == ShapeOut::N, "Number of tile dimensions must match for binary map");
+
+    static_assert(ShapeA::size() == ShapeOut::size(), "Tile sizes must match for binary map");
+    static_assert(ShapeB::size() == ShapeOut::size(), "Tile sizes must match for binary map");
+
+    auto out = tile_register_like<ReturnTile>();
 
     auto a_reg = a.copy_to_register();
     auto b_reg = b.copy_to_register();
@@ -1905,7 +2291,6 @@ inline CUDA_CALLABLE auto tile_map(Fwd op,
     return out;
 }
 
-
 template <typename TileA, typename TileB, typename Fwd, typename Adj, typename AdjTile>
 inline CUDA_CALLABLE void adj_tile_map(Fwd op,
     TileA &a,
1936
2321
  adj_b.grad_add(adj_b_reg);
1937
2322
  }
1938
2323
 
1939
- // wrap the operator in a lambda so that we don't have to do overload resolution for things like e.g.: wp.sin()
2324
+ // We wrap the operator in a lambda so that we don't have to do overload resolution for things like e.g.: wp.sin()
1940
2325
  // this is important because many of the builtin operators don't follow particular conventions on references for
1941
2326
  // the `adj_ret` parameter, which means it's not possible to figure out the overload we need using simple casting
1942
- #define tile_unary_map(op, a) tile_map([](auto x) { return op(x);}, a)
1943
- #define adj_tile_unary_map(op, a, adj_op, adj_a, adj_ret) adj_tile_map([](auto x) { return op(x);}, a, [](auto x, auto& adj_x, auto adj_ret) { adj_op(x, adj_x, adj_ret);}, adj_a, adj_ret)
2327
+ // The r argument is a dummy return tile argument, because we can't template on the return tile type in a macro definition.
2328
+ // So if we want users to be able to define functions that return a tile type that is different from the input type,
2329
+ // we must pass an extra dummy return tile argument that is used define the return type of tile_map.
2330
+
2331
+ #define tile_unary_map(op, a, r) tile_map([](auto x) { return op(x);}, a, r)
2332
+ #define adj_tile_unary_map(op, a, r, adj_op, adj_a, adj_r, adj_ret) adj_tile_map([](auto x) { return op(x);}, a, [](auto x, auto& adj_x, auto adj_ret) { adj_op(x, adj_x, adj_ret);}, adj_a, adj_ret)
1944
2333
 
1945
- #define tile_binary_map(op, a, b) tile_map([](auto x, auto y) { return op(x, y);}, a, b)
1946
- #define adj_tile_binary_map(op, a, b, adj_op, adj_a, adj_b, adj_ret) adj_tile_map([](auto x, auto y) { return op(x, y);}, a, b, [](auto x, auto y, auto& adj_x, auto& adj_y, auto adj_ret) { adj_op(x, y, adj_x, adj_y, adj_ret);}, adj_a, adj_b, adj_ret)
2334
+ #define tile_binary_map(op, a, b, r) tile_map([](auto x, auto y) { return op(x, y);}, a, b, r)
2335
+ #define adj_tile_binary_map(op, a, b, r, adj_op, adj_a, adj_b, adj_r, adj_ret) adj_tile_map([](auto x, auto y) { return op(x, y);}, a, b, [](auto x, auto y, auto& adj_x, auto& adj_y, auto adj_ret) { adj_op(x, y, adj_x, adj_y, adj_ret);}, adj_a, adj_b, adj_ret)
1947
2336
 
1948
2337
  // -tile (unary neg)
1949
2338
  template <typename Tile>
1950
- inline CUDA_CALLABLE auto tile_neg(Tile& a) { return tile_unary_map(wp::neg, a); }
2339
+ inline CUDA_CALLABLE auto tile_neg(Tile& a) { return tile_unary_map(wp::neg, a, a); }
1951
2340
 
1952
2341
  template <typename Tile, typename AdjTile>
1953
- inline CUDA_CALLABLE void adj_tile_neg(Tile& a, Tile& adj_a, AdjTile& adj_ret) { adj_tile_unary_map(wp::neg, a, wp::adj_neg, adj_a, adj_ret); }
2342
+ inline CUDA_CALLABLE void adj_tile_neg(Tile& a, Tile& adj_a, AdjTile& adj_ret) { adj_tile_unary_map(wp::neg, a, a, wp::adj_neg, adj_a, adj_a, adj_ret); }
1954
2343
 
1955
2344
 
1956
2345
  // tile + tile
1957
2346
  template <typename TileA, typename TileB>
1958
2347
  inline CUDA_CALLABLE auto tile_add(TileA& a, TileB& b)
1959
2348
  {
1960
- return tile_binary_map(add, a, b);
2349
+ return tile_binary_map(add, a, b, a);
1961
2350
  }
1962
2351
 
1963
2352
  // add overloads get called in user function adjoints generated by codegen (adj_tile += adj_ret)
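The comment block in the hunk above explains the extra `r` parameter: a preprocessor macro cannot be templated on an output type, so tile_unary_map/tile_binary_map take a dummy tile whose only job is to let tile_map deduce ReturnTile. A reduced sketch of the idiom, with hypothetical names:

    #include <cstdio>

    // Toy analogue of the dummy return-tile argument: the macro cannot name
    // a template parameter, so the extra value argument `r` carries the
    // desired output type and is otherwise unused.
    template <typename In, typename Out>
    Out map_as(In x, Out /*r*/) { return static_cast<Out>(x) * 2; }

    #define unary_map(x, r) map_as(x, r)

    int main()
    {
        int i = 3;
        double dummy = 0.0;               // only its type matters
        double out = unary_map(i, dummy); // Out deduced as double
        printf("%f\n", out);              // prints: 6.000000
        return 0;
    }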
@@ -1984,20 +2373,20 @@ inline CUDA_CALLABLE auto add(tile_shared_t<T, L, Owner>& a, const tile_register
 template <typename TileA, typename TileB, typename AdjTileA, typename AdjTileB, typename AdjTile>
 inline CUDA_CALLABLE void adj_tile_add(TileA& a, TileB& b, AdjTileA& adj_a, AdjTileB& adj_b, AdjTile& adj_c)
 {
-    adj_tile_binary_map(add, a, b, adj_add, adj_a, adj_b, adj_c);
+    adj_tile_binary_map(add, a, b, a, adj_add, adj_a, adj_b, adj_a, adj_c);
 }
 
 // tile - tile
 template <typename TileA, typename TileB>
 inline CUDA_CALLABLE auto tile_sub(TileA& a, TileB& b)
 {
-    return tile_binary_map(sub, a, b);
+    return tile_binary_map(sub, a, b, a);
 }
 
 template <typename TileA, typename TileB, typename AdjTileA, typename AdjTileB, typename AdjTile>
 inline CUDA_CALLABLE void adj_tile_sub(TileA& a, TileB& b, AdjTileA& adj_a, AdjTileB& adj_b, AdjTile& adj_c)
 {
-    adj_tile_binary_map(sub, a, b, adj_sub, adj_a, adj_b, adj_c);
+    adj_tile_binary_map(sub, a, b, a, adj_sub, adj_a, adj_b, adj_a, adj_c);
 }
 
 
@@ -2008,7 +2397,7 @@ inline CUDA_CALLABLE auto tile_mul(Tile& a, const typename Tile::Type& s)
     // promote scalar to a constant tile
     auto s_tile = tile_register_t<typename Tile::Type, tile_layout_register_t<typename Tile::Layout::Shape>>(s);
 
-    return tile_binary_map(mul, a, s_tile);
+    return tile_binary_map(mul, a, s_tile, a);
 }
 
 template <typename Tile, typename AdjTile>
@@ -2024,7 +2413,7 @@ inline CUDA_CALLABLE void adj_tile_mul(Tile& a, const typename Tile::Type& s,
     // initialize to constant
     s_tile = s;
 
-    adj_tile_binary_map(mul, a, s_tile, adj_mul, adj_a, adj_s_tile, adj_c);
+    adj_tile_binary_map(mul, a, s_tile, a, adj_mul, adj_a, adj_s_tile, adj_a, adj_c);
 
     for (int i=0; i < Layout::NumRegs; ++i)
     {
@@ -2834,7 +3223,7 @@ template <typename Tile, typename AdjTile>
 inline CUDA_CALLABLE void adj_tile_transpose(Tile& t, Tile& adj_t, AdjTile& adj_ret)
 {
     auto a = tile_transpose(adj_ret);
-    auto b = adj_t;
+    auto& b = adj_t;
 
     adj_t.assign(tile_add(a,b));
 }
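The one-token change above matters because of the copy rules introduced earlier in this file: with `auto`, b would be a fresh tile copied from adj_t, while `auto&` merely binds a reference to it. A minimal illustration of the difference:

    #include <cassert>

    int main()
    {
        int adj_t = 5;

        auto  b_copy = adj_t;  // independent copy
        auto& b_ref  = adj_t;  // alias of adj_t

        adj_t = 7;
        assert(b_copy == 5);   // unaffected by the change
        assert(b_ref  == 7);   // tracks adj_t
        return 0;
    }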