warp-lang 1.8.0__py3-none-win_amd64.whl → 1.9.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (153) hide show
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +482 -110
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +48 -63
  7. warp/builtins.py +955 -137
  8. warp/codegen.py +327 -209
  9. warp/config.py +1 -1
  10. warp/context.py +1363 -800
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/examples/interop/example_jax_callable.py +34 -4
  18. warp/examples/interop/example_jax_kernel.py +27 -1
  19. warp/fabric.py +1 -1
  20. warp/fem/cache.py +27 -19
  21. warp/fem/domain.py +2 -2
  22. warp/fem/field/nodal_field.py +2 -2
  23. warp/fem/field/virtual.py +266 -166
  24. warp/fem/geometry/geometry.py +5 -5
  25. warp/fem/integrate.py +200 -91
  26. warp/fem/space/restriction.py +4 -0
  27. warp/fem/space/shape/tet_shape_function.py +3 -10
  28. warp/jax_experimental/custom_call.py +1 -1
  29. warp/jax_experimental/ffi.py +203 -54
  30. warp/marching_cubes.py +708 -0
  31. warp/native/array.h +103 -8
  32. warp/native/builtin.h +90 -9
  33. warp/native/bvh.cpp +64 -28
  34. warp/native/bvh.cu +58 -58
  35. warp/native/bvh.h +2 -2
  36. warp/native/clang/clang.cpp +7 -7
  37. warp/native/coloring.cpp +13 -3
  38. warp/native/crt.cpp +2 -2
  39. warp/native/crt.h +3 -5
  40. warp/native/cuda_util.cpp +42 -11
  41. warp/native/cuda_util.h +10 -4
  42. warp/native/exports.h +1842 -1908
  43. warp/native/fabric.h +2 -1
  44. warp/native/hashgrid.cpp +37 -37
  45. warp/native/hashgrid.cu +2 -2
  46. warp/native/initializer_array.h +1 -1
  47. warp/native/intersect.h +4 -4
  48. warp/native/mat.h +1913 -119
  49. warp/native/mathdx.cpp +43 -43
  50. warp/native/mesh.cpp +24 -24
  51. warp/native/mesh.cu +26 -26
  52. warp/native/mesh.h +5 -3
  53. warp/native/nanovdb/GridHandle.h +179 -12
  54. warp/native/nanovdb/HostBuffer.h +8 -7
  55. warp/native/nanovdb/NanoVDB.h +517 -895
  56. warp/native/nanovdb/NodeManager.h +323 -0
  57. warp/native/nanovdb/PNanoVDB.h +2 -2
  58. warp/native/quat.h +337 -16
  59. warp/native/rand.h +7 -7
  60. warp/native/range.h +7 -1
  61. warp/native/reduce.cpp +10 -10
  62. warp/native/reduce.cu +13 -14
  63. warp/native/runlength_encode.cpp +2 -2
  64. warp/native/runlength_encode.cu +5 -5
  65. warp/native/scan.cpp +3 -3
  66. warp/native/scan.cu +4 -4
  67. warp/native/sort.cpp +10 -10
  68. warp/native/sort.cu +22 -22
  69. warp/native/sparse.cpp +8 -8
  70. warp/native/sparse.cu +14 -14
  71. warp/native/spatial.h +366 -17
  72. warp/native/svd.h +23 -8
  73. warp/native/temp_buffer.h +2 -2
  74. warp/native/tile.h +303 -70
  75. warp/native/tile_radix_sort.h +5 -1
  76. warp/native/tile_reduce.h +16 -25
  77. warp/native/tuple.h +2 -2
  78. warp/native/vec.h +385 -18
  79. warp/native/volume.cpp +54 -54
  80. warp/native/volume.cu +1 -1
  81. warp/native/volume.h +2 -1
  82. warp/native/volume_builder.cu +30 -37
  83. warp/native/warp.cpp +150 -149
  84. warp/native/warp.cu +337 -193
  85. warp/native/warp.h +227 -226
  86. warp/optim/linear.py +736 -271
  87. warp/render/imgui_manager.py +289 -0
  88. warp/render/render_opengl.py +137 -57
  89. warp/render/render_usd.py +0 -1
  90. warp/sim/collide.py +1 -2
  91. warp/sim/graph_coloring.py +2 -2
  92. warp/sim/integrator_vbd.py +10 -2
  93. warp/sparse.py +559 -176
  94. warp/tape.py +2 -0
  95. warp/tests/aux_test_module_aot.py +7 -0
  96. warp/tests/cuda/test_async.py +3 -3
  97. warp/tests/cuda/test_conditional_captures.py +101 -0
  98. warp/tests/geometry/test_marching_cubes.py +233 -12
  99. warp/tests/sim/test_cloth.py +89 -6
  100. warp/tests/sim/test_coloring.py +82 -7
  101. warp/tests/test_array.py +56 -5
  102. warp/tests/test_assert.py +53 -0
  103. warp/tests/test_atomic_cas.py +127 -114
  104. warp/tests/test_codegen.py +3 -2
  105. warp/tests/test_context.py +8 -15
  106. warp/tests/test_enum.py +136 -0
  107. warp/tests/test_examples.py +2 -2
  108. warp/tests/test_fem.py +45 -2
  109. warp/tests/test_fixedarray.py +229 -0
  110. warp/tests/test_func.py +18 -15
  111. warp/tests/test_future_annotations.py +7 -5
  112. warp/tests/test_linear_solvers.py +30 -0
  113. warp/tests/test_map.py +1 -1
  114. warp/tests/test_mat.py +1540 -378
  115. warp/tests/test_mat_assign_copy.py +178 -0
  116. warp/tests/test_mat_constructors.py +574 -0
  117. warp/tests/test_module_aot.py +287 -0
  118. warp/tests/test_print.py +69 -0
  119. warp/tests/test_quat.py +162 -34
  120. warp/tests/test_quat_assign_copy.py +145 -0
  121. warp/tests/test_reload.py +2 -1
  122. warp/tests/test_sparse.py +103 -0
  123. warp/tests/test_spatial.py +140 -34
  124. warp/tests/test_spatial_assign_copy.py +160 -0
  125. warp/tests/test_static.py +48 -0
  126. warp/tests/test_struct.py +43 -3
  127. warp/tests/test_tape.py +38 -0
  128. warp/tests/test_types.py +0 -20
  129. warp/tests/test_vec.py +216 -441
  130. warp/tests/test_vec_assign_copy.py +143 -0
  131. warp/tests/test_vec_constructors.py +325 -0
  132. warp/tests/tile/test_tile.py +206 -152
  133. warp/tests/tile/test_tile_cholesky.py +605 -0
  134. warp/tests/tile/test_tile_load.py +169 -0
  135. warp/tests/tile/test_tile_mathdx.py +2 -558
  136. warp/tests/tile/test_tile_matmul.py +179 -0
  137. warp/tests/tile/test_tile_mlp.py +1 -1
  138. warp/tests/tile/test_tile_reduce.py +100 -11
  139. warp/tests/tile/test_tile_shared_memory.py +16 -16
  140. warp/tests/tile/test_tile_sort.py +59 -55
  141. warp/tests/unittest_suites.py +16 -0
  142. warp/tests/walkthrough_debug.py +1 -1
  143. warp/thirdparty/unittest_parallel.py +108 -9
  144. warp/types.py +554 -264
  145. warp/utils.py +68 -86
  146. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
  147. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/RECORD +150 -138
  148. warp/native/marching.cpp +0 -19
  149. warp/native/marching.cu +0 -514
  150. warp/native/marching.h +0 -19
  151. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
  152. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
  153. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
warp/native/mat.h CHANGED
@@ -177,12 +177,12 @@ struct mat_t
177
177
 
178
178
  CUDA_CALLABLE vec_t<Cols,Type> get_row(int index) const
179
179
  {
180
- return (vec_t<Cols,Type>&)data[index];
180
+ return reinterpret_cast<const vec_t<Cols,Type>&>(data[index]);
181
181
  }
182
182
 
183
183
  CUDA_CALLABLE void set_row(int index, const vec_t<Cols,Type>& v)
184
184
  {
185
- (vec_t<Cols,Type>&)data[index] = v;
185
+ reinterpret_cast<vec_t<Cols,Type>&>(data[index]) = v;
186
186
  }
187
187
 
188
188
  CUDA_CALLABLE vec_t<Rows,Type> get_col(int index) const
@@ -204,7 +204,7 @@ struct mat_t
204
204
  }
205
205
 
206
206
  // row major storage assumed to be compatible with PyTorch
207
- Type data[Rows][Cols];
207
+ Type data[Rows < 1 ? 1 : Rows][Cols < 1 ? 1 : Cols];
208
208
  };
209
209
 
210
210
  template<typename Type>
@@ -477,6 +477,20 @@ template<unsigned Rows, unsigned Cols, typename Type>
477
477
  inline CUDA_CALLABLE vec_t<Cols,Type> extract(const mat_t<Rows,Cols,Type>& m, int row)
478
478
  {
479
479
  vec_t<Cols,Type> ret;
480
+
481
+ #ifndef NDEBUG
482
+ if (row < -(int)Rows || row >= (int)Rows)
483
+ {
484
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
485
+ assert(0);
486
+ }
487
+ #endif
488
+
489
+ if (row < 0)
490
+ {
491
+ row += Rows;
492
+ }
493
+
480
494
  for(unsigned i=0; i < Cols; ++i)
481
495
  {
482
496
  ret.c[i] = m.data[row][i];
@@ -488,31 +502,206 @@ template<unsigned Rows, unsigned Cols, typename Type>
488
502
  inline CUDA_CALLABLE Type extract(const mat_t<Rows,Cols,Type>& m, int row, int col)
489
503
  {
490
504
  #ifndef NDEBUG
491
- if (row < 0 || row >= Rows)
505
+ if (row < -(int)Rows || row >= (int)Rows)
492
506
  {
493
507
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
494
508
  assert(0);
495
509
  }
496
- if (col < 0 || col >= Cols)
510
+ if (col < -(int)Cols || col >= (int)Cols)
497
511
  {
498
512
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
499
513
  assert(0);
500
514
  }
501
515
  #endif
516
+
517
+ if (row < 0)
518
+ {
519
+ row += Rows;
520
+ }
521
+ if (col < 0)
522
+ {
523
+ col += Cols;
524
+ }
525
+
502
526
  return m.data[row][col];
503
527
  }
504
528
 
529
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
530
+ inline CUDA_CALLABLE mat_t<RowSliceLength, ColSliceLength, Type> extract(const mat_t<Rows,Cols,Type>& m, slice_t row_slice)
531
+ {
532
+ static_assert(
533
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
534
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
535
+ );
536
+
537
+ mat_t<RowSliceLength, ColSliceLength, Type> ret;
538
+
539
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
540
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
541
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
542
+ assert(slice_get_length(row_slice) == RowSliceLength);
543
+
544
+ bool is_row_reversed = row_slice.step < 0;
545
+
546
+ int ii = 0;
547
+ for (
548
+ int i = row_slice.start;
549
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
550
+ i += row_slice.step
551
+ )
552
+ {
553
+ for (int j = 0; j < Cols; ++j)
554
+ {
555
+ ret.data[ii][j] = m.data[i][j];
556
+ }
557
+
558
+ ++ii;
559
+ }
560
+
561
+ assert(ii == RowSliceLength);
562
+ return ret;
563
+ }
564
+
565
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
566
+ inline CUDA_CALLABLE vec_t<RowSliceLength, Type> extract(const mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col)
567
+ {
568
+ #ifndef NDEBUG
569
+ if (col < -(int)Cols || col >= (int)Cols)
570
+ {
571
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
572
+ assert(0);
573
+ }
574
+ #endif
575
+
576
+ vec_t<RowSliceLength, Type> ret;
577
+
578
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
579
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
580
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
581
+ assert(slice_get_length(row_slice) == RowSliceLength);
582
+
583
+ if (col < 0)
584
+ {
585
+ col += Cols;
586
+ }
587
+
588
+ bool is_row_reversed = row_slice.step < 0;
589
+
590
+ int ii = 0;
591
+ for (
592
+ int i = row_slice.start;
593
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
594
+ i += row_slice.step
595
+ )
596
+ {
597
+ ret.c[ii] = m.data[i][col];
598
+ ++ii;
599
+ }
600
+
601
+ assert(ii == RowSliceLength);
602
+ return ret;
603
+ }
604
+
605
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
606
+ inline CUDA_CALLABLE vec_t<ColSliceLength, Type> extract(const mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice)
607
+ {
608
+ #ifndef NDEBUG
609
+ if (row < -(int)Rows || row >= (int)Rows)
610
+ {
611
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
612
+ assert(0);
613
+ }
614
+ #endif
615
+
616
+ vec_t<ColSliceLength, Type> ret;
617
+
618
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
619
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
620
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
621
+ assert(slice_get_length(col_slice) == ColSliceLength);
622
+
623
+ if (row < 0)
624
+ {
625
+ row += Rows;
626
+ }
627
+
628
+ bool is_col_reversed = col_slice.step < 0;
629
+
630
+ int ii = 0;
631
+ for (
632
+ int i = col_slice.start;
633
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
634
+ i += col_slice.step
635
+ )
636
+ {
637
+ ret.c[ii] = m.data[row][i];
638
+ ++ii;
639
+ }
640
+
641
+ assert(ii == ColSliceLength);
642
+ return ret;
643
+ }
644
+
645
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
646
+ inline CUDA_CALLABLE mat_t<RowSliceLength, ColSliceLength, Type> extract(const mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice)
647
+ {
648
+ mat_t<RowSliceLength, ColSliceLength, Type> ret;
649
+
650
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
651
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
652
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
653
+ assert(slice_get_length(row_slice) == RowSliceLength);
654
+
655
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
656
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
657
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
658
+ assert(slice_get_length(col_slice) == ColSliceLength);
659
+
660
+ bool is_row_reversed = row_slice.step < 0;
661
+ bool is_col_reversed = col_slice.step < 0;
662
+
663
+ int ii = 0;
664
+ for (
665
+ int i = row_slice.start;
666
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
667
+ i += row_slice.step
668
+ )
669
+ {
670
+ int jj = 0;
671
+ for (
672
+ int j = col_slice.start;
673
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
674
+ j += col_slice.step
675
+ )
676
+ {
677
+ ret.data[ii][jj] = m.data[i][j];
678
+ ++jj;
679
+ }
680
+
681
+ assert(jj == ColSliceLength);
682
+ ++ii;
683
+ }
684
+
685
+ assert(ii == RowSliceLength);
686
+ return ret;
687
+ }
688
+
505
689
  template<unsigned Rows, unsigned Cols, typename Type>
506
690
  inline CUDA_CALLABLE vec_t<Cols, Type>* index(mat_t<Rows,Cols,Type>& m, int row)
507
691
  {
508
692
  #ifndef NDEBUG
509
- if (row < 0 || row >= Rows)
693
+ if (row < -(int)Rows || row >= (int)Rows)
510
694
  {
511
695
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
512
696
  assert(0);
513
697
  }
514
698
  #endif
515
699
 
700
+ if (row < 0)
701
+ {
702
+ row += Rows;
703
+ }
704
+
516
705
  return reinterpret_cast<vec_t<Cols, Type>*>(&m.data[row]);
517
706
  }
518
707
 
@@ -520,38 +709,46 @@ template<unsigned Rows, unsigned Cols, typename Type>
520
709
  inline CUDA_CALLABLE Type* index(mat_t<Rows,Cols,Type>& m, int row, int col)
521
710
  {
522
711
  #ifndef NDEBUG
523
- if (row < 0 || row >= Rows)
712
+ if (row < -(int)Rows || row >= (int)Rows)
524
713
  {
525
714
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
526
715
  assert(0);
527
716
  }
528
- if (col < 0 || col >= Cols)
717
+ if (col < -(int)Cols || col >= (int)Cols)
529
718
  {
530
719
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
531
720
  assert(0);
532
721
  }
533
722
  #endif
534
723
 
724
+ if (row < 0)
725
+ {
726
+ row += Rows;
727
+ }
728
+ if (col < 0)
729
+ {
730
+ col += Cols;
731
+ }
732
+
535
733
  return &m.data[row][col];
536
734
  }
537
735
 
538
736
  template<unsigned Rows, unsigned Cols, typename Type>
539
737
  inline CUDA_CALLABLE void adj_index(const mat_t<Rows,Cols,Type>& m, int row,
540
- const mat_t<Rows,Cols,Type>& adj_m, int adj_row, const vec_t<Cols, Type>& adj_value)
738
+ const mat_t<Rows,Cols,Type>& adj_m, int adj_row, const vec_t<Cols, Type>& adj_value)
541
739
  {
542
740
  // nop
543
741
  }
544
742
 
545
743
  template<unsigned Rows, unsigned Cols, typename Type>
546
744
  inline CUDA_CALLABLE void adj_index(const mat_t<Rows,Cols,Type>& m, int row, int col,
547
- const mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, Type adj_value)
745
+ const mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, Type adj_value)
548
746
  {
549
747
  // nop
550
748
  }
551
749
 
552
-
553
750
  template<unsigned Rows, unsigned Cols, typename Type>
554
- inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
751
+ inline CUDA_CALLABLE Type* indexref(mat_t<Rows,Cols,Type>* m, int row, int col)
555
752
  {
556
753
  #ifndef NDEBUG
557
754
  if (row < 0 || row >= Rows)
@@ -566,201 +763,1091 @@ inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, int row, int col
566
763
  }
567
764
  #endif
568
765
 
569
- m.data[row][col] += value;
766
+ return &(m->data)[row][col];
570
767
  }
571
768
 
572
-
573
769
  template<unsigned Rows, unsigned Cols, typename Type>
574
- inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
770
+ inline CUDA_CALLABLE void adj_indexref(mat_t<Rows,Cols,Type>* m, int row, int col,
771
+ mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, const Type& adj_value)
575
772
  {
576
- #ifndef NDEBUG
577
- if (row < 0 || row >= Rows)
578
- {
579
- printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
580
- assert(0);
581
- }
582
- #endif
583
-
584
- for(unsigned i=0; i < Cols; ++i)
585
- {
586
- m.data[row][i] += value[i];
587
- }
773
+ // nop
588
774
  }
589
775
 
590
776
 
591
777
  template<unsigned Rows, unsigned Cols, typename Type>
592
- inline CUDA_CALLABLE void adj_add_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
593
- mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, Type& adj_value)
778
+ inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
594
779
  {
595
780
  #ifndef NDEBUG
596
- if (row < 0 || row >= Rows)
781
+ if (row < -(int)Rows || row >= (int)Rows)
597
782
  {
598
783
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
599
784
  assert(0);
600
785
  }
601
- if (col < 0 || col >= Cols)
786
+ if (col < -(int)Cols || col >= (int)Cols)
602
787
  {
603
788
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
604
789
  assert(0);
605
790
  }
606
791
  #endif
607
792
 
608
- adj_value += adj_m.data[row][col];
793
+ if (row < 0)
794
+ {
795
+ row += Rows;
796
+ }
797
+ if (col < 0)
798
+ {
799
+ col += Cols;
800
+ }
801
+
802
+ m.data[row][col] += value;
609
803
  }
610
804
 
611
805
 
612
806
  template<unsigned Rows, unsigned Cols, typename Type>
613
- inline CUDA_CALLABLE void adj_add_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
614
- mat_t<Rows,Cols,Type>& adj_m, int adj_row, vec_t<Cols,Type>& adj_value)
807
+ inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
615
808
  {
616
809
  #ifndef NDEBUG
617
- if (row < 0 || row >= Rows)
810
+ if (row < -(int)Rows || row >= (int)Rows)
618
811
  {
619
812
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
620
813
  assert(0);
621
814
  }
622
815
  #endif
623
816
 
817
+ if (row < 0)
818
+ {
819
+ row += Rows;
820
+ }
821
+
624
822
  for(unsigned i=0; i < Cols; ++i)
625
823
  {
626
- adj_value[i] += adj_m.data[row][i];
824
+ m.data[row][i] += value[i];
627
825
  }
628
826
  }
629
827
 
630
828
 
631
- template<unsigned Rows, unsigned Cols, typename Type>
632
- inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
829
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
830
+ inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
633
831
  {
634
- #ifndef NDEBUG
635
- if (row < 0 || row >= Rows)
832
+ static_assert(
833
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
834
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
835
+ );
836
+
837
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
838
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
839
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
840
+ assert(slice_get_length(row_slice) == RowSliceLength);
841
+
842
+ bool is_row_reversed = row_slice.step < 0;
843
+
844
+ int ii = 0;
845
+ for (
846
+ int i = row_slice.start;
847
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
848
+ i += row_slice.step
849
+ )
636
850
  {
637
- printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
638
- assert(0);
851
+ for (int j = 0; j < Cols; ++j)
852
+ {
853
+ m.data[i][j] += value.data[ii][j];
854
+ }
855
+
856
+ ++ii;
639
857
  }
640
- if (col < 0 || col >= Cols)
858
+
859
+ assert(ii == RowSliceLength);
860
+ }
861
+
862
+
863
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
864
+ inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value)
865
+ {
866
+ #ifndef NDEBUG
867
+ if (col < -(int)Cols || col >= (int)Cols)
641
868
  {
642
869
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
643
870
  assert(0);
644
871
  }
645
872
  #endif
646
873
 
647
- m.data[row][col] -= value;
874
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
875
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
876
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
877
+ assert(slice_get_length(row_slice) == RowSliceLength);
878
+
879
+ if (col < 0)
880
+ {
881
+ col += Cols;
882
+ }
883
+
884
+ bool is_row_reversed = row_slice.step < 0;
885
+
886
+ int ii = 0;
887
+ for (
888
+ int i = row_slice.start;
889
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
890
+ i += row_slice.step
891
+ )
892
+ {
893
+ m.data[i][col] += value.c[ii];
894
+ ++ii;
895
+ }
896
+
897
+ assert(ii == RowSliceLength);
648
898
  }
649
899
 
650
900
 
651
- template<unsigned Rows, unsigned Cols, typename Type>
652
- inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
901
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
902
+ inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value)
653
903
  {
654
904
  #ifndef NDEBUG
655
- if (row < 0 || row >= Rows)
905
+ if (row < -(int)Rows || row >= (int)Rows)
656
906
  {
657
907
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
658
908
  assert(0);
659
909
  }
660
910
  #endif
661
911
 
662
- for(unsigned i=0; i < Cols; ++i)
912
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
913
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
914
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
915
+ assert(slice_get_length(col_slice) == ColSliceLength);
916
+
917
+ if (row < 0)
663
918
  {
664
- m.data[row][i] -= value[i];
919
+ row += Rows;
920
+ }
921
+
922
+ bool is_col_reversed = col_slice.step < 0;
923
+
924
+ int ii = 0;
925
+ for (
926
+ int i = col_slice.start;
927
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
928
+ i += col_slice.step
929
+ )
930
+ {
931
+ m.data[row][i] += value.c[ii];
932
+ ++ii;
933
+ }
934
+
935
+ assert(ii == ColSliceLength);
936
+ }
937
+
938
+
939
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
940
+ inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
941
+ {
942
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
943
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
944
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
945
+ assert(slice_get_length(row_slice) == RowSliceLength);
946
+
947
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
948
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
949
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
950
+ assert(slice_get_length(col_slice) == ColSliceLength);
951
+
952
+ bool is_row_reversed = row_slice.step < 0;
953
+ bool is_col_reversed = col_slice.step < 0;
954
+
955
+ int ii = 0;
956
+ for (
957
+ int i = row_slice.start;
958
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
959
+ i += row_slice.step
960
+ )
961
+ {
962
+ int jj = 0;
963
+ for (
964
+ int j = col_slice.start;
965
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
966
+ j += col_slice.step
967
+ )
968
+ {
969
+ m.data[i][j] += value.data[ii][jj];
970
+ ++jj;
971
+ }
972
+
973
+ assert(jj == ColSliceLength);
974
+ ++ii;
665
975
  }
976
+
977
+ assert(ii == RowSliceLength);
666
978
  }
667
979
 
668
980
 
669
981
  template<unsigned Rows, unsigned Cols, typename Type>
670
- inline CUDA_CALLABLE void adj_sub_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
982
+ inline CUDA_CALLABLE void adj_add_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
671
983
  mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, Type& adj_value)
672
984
  {
673
985
  #ifndef NDEBUG
674
- if (row < 0 || row >= Rows)
986
+ if (row < -(int)Rows || row >= (int)Rows)
675
987
  {
676
988
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
677
989
  assert(0);
678
990
  }
679
- if (col < 0 || col >= Cols)
991
+ if (col < -(int)Cols || col >= (int)Cols)
680
992
  {
681
993
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
682
994
  assert(0);
683
995
  }
684
996
  #endif
685
997
 
686
- adj_value -= adj_m.data[row][col];
998
+ if (row < 0)
999
+ {
1000
+ row += Rows;
1001
+ }
1002
+ if (col < 0)
1003
+ {
1004
+ col += Cols;
1005
+ }
1006
+
1007
+ adj_value += adj_m.data[row][col];
687
1008
  }
688
1009
 
689
1010
 
690
1011
  template<unsigned Rows, unsigned Cols, typename Type>
691
- inline CUDA_CALLABLE void adj_sub_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
1012
+ inline CUDA_CALLABLE void adj_add_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
692
1013
  mat_t<Rows,Cols,Type>& adj_m, int adj_row, vec_t<Cols,Type>& adj_value)
693
1014
  {
694
1015
  #ifndef NDEBUG
695
- if (row < 0 || row >= Rows)
1016
+ if (row < -(int)Rows || row >= (int)Rows)
696
1017
  {
697
1018
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
698
1019
  assert(0);
699
1020
  }
700
1021
  #endif
701
1022
 
1023
+ if (row < 0)
1024
+ {
1025
+ row += Rows;
1026
+ }
1027
+
702
1028
  for(unsigned i=0; i < Cols; ++i)
703
1029
  {
704
- adj_value[i] -= adj_m.data[row][i];
1030
+ adj_value[i] += adj_m.data[row][i];
705
1031
  }
706
1032
  }
707
1033
 
708
1034
 
709
- template<unsigned Rows, unsigned Cols, typename Type>
710
- inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
1035
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1036
+ inline CUDA_CALLABLE void adj_add_inplace(
1037
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
1038
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
1039
+ )
711
1040
  {
712
- #ifndef NDEBUG
713
- if (row < 0 || row >= Rows)
714
- {
715
- printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
716
- assert(0);
717
- }
718
- if (col < 0 || col >= Cols)
1041
+ static_assert(
1042
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
1043
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
1044
+ );
1045
+
1046
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1047
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1048
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1049
+ assert(slice_get_length(row_slice) == RowSliceLength);
1050
+
1051
+ bool is_row_reversed = row_slice.step < 0;
1052
+
1053
+ int ii = 0;
1054
+ for (
1055
+ int i = row_slice.start;
1056
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1057
+ i += row_slice.step
1058
+ )
719
1059
  {
720
- printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
721
- assert(0);
1060
+ for (int j = 0; j < Cols; ++j)
1061
+ {
1062
+ adj_value.data[ii][j] += adj_m.data[i][j];
1063
+ }
1064
+
1065
+ ++ii;
722
1066
  }
723
- #endif
724
1067
 
725
- m.data[row][col] = value;
1068
+ assert(ii == RowSliceLength);
726
1069
  }
727
1070
 
728
1071
 
729
- template<unsigned Rows, unsigned Cols, typename Type>
730
- inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
1072
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
1073
+ inline CUDA_CALLABLE void adj_add_inplace(
1074
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value,
1075
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col, vec_t<RowSliceLength, Type>& adj_value
1076
+ )
731
1077
  {
732
1078
  #ifndef NDEBUG
733
- if (row < 0 || row >= Rows)
1079
+ if (col < -(int)Cols || col >= (int)Cols)
734
1080
  {
735
- printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1081
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
736
1082
  assert(0);
737
1083
  }
738
1084
  #endif
739
1085
 
740
- for(unsigned i=0; i < Cols; ++i)
1086
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1087
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1088
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1089
+ assert(slice_get_length(row_slice) == RowSliceLength);
1090
+
1091
+ if (col < 0)
741
1092
  {
742
- m.data[row][i] = value[i];
1093
+ col += Cols;
1094
+ }
1095
+
1096
+ bool is_row_reversed = row_slice.step < 0;
1097
+
1098
+ int ii = 0;
1099
+ for (
1100
+ int i = row_slice.start;
1101
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1102
+ i += row_slice.step
1103
+ )
1104
+ {
1105
+ adj_value.c[ii] += adj_m.data[i][col];
1106
+ ++ii;
743
1107
  }
1108
+
1109
+ assert(ii == RowSliceLength);
744
1110
  }
745
1111
 
746
1112
 
747
- template<unsigned Rows, unsigned Cols, typename Type>
748
- inline CUDA_CALLABLE void adj_assign_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
749
- mat_t<Rows,Cols,Type>& adj_m, int& adj_row, int& adj_col, Type& adj_value)
1113
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1114
+ inline CUDA_CALLABLE void adj_add_inplace(
1115
+ mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value,
1116
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, slice_t& adj_col_slice, vec_t<ColSliceLength, Type>& adj_value
1117
+ )
750
1118
  {
751
1119
  #ifndef NDEBUG
752
- if (row < 0 || row >= Rows)
1120
+ if (row < -(int)Rows || row >= (int)Rows)
753
1121
  {
754
1122
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
755
1123
  assert(0);
756
1124
  }
757
- if (col < 0 || col >= Cols)
1125
+ #endif
1126
+
1127
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1128
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1129
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1130
+ assert(slice_get_length(col_slice) == ColSliceLength);
1131
+
1132
+ if (row < 0)
1133
+ {
1134
+ row += Rows;
1135
+ }
1136
+
1137
+ bool is_col_reversed = col_slice.step < 0;
1138
+
1139
+ int ii = 0;
1140
+ for (
1141
+ int i = col_slice.start;
1142
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
1143
+ i += col_slice.step
1144
+ )
1145
+ {
1146
+ adj_value.c[ii] += adj_m.data[row][i];
1147
+ ++ii;
1148
+ }
1149
+
1150
+ assert(ii == ColSliceLength);
1151
+ }
1152
+
1153
+
1154
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1155
+ inline CUDA_CALLABLE void adj_add_inplace(
1156
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
1157
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, slice_t& adj_col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
1158
+ )
1159
+ {
1160
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1161
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1162
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1163
+ assert(slice_get_length(row_slice) == RowSliceLength);
1164
+
1165
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1166
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1167
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1168
+ assert(slice_get_length(col_slice) == ColSliceLength);
1169
+
1170
+ bool is_row_reversed = row_slice.step < 0;
1171
+ bool is_col_reversed = col_slice.step < 0;
1172
+
1173
+ int ii = 0;
1174
+ for (
1175
+ int i = row_slice.start;
1176
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1177
+ i += row_slice.step
1178
+ )
1179
+ {
1180
+ int jj = 0;
1181
+ for (
1182
+ int j = col_slice.start;
1183
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
1184
+ j += col_slice.step
1185
+ )
1186
+ {
1187
+ adj_value.data[ii][jj] += adj_m.data[i][j];
1188
+ ++jj;
1189
+ }
1190
+
1191
+ assert(jj == ColSliceLength);
1192
+ ++ii;
1193
+ }
1194
+
1195
+ assert(ii == RowSliceLength);
1196
+ }
1197
+
1198
+
1199
+ template<unsigned Rows, unsigned Cols, typename Type>
1200
+ inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
1201
+ {
1202
+ #ifndef NDEBUG
1203
+ if (row < -(int)Rows || row >= (int)Rows)
1204
+ {
1205
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1206
+ assert(0);
1207
+ }
1208
+ if (col < -(int)Cols || col >= (int)Cols)
1209
+ {
1210
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
1211
+ assert(0);
1212
+ }
1213
+ #endif
1214
+
1215
+ if (row < 0)
1216
+ {
1217
+ row += Rows;
1218
+ }
1219
+ if (col < 0)
1220
+ {
1221
+ col += Cols;
1222
+ }
1223
+
1224
+ m.data[row][col] -= value;
1225
+ }
1226
+
1227
+
1228
+ template<unsigned Rows, unsigned Cols, typename Type>
1229
+ inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
1230
+ {
1231
+ #ifndef NDEBUG
1232
+ if (row < -(int)Rows || row >= (int)Rows)
1233
+ {
1234
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1235
+ assert(0);
1236
+ }
1237
+ #endif
1238
+
1239
+ if (row < 0)
1240
+ {
1241
+ row += Rows;
1242
+ }
1243
+
1244
+ for(unsigned i=0; i < Cols; ++i)
1245
+ {
1246
+ m.data[row][i] -= value[i];
1247
+ }
1248
+ }
1249
+
1250
+
1251
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1252
+ inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
1253
+ {
1254
+ static_assert(
1255
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
1256
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
1257
+ );
1258
+
1259
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1260
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1261
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1262
+ assert(slice_get_length(row_slice) == RowSliceLength);
1263
+
1264
+ bool is_row_reversed = row_slice.step < 0;
1265
+
1266
+ int ii = 0;
1267
+ for (
1268
+ int i = row_slice.start;
1269
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1270
+ i += row_slice.step
1271
+ )
1272
+ {
1273
+ for (int j = 0; j < Cols; ++j)
1274
+ {
1275
+ m.data[i][j] -= value.data[ii][j];
1276
+ }
1277
+
1278
+ ++ii;
1279
+ }
1280
+
1281
+ assert(ii == RowSliceLength);
1282
+ }
1283
+
1284
+
1285
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
1286
+ inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value)
1287
+ {
1288
+ #ifndef NDEBUG
1289
+ if (col < -(int)Cols || col >= (int)Cols)
1290
+ {
1291
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
1292
+ assert(0);
1293
+ }
1294
+ #endif
1295
+
1296
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1297
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1298
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1299
+ assert(slice_get_length(row_slice) == RowSliceLength);
1300
+
1301
+ if (col < 0)
1302
+ {
1303
+ col += Cols;
1304
+ }
1305
+
1306
+ bool is_row_reversed = row_slice.step < 0;
1307
+
1308
+ int ii = 0;
1309
+ for (
1310
+ int i = row_slice.start;
1311
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1312
+ i += row_slice.step
1313
+ )
1314
+ {
1315
+ m.data[i][col] -= value.c[ii];
1316
+ ++ii;
1317
+ }
1318
+
1319
+ assert(ii == RowSliceLength);
1320
+ }
1321
+
1322
+
1323
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1324
+ inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value)
1325
+ {
1326
+ #ifndef NDEBUG
1327
+ if (row < -(int)Rows || row >= (int)Rows)
1328
+ {
1329
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1330
+ assert(0);
1331
+ }
1332
+ #endif
1333
+
1334
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1335
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1336
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1337
+ assert(slice_get_length(col_slice) == ColSliceLength);
1338
+
1339
+ if (row < 0)
1340
+ {
1341
+ row += Rows;
1342
+ }
1343
+
1344
+ bool is_col_reversed = col_slice.step < 0;
1345
+
1346
+ int ii = 0;
1347
+ for (
1348
+ int i = col_slice.start;
1349
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
1350
+ i += col_slice.step
1351
+ )
1352
+ {
1353
+ m.data[row][i] -= value.c[ii];
1354
+ ++ii;
1355
+ }
1356
+
1357
+ assert(ii == ColSliceLength);
1358
+ }
1359
+
1360
+
1361
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1362
+ inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
1363
+ {
1364
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1365
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1366
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1367
+ assert(slice_get_length(row_slice) == RowSliceLength);
1368
+
1369
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1370
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1371
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1372
+ assert(slice_get_length(col_slice) == ColSliceLength);
1373
+
1374
+ bool is_row_reversed = row_slice.step < 0;
1375
+ bool is_col_reversed = col_slice.step < 0;
1376
+
1377
+ int ii = 0;
1378
+ for (
1379
+ int i = row_slice.start;
1380
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1381
+ i += row_slice.step
1382
+ )
1383
+ {
1384
+ int jj = 0;
1385
+ for (
1386
+ int j = col_slice.start;
1387
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
1388
+ j += col_slice.step
1389
+ )
1390
+ {
1391
+ m.data[i][j] -= value.data[ii][jj];
1392
+ ++jj;
1393
+ }
1394
+
1395
+ assert(jj == ColSliceLength);
1396
+ ++ii;
1397
+ }
1398
+
1399
+ assert(ii == RowSliceLength);
1400
+ }
1401
+
1402
+
1403
+ template<unsigned Rows, unsigned Cols, typename Type>
1404
+ inline CUDA_CALLABLE void adj_sub_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
1405
+ mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, Type& adj_value)
1406
+ {
1407
+ #ifndef NDEBUG
1408
+ if (row < -(int)Rows || row >= (int)Rows)
1409
+ {
1410
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1411
+ assert(0);
1412
+ }
1413
+ if (col < -(int)Cols || col >= (int)Cols)
758
1414
  {
759
1415
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
760
1416
  assert(0);
761
1417
  }
762
1418
  #endif
763
1419
 
1420
+ if (row < 0)
1421
+ {
1422
+ row += Rows;
1423
+ }
1424
+ if (col < 0)
1425
+ {
1426
+ col += Cols;
1427
+ }
1428
+
1429
+ adj_value -= adj_m.data[row][col];
1430
+ }
1431
+
1432
+
1433
+ template<unsigned Rows, unsigned Cols, typename Type>
1434
+ inline CUDA_CALLABLE void adj_sub_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
1435
+ mat_t<Rows,Cols,Type>& adj_m, int adj_row, vec_t<Cols,Type>& adj_value)
1436
+ {
1437
+ #ifndef NDEBUG
1438
+ if (row < -(int)Rows || row >= (int)Rows)
1439
+ {
1440
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1441
+ assert(0);
1442
+ }
1443
+ #endif
1444
+
1445
+ if (row < 0)
1446
+ {
1447
+ row += Rows;
1448
+ }
1449
+
1450
+ for(unsigned i=0; i < Cols; ++i)
1451
+ {
1452
+ adj_value[i] -= adj_m.data[row][i];
1453
+ }
1454
+ }
1455
+
1456
+
1457
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1458
+ inline CUDA_CALLABLE void adj_sub_inplace(
1459
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
1460
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
1461
+ )
1462
+ {
1463
+ static_assert(
1464
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
1465
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
1466
+ );
1467
+
1468
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1469
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1470
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1471
+ assert(slice_get_length(row_slice) == RowSliceLength);
1472
+
1473
+ bool is_row_reversed = row_slice.step < 0;
1474
+
1475
+ int ii = 0;
1476
+ for (
1477
+ int i = row_slice.start;
1478
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1479
+ i += row_slice.step
1480
+ )
1481
+ {
1482
+ for (int j = 0; j < Cols; ++j)
1483
+ {
1484
+ adj_value.data[ii][j] -= adj_m.data[i][j];
1485
+ }
1486
+
1487
+ ++ii;
1488
+ }
1489
+
1490
+ assert(ii == RowSliceLength);
1491
+ }
1492
+
1493
+
1494
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
1495
+ inline CUDA_CALLABLE void adj_sub_inplace(
1496
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value,
1497
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col, vec_t<RowSliceLength, Type>& adj_value
1498
+ )
1499
+ {
1500
+ #ifndef NDEBUG
1501
+ if (col < -(int)Cols || col >= (int)Cols)
1502
+ {
1503
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
1504
+ assert(0);
1505
+ }
1506
+ #endif
1507
+
1508
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1509
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1510
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1511
+ assert(slice_get_length(row_slice) == RowSliceLength);
1512
+
1513
+ if (col < 0)
1514
+ {
1515
+ col += Cols;
1516
+ }
1517
+
1518
+ bool is_row_reversed = row_slice.step < 0;
1519
+
1520
+ int ii = 0;
1521
+ for (
1522
+ int i = row_slice.start;
1523
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1524
+ i += row_slice.step
1525
+ )
1526
+ {
1527
+ adj_value.c[ii] -= adj_m.data[i][col];
1528
+ ++ii;
1529
+ }
1530
+
1531
+ assert(ii == RowSliceLength);
1532
+ }
1533
+
1534
+
1535
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1536
+ inline CUDA_CALLABLE void adj_sub_inplace(
1537
+ mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value,
1538
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, slice_t& adj_col_slice, vec_t<ColSliceLength, Type>& adj_value
1539
+ )
1540
+ {
1541
+ #ifndef NDEBUG
1542
+ if (row < -(int)Rows || row >= (int)Rows)
1543
+ {
1544
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1545
+ assert(0);
1546
+ }
1547
+ #endif
1548
+
1549
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1550
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1551
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1552
+ assert(slice_get_length(col_slice) == ColSliceLength);
1553
+
1554
+ if (row < 0)
1555
+ {
1556
+ row += Rows;
1557
+ }
1558
+
1559
+ bool is_col_reversed = col_slice.step < 0;
1560
+
1561
+ int ii = 0;
1562
+ for (
1563
+ int i = col_slice.start;
1564
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
1565
+ i += col_slice.step
1566
+ )
1567
+ {
1568
+ adj_value.c[ii] -= adj_m.data[row][i];
1569
+ ++ii;
1570
+ }
1571
+
1572
+ assert(ii == ColSliceLength);
1573
+ }
1574
+
1575
+
1576
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1577
+ inline CUDA_CALLABLE void adj_sub_inplace(
1578
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
1579
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, slice_t& adj_col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
1580
+ )
1581
+ {
1582
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1583
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1584
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1585
+ assert(slice_get_length(row_slice) == RowSliceLength);
1586
+
1587
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1588
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1589
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1590
+ assert(slice_get_length(col_slice) == ColSliceLength);
1591
+
1592
+ bool is_row_reversed = row_slice.step < 0;
1593
+ bool is_col_reversed = col_slice.step < 0;
1594
+
1595
+ int ii = 0;
1596
+ for (
1597
+ int i = row_slice.start;
1598
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1599
+ i += row_slice.step
1600
+ )
1601
+ {
1602
+ int jj = 0;
1603
+ for (
1604
+ int j = col_slice.start;
1605
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
1606
+ j += col_slice.step
1607
+ )
1608
+ {
1609
+ adj_value.data[ii][jj] -= adj_m.data[i][j];
1610
+ ++jj;
1611
+ }
1612
+
1613
+ assert(jj == ColSliceLength);
1614
+ ++ii;
1615
+ }
1616
+
1617
+ assert(ii == RowSliceLength);
1618
+ }
1619
+
1620
+
1621
+ template<unsigned Rows, unsigned Cols, typename Type>
1622
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
1623
+ {
1624
+ #ifndef NDEBUG
1625
+ if (row < -(int)Rows || row >= (int)Rows)
1626
+ {
1627
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1628
+ assert(0);
1629
+ }
1630
+ if (col < -(int)Cols || col >= (int)Cols)
1631
+ {
1632
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
1633
+ assert(0);
1634
+ }
1635
+ #endif
1636
+
1637
+ if (row < 0)
1638
+ {
1639
+ row += Rows;
1640
+ }
1641
+ if (col < 0)
1642
+ {
1643
+ col += Cols;
1644
+ }
1645
+
1646
+ m.data[row][col] = value;
1647
+ }
1648
+
1649
+
1650
+ template<unsigned Rows, unsigned Cols, typename Type>
1651
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
1652
+ {
1653
+ #ifndef NDEBUG
1654
+ if (row < -(int)Rows || row >= (int)Rows)
1655
+ {
1656
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1657
+ assert(0);
1658
+ }
1659
+ #endif
1660
+
1661
+ if (row < 0)
1662
+ {
1663
+ row += Rows;
1664
+ }
1665
+
1666
+ for(unsigned i=0; i < Cols; ++i)
1667
+ {
1668
+ m.data[row][i] = value[i];
1669
+ }
1670
+ }
1671
+
1672
+
1673
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1674
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
1675
+ {
1676
+ static_assert(
1677
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
1678
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
1679
+ );
1680
+
1681
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1682
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1683
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1684
+ assert(slice_get_length(row_slice) == RowSliceLength);
1685
+
1686
+ bool is_row_reversed = row_slice.step < 0;
1687
+
1688
+ int ii = 0;
1689
+ for (
1690
+ int i = row_slice.start;
1691
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1692
+ i += row_slice.step
1693
+ )
1694
+ {
1695
+ for (int j = 0; j < Cols; ++j)
1696
+ {
1697
+ m.data[i][j] = value.data[ii][j];
1698
+ }
1699
+
1700
+ ++ii;
1701
+ }
1702
+
1703
+ assert(ii == RowSliceLength);
1704
+ }
1705
+
1706
+
1707
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
1708
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value)
1709
+ {
1710
+ #ifndef NDEBUG
1711
+ if (col < -(int)Cols || col >= (int)Cols)
1712
+ {
1713
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
1714
+ assert(0);
1715
+ }
1716
+ #endif
1717
+
1718
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1719
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1720
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1721
+ assert(slice_get_length(row_slice) == RowSliceLength);
1722
+
1723
+ if (col < 0)
1724
+ {
1725
+ col += Cols;
1726
+ }
1727
+
1728
+ bool is_row_reversed = row_slice.step < 0;
1729
+
1730
+ int ii = 0;
1731
+ for (
1732
+ int i = row_slice.start;
1733
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1734
+ i += row_slice.step
1735
+ )
1736
+ {
1737
+ m.data[i][col] = value.c[ii];
1738
+ ++ii;
1739
+ }
1740
+
1741
+ assert(ii == RowSliceLength);
1742
+ }
1743
+
1744
+
1745
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1746
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value)
1747
+ {
1748
+ #ifndef NDEBUG
1749
+ if (row < -(int)Rows || row >= (int)Rows)
1750
+ {
1751
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1752
+ assert(0);
1753
+ }
1754
+ #endif
1755
+
1756
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1757
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1758
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1759
+ assert(slice_get_length(col_slice) == ColSliceLength);
1760
+
1761
+ if (row < 0)
1762
+ {
1763
+ row += Rows;
1764
+ }
1765
+
1766
+ bool is_col_reversed = col_slice.step < 0;
1767
+
1768
+ int ii = 0;
1769
+ for (
1770
+ int i = col_slice.start;
1771
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
1772
+ i += col_slice.step
1773
+ )
1774
+ {
1775
+ m.data[row][i] = value.c[ii];
1776
+ ++ii;
1777
+ }
1778
+
1779
+ assert(ii == ColSliceLength);
1780
+ }
1781
+
1782
+
1783
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1784
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
1785
+ {
1786
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1787
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1788
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1789
+ assert(slice_get_length(row_slice) == RowSliceLength);
1790
+
1791
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1792
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1793
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1794
+ assert(slice_get_length(col_slice) == ColSliceLength);
1795
+
1796
+ bool is_row_reversed = row_slice.step < 0;
1797
+ bool is_col_reversed = col_slice.step < 0;
1798
+
1799
+ int ii = 0;
1800
+ for (
1801
+ int i = row_slice.start;
1802
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1803
+ i += row_slice.step
1804
+ )
1805
+ {
1806
+ int jj = 0;
1807
+ for (
1808
+ int j = col_slice.start;
1809
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
1810
+ j += col_slice.step
1811
+ )
1812
+ {
1813
+ m.data[i][j] = value.data[ii][jj];
1814
+ ++jj;
1815
+ }
1816
+
1817
+ assert(jj == ColSliceLength);
1818
+ ++ii;
1819
+ }
1820
+
1821
+ assert(ii == RowSliceLength);
1822
+ }
1823
+
1824
+
1825
+ template<unsigned Rows, unsigned Cols, typename Type>
1826
+ inline CUDA_CALLABLE void adj_assign_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
1827
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, int& adj_col, Type& adj_value)
1828
+ {
1829
+ #ifndef NDEBUG
1830
+ if (row < -(int)Rows || row >= (int)Rows)
1831
+ {
1832
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1833
+ assert(0);
1834
+ }
1835
+ if (col < -(int)Cols || col >= (int)Cols)
1836
+ {
1837
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
1838
+ assert(0);
1839
+ }
1840
+ #endif
1841
+
1842
+ if (row < 0)
1843
+ {
1844
+ row += Rows;
1845
+ }
1846
+ if (col < 0)
1847
+ {
1848
+ col += Cols;
1849
+ }
1850
+
764
1851
  adj_value += adj_m.data[row][col];
765
1852
  }
766
1853
 
@@ -770,13 +1857,18 @@ inline CUDA_CALLABLE void adj_assign_inplace(mat_t<Rows,Cols,Type>& m, int row,
770
1857
  mat_t<Rows,Cols,Type>& adj_m, int& adj_row, vec_t<Cols,Type>& adj_value)
771
1858
  {
772
1859
  #ifndef NDEBUG
773
- if (row < 0 || row >= Rows)
1860
+ if (row < -(int)Rows || row >= (int)Rows)
774
1861
  {
775
1862
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
776
1863
  assert(0);
777
1864
  }
778
1865
  #endif
779
1866
 
1867
+ if (row < 0)
1868
+ {
1869
+ row += Rows;
1870
+ }
1871
+
780
1872
  for(unsigned i=0; i < Cols; ++i)
781
1873
  {
782
1874
  adj_value[i] += adj_m.data[row][i];
@@ -784,99 +1876,563 @@ inline CUDA_CALLABLE void adj_assign_inplace(mat_t<Rows,Cols,Type>& m, int row,
  }


+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_inplace(
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
+ )
+ {
+ static_assert(
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
+ );
+
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ bool is_row_reversed = row_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = row_slice.start;
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
+ i += row_slice.step
+ )
+ {
+ for (int j = 0; j < Cols; ++j)
+ {
+ adj_value.data[ii][j] += adj_m.data[i][j];
+ }
+
+ ++ii;
+ }
+
+ assert(ii == RowSliceLength);
+ }
+
+
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_inplace(
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col, vec_t<RowSliceLength, Type>& adj_value
+ )
+ {
+ #ifndef NDEBUG
+ if (col < -(int)Cols || col >= (int)Cols)
+ {
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
+ assert(0);
+ }
+ #endif
+
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ if (col < 0)
+ {
+ col += Cols;
+ }
+
+ bool is_row_reversed = row_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = row_slice.start;
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
+ i += row_slice.step
+ )
+ {
+ adj_value.c[ii] += adj_m.data[i][col];
+ ++ii;
+ }
+
+ assert(ii == RowSliceLength);
+ }
+
+
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_inplace(
+ mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, slice_t& adj_col_slice, vec_t<ColSliceLength, Type>& adj_value
+ )
+ {
+ #ifndef NDEBUG
+ if (row < -(int)Rows || row >= (int)Rows)
+ {
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
+ assert(0);
+ }
+ #endif
+
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
+ assert(slice_get_length(col_slice) == ColSliceLength);
+
+ if (row < 0)
+ {
+ row += Rows;
+ }
+
+ bool is_col_reversed = col_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = col_slice.start;
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
+ i += col_slice.step
+ )
+ {
+ adj_value.c[ii] += adj_m.data[row][i];
+ ++ii;
+ }
+
+ assert(ii == ColSliceLength);
+ }
+
+
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_inplace(
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, slice_t& adj_col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
+ )
+ {
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
+ assert(slice_get_length(col_slice) == ColSliceLength);
+
+ bool is_row_reversed = row_slice.step < 0;
+ bool is_col_reversed = col_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = row_slice.start;
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
+ i += row_slice.step
+ )
+ {
+ int jj = 0;
+ for (
+ int j = col_slice.start;
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
+ j += col_slice.step
+ )
+ {
+ adj_value.data[ii][jj] += adj_m.data[i][j];
+ ++jj;
+ }
+
+ assert(jj == ColSliceLength);
+ ++ii;
+ }
+
+ assert(ii == RowSliceLength);
+ }
+
+
+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
+ {
+ #ifndef NDEBUG
+ if (row < -(int)Rows || row >= (int)Rows)
+ {
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
+ assert(0);
+ }
+ if (col < -(int)Cols || col >= (int)Cols)
+ {
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
+ assert(0);
+ }
+ #endif
+
+ if (row < 0)
+ {
+ row += Rows;
+ }
+ if (col < 0)
+ {
+ col += Cols;
+ }
+
+ mat_t<Rows,Cols,Type> ret(m);
+ ret.data[row][col] = value;
+ return ret;
+ }
+
+
+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
+ {
+ #ifndef NDEBUG
+ if (row < -(int)Rows || row >= (int)Rows)
+ {
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
+ assert(0);
+ }
+ #endif
+
+ if (row < 0)
+ {
+ row += Rows;
+ }
+
+ mat_t<Rows,Cols,Type> ret(m);
+ for(unsigned i=0; i < Cols; ++i)
+ {
+ ret.data[row][i] = value[i];
+ }
+ return ret;
+ }
+
+
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
+ {
+ mat_t<Rows, Cols, Type> ret(m);
+ assign_inplace(ret, row_slice, value);
+ return ret;
+ }
+
+
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value)
+ {
+ mat_t<Rows, Cols, Type> ret(m);
+ assign_inplace(ret, row_slice, col, value);
+ return ret;
+ }
+
+
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value)
+ {
+ mat_t<Rows, Cols, Type> ret(m);
+ assign_inplace(ret, row, col_slice, value);
+ return ret;
+ }
+
+
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
+ {
+ mat_t<Rows, Cols, Type> ret(m);
+ assign_inplace(ret, row_slice, col_slice, value);
+ return ret;
+ }
+
+
  template<unsigned Rows, unsigned Cols, typename Type>
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
+ inline CUDA_CALLABLE void adj_assign_copy(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, int& adj_col, Type& adj_value, const mat_t<Rows,Cols,Type>& adj_ret)
  {
  #ifndef NDEBUG
- if (row < 0 || row >= Rows)
+ if (row < -(int)Rows || row >= (int)Rows)
  {
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
  assert(0);
  }
- if (col < 0 || col >= Cols)
+ if (col < -(int)Cols || col >= (int)Cols)
  {
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
  assert(0);
  }
  #endif

- mat_t<Rows,Cols,Type> ret(m);
- ret.data[row][col] = value;
- return ret;
+ if (row < 0)
+ {
+ row += Rows;
+ }
+ if (col < 0)
+ {
+ col += Cols;
+ }
+
+ adj_value += adj_ret.data[row][col];
+ for(unsigned i=0; i < Rows; ++i)
+ {
+ for(unsigned j=0; j < Cols; ++j)
+ {
+ if(i != row || j != col)
+ adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ }
  }


  template<unsigned Rows, unsigned Cols, typename Type>
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
+ inline CUDA_CALLABLE void adj_assign_copy(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, vec_t<Cols,Type>& adj_value, const mat_t<Rows,Cols,Type>& adj_ret)
  {
  #ifndef NDEBUG
- if (row < 0 || row >= Rows)
+ if (row < -(int)Rows || row >= (int)Rows)
  {
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
  assert(0);
  }
  #endif

- mat_t<Rows,Cols,Type> ret(m);
- for(unsigned i=0; i < Cols; ++i)
+ if (row < 0)
  {
- ret.data[row][i] = value[i];
+ row += Rows;
+ }
+
+ for(unsigned i=0; i < Rows; ++i)
+ {
+ for(unsigned j=0; j < Cols; ++j)
+ {
+ if (i==row)
+ adj_value[j] += adj_ret.data[i][j];
+ else
+ adj_m.data[i][j] += adj_ret.data[i][j];
+ }
  }
- return ret;
  }


- template<unsigned Rows, unsigned Cols, typename Type>
- inline CUDA_CALLABLE void adj_assign_copy(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
- mat_t<Rows,Cols,Type>& adj_m, int& adj_row, int& adj_col, Type& adj_value, const mat_t<Rows,Cols,Type>& adj_ret)
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_copy(
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value,
+ mat_t<Rows,Cols,Type>& adj_ret
+ )
  {
- #ifndef NDEBUG
- if (row < 0 || row >= Rows)
+ static_assert(
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
+ );
+
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ bool is_row_reversed = row_slice.step < 0;
+
+ int ii = 0;
+ for (int i = 0; i < Rows; ++i)
  {
- printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
- assert(0);
+ bool in_row_slice = is_row_reversed
+ ? (i <= row_slice.start && i > row_slice.stop && (row_slice.start - i) % (-row_slice.step) == 0)
+ : (i >= row_slice.start && i < row_slice.stop && (i - row_slice.start) % row_slice.step == 0);
+
+ if (!in_row_slice)
+ {
+ for (int j = 0; j < Cols; ++j)
+ {
+ adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ }
+ else
+ {
+ for (int j = 0; j < Cols; ++j)
+ {
+ adj_value.data[ii][j] += adj_ret.data[i][j];
+ }
+
+ ++ii;
+ }
  }
- if (col < 0 || col >= Cols)
+
+ assert(ii == RowSliceLength);
+ }
+
+
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_copy(
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col, vec_t<RowSliceLength, Type>& adj_value,
+ mat_t<Rows,Cols,Type>& adj_ret
+ )
+ {
+ #ifndef NDEBUG
+ if (col < -(int)Cols || col >= (int)Cols)
  {
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
  assert(0);
  }
  #endif

- adj_value += adj_ret.data[row][col];
- for(unsigned i=0; i < Rows; ++i)
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ if (col < 0)
  {
- for(unsigned j=0; j < Cols; ++j)
+ col += Cols;
+ }
+
+ bool is_row_reversed = row_slice.step < 0;
+
+ int ii = 0;
+ for (int i = 0; i < Rows; ++i)
+ {
+ bool in_row_slice = is_row_reversed
+ ? (i <= row_slice.start && i > row_slice.stop && (row_slice.start - i) % (-row_slice.step) == 0)
+ : (i >= row_slice.start && i < row_slice.stop && (i - row_slice.start) % row_slice.step == 0);
+
+ if (!in_row_slice)
  {
- if(i != row || j != col)
+ for (int j = 0; j < Cols; ++j)
+ {
  adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ }
+ else
+ {
+ for (int j = 0; j < Cols; ++j)
+ {
+ if (j != col)
+ {
+ adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ else
+ {
+ adj_value.c[ii] += adj_ret.data[i][j];
+ }
+ }
+
+ ++ii;
  }
  }
+
+ assert(ii == RowSliceLength);
  }


- template<unsigned Rows, unsigned Cols, typename Type>
- inline CUDA_CALLABLE void adj_assign_copy(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
- mat_t<Rows,Cols,Type>& adj_m, int& adj_row, vec_t<Cols,Type>& adj_value, const mat_t<Rows,Cols,Type>& adj_ret)
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_copy(
+ mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, slice_t& adj_col_slice, vec_t<ColSliceLength, Type>& adj_value,
+ mat_t<Rows,Cols,Type>& adj_ret
+ )
  {
  #ifndef NDEBUG
- if (row < 0 || row >= Rows)
+ if (row < -(int)Rows || row >= (int)Rows)
  {
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
  assert(0);
  }
  #endif

- for(unsigned i=0; i < Rows; ++i)
+ if (row < 0)
  {
- for(unsigned j=0; j < Cols; ++j)
+ row += Rows;
+ }
+
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
+ assert(slice_get_length(col_slice) == ColSliceLength);
+
+ bool is_col_reversed = col_slice.step < 0;
+
+ int ii = 0;
+ for (int i = 0; i < Rows; ++i)
+ {
+ if (i != row)
  {
- if (i==row)
- adj_value[j] += adj_ret.data[i][j];
- else
+ for (int j = 0; j < Cols; ++j)
+ {
+ adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ }
+ else
+ {
+ for (int j = 0; j < Cols; ++j)
+ {
+ bool in_col_slice = is_col_reversed
+ ? (j <= col_slice.start && j > col_slice.stop && (col_slice.start - j) % (-col_slice.step) == 0)
+ : (j >= col_slice.start && j < col_slice.stop && (j - col_slice.start) % col_slice.step == 0);
+
+ if (!in_col_slice)
+ {
+ adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ else
+ {
+ adj_value.c[ii] += adj_ret.data[i][j];
+ ++ii;
+ }
+ }
+ }
+ }
+
+ assert(ii == ColSliceLength);
+ }
+
+
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_copy(
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, slice_t& adj_col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value,
+ mat_t<Rows,Cols,Type>& adj_ret
+ )
+ {
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ bool is_row_reversed = row_slice.step < 0;
+ bool is_col_reversed = col_slice.step < 0;
+
+ int ii = 0;
+ for (int i = 0; i < Rows; ++i)
+ {
+ bool in_row_slice = is_row_reversed
+ ? (i <= row_slice.start && i > row_slice.stop && (row_slice.start - i) % (-row_slice.step) == 0)
+ : (i >= row_slice.start && i < row_slice.stop && (i - row_slice.start) % row_slice.step == 0);
+
+ if (!in_row_slice)
+ {
+ for (int j = 0; j < Cols; ++j)
+ {
  adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ }
+ else
+ {
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
+ assert(slice_get_length(col_slice) == ColSliceLength);
+
+ int jj = 0;
+ for (int j = 0; j < Cols; ++j)
+ {
+ bool in_col_slice = is_col_reversed
+ ? (j <= col_slice.start && j > col_slice.stop && (col_slice.start - j) % (-col_slice.step) == 0)
+ : (j >= col_slice.start && j < col_slice.stop && (j - col_slice.start) % col_slice.step == 0);
+
+ if (!in_col_slice)
+ {
+ adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ else
+ {
+ adj_value.data[ii][jj] += adj_ret.data[i][j];
+ ++jj;
+ }
+ }
+
+ assert(jj == ColSliceLength);
+ ++ii;
  }
  }
+
+ assert(ii == RowSliceLength);
  }
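At the Python level, these `assign_inplace` / `assign_copy` slice overloads appear to back slice-style writes into matrix values inside kernels. A hypothetical sketch of such usage (the exact kernel-side slicing syntax is an assumption on my part, not something shown in this diff):

```python
import warp as wp

@wp.kernel
def write_block(out: wp.array(dtype=wp.mat44)):
    m = wp.identity(n=4, dtype=float)
    # write a 2-vector into rows 1:3 of column 0 (assign_inplace with a row slice)
    m[1:3, 0] = wp.vec2(5.0, 6.0)
    # write a 2-vector into row 0, columns 2:4 (assign_inplace with a column slice)
    m[0, 2:4] = wp.vec2(7.0, 8.0)
    out[0] = m
```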


@@ -940,6 +2496,21 @@ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> add(const mat_t<Rows,Cols,Type>& a, c
  return t;
  }

+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> add(Type a, const mat_t<Rows,Cols,Type>& b)
+ {
+ mat_t<Rows,Cols,Type> t;
+ for (unsigned i=0; i < Rows; ++i)
+ {
+ for (unsigned j=0; j < Cols; ++j)
+ {
+ t.data[i][j] = a + b.data[i][j];
+ }
+ }
+
+ return t;
+ }
+
  template<unsigned Rows, unsigned Cols, typename Type>
  inline CUDA_CALLABLE mat_t<Rows,Cols,Type> sub(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b)
  {
@@ -955,6 +2526,21 @@ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> sub(const mat_t<Rows,Cols,Type>& a, c
  return t;
  }

+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> sub(Type a, const mat_t<Rows,Cols,Type>& b)
+ {
+ mat_t<Rows,Cols,Type> t;
+ for (unsigned i=0; i < Rows; ++i)
+ {
+ for (unsigned j=0; j < Cols; ++j)
+ {
+ t.data[i][j] = a - b.data[i][j];
+ }
+ }
+
+ return t;
+ }
+
  template<unsigned Rows, unsigned Cols, typename Type>
  inline CUDA_CALLABLE mat_t<Rows,Cols,Type> div(const mat_t<Rows,Cols,Type>& a, Type b)
  {
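The new `add(Type, mat_t)` and `sub(Type, mat_t)` overloads in the two hunks above provide element-wise scalar-matrix arithmetic with the scalar on the left-hand side. A hedged sketch of the corresponding kernel-level usage (assuming these overloads are exposed through the usual `+` / `-` operators, which this diff itself does not show):

```python
import warp as wp

@wp.kernel
def offset(out: wp.array(dtype=wp.mat22)):
    m = wp.mat22(1.0, 2.0, 3.0, 4.0)
    out[0] = 10.0 - m  # each element becomes 10 - m[i, j], via sub(Type, mat_t)
```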
@@ -1469,20 +3055,194 @@ template<unsigned Rows, unsigned Cols, typename Type>
  inline void CUDA_CALLABLE adj_extract(const mat_t<Rows,Cols,Type>& m, int row, int col, mat_t<Rows,Cols,Type>& adj_m, int& adj_row, int& adj_col, Type adj_ret)
  {
  #ifndef NDEBUG
- if (row < 0 || row > Rows)
+ if (row < -(int)Rows || row >= (int)Rows)
  {
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
  assert(0);
  }
- if (col < 0 || col > Cols)
+ if (col < -(int)Cols || col >= (int)Cols)
  {
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
  assert(0);
  }
  #endif
+
+ if (row < 0)
+ {
+ row += Rows;
+ }
+ if (col < 0)
+ {
+ col += Cols;
+ }
+
  adj_m.data[row][col] += adj_ret;
  }

+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_extract(
+ const mat_t<Rows,Cols,Type>& m, slice_t row_slice,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice,
+ const mat_t<RowSliceLength, ColSliceLength, Type>& adj_ret
+ )
+ {
+ static_assert(
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
+ );
+
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ bool is_row_reversed = row_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = row_slice.start;
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
+ i += row_slice.step
+ )
+ {
+ for (int j = 0; j < Cols; ++j)
+ {
+ adj_m.data[i][j] += adj_ret.data[ii][j];
+ }
+
+ ++ii;
+ }
+
+ assert(ii == RowSliceLength);
+ }
+
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_extract(
+ const mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col,
+ const vec_t<RowSliceLength, Type>& adj_ret
+ )
+ {
+ #ifndef NDEBUG
+ if (col < -(int)Cols || col >= (int)Cols)
+ {
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
+ assert(0);
+ }
+ #endif
+
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ if (col < 0)
+ {
+ col += Cols;
+ }
+
+ bool is_row_reversed = row_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = row_slice.start;
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
+ i += row_slice.step
+ )
+ {
+ adj_m.data[i][col] += adj_ret.c[ii];
+ ++ii;
+ }
+
+ assert(ii == RowSliceLength);
+ }
+
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_extract(
+ const mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice,
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, slice_t& adj_col_slice,
+ const vec_t<ColSliceLength, Type>& adj_ret
+ )
+ {
+ #ifndef NDEBUG
+ if (row < -(int)Rows || row >= (int)Rows)
+ {
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
+ assert(0);
+ }
+ #endif
+
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
+ assert(slice_get_length(col_slice) == ColSliceLength);
+
+ if (row < 0)
+ {
+ row += Rows;
+ }
+
+ bool is_col_reversed = col_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = col_slice.start;
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
+ i += col_slice.step
+ )
+ {
+ adj_m.data[row][i] += adj_ret.c[ii];
+ ++ii;
+ }
+
+ assert(ii == ColSliceLength);
+ }
+
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_extract(
+ const mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, slice_t& adj_col_slice,
+ const mat_t<RowSliceLength, ColSliceLength, Type>& adj_ret
+ )
+ {
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
+ assert(slice_get_length(col_slice) == ColSliceLength);
+
+ bool is_row_reversed = row_slice.step < 0;
+ bool is_col_reversed = col_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = row_slice.start;
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
+ i += row_slice.step
+ )
+ {
+ int jj = 0;
+ for (
+ int j = col_slice.start;
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
+ j += col_slice.step
+ )
+ {
+ adj_m.data[i][j] += adj_ret.data[ii][jj];
+ ++jj;
+ }
+
+ assert(jj == ColSliceLength);
+ ++ii;
+ }
+
+ assert(ii == RowSliceLength);
+ }
+
  template<unsigned Rows, unsigned Cols, typename Type>
  inline CUDA_CALLABLE void adj_outer(const vec_t<Rows,Type>& a, const vec_t<Cols,Type>& b, vec_t<Rows,Type>& adj_a, vec_t<Cols,Type>& adj_b, const mat_t<Rows,Cols,Type>& adj_ret)
  {
@@ -1503,6 +3263,23 @@ inline CUDA_CALLABLE void adj_add(const mat_t<Rows,Cols,Type>& a, const mat_t<Ro
  }
  }

+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_add(
+ Type a, const mat_t<Rows,Cols,Type>& b,
+ Type& adj_a, mat_t<Rows,Cols,Type>& adj_b,
+ const mat_t<Rows,Cols,Type>& adj_ret
+ )
+ {
+ for (unsigned i=0; i < Rows; ++i)
+ {
+ for (unsigned j=0; j < Cols; ++j)
+ {
+ adj_a += adj_ret.data[i][j];
+ adj_b.data[i][j] += adj_ret.data[i][j];
+ }
+ }
+ }
+
  template<unsigned Rows, unsigned Cols, typename Type>
  inline CUDA_CALLABLE void adj_sub(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b, mat_t<Rows,Cols,Type>& adj_a, mat_t<Rows,Cols,Type>& adj_b, const mat_t<Rows,Cols,Type>& adj_ret)
  {
@@ -1516,6 +3293,23 @@ inline CUDA_CALLABLE void adj_sub(const mat_t<Rows,Cols,Type>& a, const mat_t<Ro
  }
  }

+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_sub(
+ Type a, const mat_t<Rows,Cols,Type>& b,
+ Type& adj_a, mat_t<Rows,Cols,Type>& adj_b,
+ const mat_t<Rows,Cols,Type>& adj_ret
+ )
+ {
+ for (unsigned i=0; i < Rows; ++i)
+ {
+ for (unsigned j=0; j < Cols; ++j)
+ {
+ adj_a += adj_ret.data[i][j];
+ adj_b.data[i][j] -= adj_ret.data[i][j];
+ }
+ }
+ }
+
  template<unsigned Rows, unsigned Cols, typename Type>
  inline CUDA_CALLABLE void adj_div(const mat_t<Rows,Cols,Type>& a, Type s, mat_t<Rows,Cols,Type>& adj_a, Type& adj_s, const mat_t<Rows,Cols,Type>& adj_ret)
  {
@@ -1533,13 +3327,13 @@ inline CUDA_CALLABLE void adj_div(const mat_t<Rows,Cols,Type>& a, Type s, mat_t<
  template<unsigned Rows, unsigned Cols, typename Type>
  inline CUDA_CALLABLE void adj_div(Type s, const mat_t<Rows,Cols,Type>& a, Type& adj_s, mat_t<Rows,Cols,Type>& adj_a, const mat_t<Rows,Cols,Type>& adj_ret)
  {
- adj_s -= tensordot(a , adj_ret)/ (s * s); // - a / s^2
-
  for (unsigned i=0; i < Rows; ++i)
  {
  for (unsigned j=0; j < Cols; ++j)
  {
- adj_a.data[i][j] += s / adj_ret.data[i][j];
+ Type inv = Type(1) / a.data[i][j];
+ adj_a.data[i][j] -= s * adj_ret.data[i][j] * inv * inv;
+ adj_s += adj_ret.data[i][j] * inv;
  }
  }
  }
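The rewritten `adj_div(Type s, const mat_t& a, ...)` above fixes the adjoint of the element-wise quotient s / a: the gradient with respect to each matrix element is -s / a_ij^2 and the gradient with respect to s accumulates 1 / a_ij, whereas the removed lines effectively used the formulas for a / s and divided by the incoming adjoint. A standalone NumPy check of the corrected formulas against finite differences (for illustration only, independent of Warp):

```python
import numpy as np

def scalar_div_adjoint(s, a, adj_ret):
    # mirrors the corrected adj_div: inv = 1 / a[i][j], then
    # adj_a -= s * adj_ret * inv * inv and adj_s += adj_ret * inv
    inv = 1.0 / a
    adj_a = -s * adj_ret * inv * inv
    adj_s = np.sum(adj_ret * inv)
    return adj_a, adj_s

rng = np.random.default_rng(0)
a = rng.uniform(0.5, 2.0, size=(3, 3))   # keep entries away from zero
s = 1.7
adj_ret = rng.normal(size=(3, 3))         # incoming gradient of the output matrix

adj_a, adj_s = scalar_div_adjoint(s, a, adj_ret)

# finite-difference checks of d<adj_ret, s / a>/ds and of one element of a
eps = 1e-6
fd_s = (np.sum((s + eps) / a * adj_ret) - np.sum((s - eps) / a * adj_ret)) / (2 * eps)
a_p, a_m = a.copy(), a.copy()
a_p[1, 2] += eps
a_m[1, 2] -= eps
fd_a12 = (np.sum(s / a_p * adj_ret) - np.sum(s / a_m * adj_ret)) / (2 * eps)

print(np.isclose(adj_s, fd_s), np.isclose(adj_a[1, 2], fd_a12))
```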