warp-lang 1.8.0__py3-none-win_amd64.whl → 1.9.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (153) hide show
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +482 -110
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +48 -63
  7. warp/builtins.py +955 -137
  8. warp/codegen.py +327 -209
  9. warp/config.py +1 -1
  10. warp/context.py +1363 -800
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/examples/interop/example_jax_callable.py +34 -4
  18. warp/examples/interop/example_jax_kernel.py +27 -1
  19. warp/fabric.py +1 -1
  20. warp/fem/cache.py +27 -19
  21. warp/fem/domain.py +2 -2
  22. warp/fem/field/nodal_field.py +2 -2
  23. warp/fem/field/virtual.py +266 -166
  24. warp/fem/geometry/geometry.py +5 -5
  25. warp/fem/integrate.py +200 -91
  26. warp/fem/space/restriction.py +4 -0
  27. warp/fem/space/shape/tet_shape_function.py +3 -10
  28. warp/jax_experimental/custom_call.py +1 -1
  29. warp/jax_experimental/ffi.py +203 -54
  30. warp/marching_cubes.py +708 -0
  31. warp/native/array.h +103 -8
  32. warp/native/builtin.h +90 -9
  33. warp/native/bvh.cpp +64 -28
  34. warp/native/bvh.cu +58 -58
  35. warp/native/bvh.h +2 -2
  36. warp/native/clang/clang.cpp +7 -7
  37. warp/native/coloring.cpp +13 -3
  38. warp/native/crt.cpp +2 -2
  39. warp/native/crt.h +3 -5
  40. warp/native/cuda_util.cpp +42 -11
  41. warp/native/cuda_util.h +10 -4
  42. warp/native/exports.h +1842 -1908
  43. warp/native/fabric.h +2 -1
  44. warp/native/hashgrid.cpp +37 -37
  45. warp/native/hashgrid.cu +2 -2
  46. warp/native/initializer_array.h +1 -1
  47. warp/native/intersect.h +4 -4
  48. warp/native/mat.h +1913 -119
  49. warp/native/mathdx.cpp +43 -43
  50. warp/native/mesh.cpp +24 -24
  51. warp/native/mesh.cu +26 -26
  52. warp/native/mesh.h +5 -3
  53. warp/native/nanovdb/GridHandle.h +179 -12
  54. warp/native/nanovdb/HostBuffer.h +8 -7
  55. warp/native/nanovdb/NanoVDB.h +517 -895
  56. warp/native/nanovdb/NodeManager.h +323 -0
  57. warp/native/nanovdb/PNanoVDB.h +2 -2
  58. warp/native/quat.h +337 -16
  59. warp/native/rand.h +7 -7
  60. warp/native/range.h +7 -1
  61. warp/native/reduce.cpp +10 -10
  62. warp/native/reduce.cu +13 -14
  63. warp/native/runlength_encode.cpp +2 -2
  64. warp/native/runlength_encode.cu +5 -5
  65. warp/native/scan.cpp +3 -3
  66. warp/native/scan.cu +4 -4
  67. warp/native/sort.cpp +10 -10
  68. warp/native/sort.cu +22 -22
  69. warp/native/sparse.cpp +8 -8
  70. warp/native/sparse.cu +14 -14
  71. warp/native/spatial.h +366 -17
  72. warp/native/svd.h +23 -8
  73. warp/native/temp_buffer.h +2 -2
  74. warp/native/tile.h +303 -70
  75. warp/native/tile_radix_sort.h +5 -1
  76. warp/native/tile_reduce.h +16 -25
  77. warp/native/tuple.h +2 -2
  78. warp/native/vec.h +385 -18
  79. warp/native/volume.cpp +54 -54
  80. warp/native/volume.cu +1 -1
  81. warp/native/volume.h +2 -1
  82. warp/native/volume_builder.cu +30 -37
  83. warp/native/warp.cpp +150 -149
  84. warp/native/warp.cu +337 -193
  85. warp/native/warp.h +227 -226
  86. warp/optim/linear.py +736 -271
  87. warp/render/imgui_manager.py +289 -0
  88. warp/render/render_opengl.py +137 -57
  89. warp/render/render_usd.py +0 -1
  90. warp/sim/collide.py +1 -2
  91. warp/sim/graph_coloring.py +2 -2
  92. warp/sim/integrator_vbd.py +10 -2
  93. warp/sparse.py +559 -176
  94. warp/tape.py +2 -0
  95. warp/tests/aux_test_module_aot.py +7 -0
  96. warp/tests/cuda/test_async.py +3 -3
  97. warp/tests/cuda/test_conditional_captures.py +101 -0
  98. warp/tests/geometry/test_marching_cubes.py +233 -12
  99. warp/tests/sim/test_cloth.py +89 -6
  100. warp/tests/sim/test_coloring.py +82 -7
  101. warp/tests/test_array.py +56 -5
  102. warp/tests/test_assert.py +53 -0
  103. warp/tests/test_atomic_cas.py +127 -114
  104. warp/tests/test_codegen.py +3 -2
  105. warp/tests/test_context.py +8 -15
  106. warp/tests/test_enum.py +136 -0
  107. warp/tests/test_examples.py +2 -2
  108. warp/tests/test_fem.py +45 -2
  109. warp/tests/test_fixedarray.py +229 -0
  110. warp/tests/test_func.py +18 -15
  111. warp/tests/test_future_annotations.py +7 -5
  112. warp/tests/test_linear_solvers.py +30 -0
  113. warp/tests/test_map.py +1 -1
  114. warp/tests/test_mat.py +1540 -378
  115. warp/tests/test_mat_assign_copy.py +178 -0
  116. warp/tests/test_mat_constructors.py +574 -0
  117. warp/tests/test_module_aot.py +287 -0
  118. warp/tests/test_print.py +69 -0
  119. warp/tests/test_quat.py +162 -34
  120. warp/tests/test_quat_assign_copy.py +145 -0
  121. warp/tests/test_reload.py +2 -1
  122. warp/tests/test_sparse.py +103 -0
  123. warp/tests/test_spatial.py +140 -34
  124. warp/tests/test_spatial_assign_copy.py +160 -0
  125. warp/tests/test_static.py +48 -0
  126. warp/tests/test_struct.py +43 -3
  127. warp/tests/test_tape.py +38 -0
  128. warp/tests/test_types.py +0 -20
  129. warp/tests/test_vec.py +216 -441
  130. warp/tests/test_vec_assign_copy.py +143 -0
  131. warp/tests/test_vec_constructors.py +325 -0
  132. warp/tests/tile/test_tile.py +206 -152
  133. warp/tests/tile/test_tile_cholesky.py +605 -0
  134. warp/tests/tile/test_tile_load.py +169 -0
  135. warp/tests/tile/test_tile_mathdx.py +2 -558
  136. warp/tests/tile/test_tile_matmul.py +179 -0
  137. warp/tests/tile/test_tile_mlp.py +1 -1
  138. warp/tests/tile/test_tile_reduce.py +100 -11
  139. warp/tests/tile/test_tile_shared_memory.py +16 -16
  140. warp/tests/tile/test_tile_sort.py +59 -55
  141. warp/tests/unittest_suites.py +16 -0
  142. warp/tests/walkthrough_debug.py +1 -1
  143. warp/thirdparty/unittest_parallel.py +108 -9
  144. warp/types.py +554 -264
  145. warp/utils.py +68 -86
  146. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
  147. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/RECORD +150 -138
  148. warp/native/marching.cpp +0 -19
  149. warp/native/marching.cu +0 -514
  150. warp/native/marching.h +0 -19
  151. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
  152. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
  153. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
warp/native/mat.h CHANGED
@@ -177,12 +177,12 @@ struct mat_t
177
177
 
178
178
  CUDA_CALLABLE vec_t<Cols,Type> get_row(int index) const
179
179
  {
180
- return (vec_t<Cols,Type>&)data[index];
180
+ return reinterpret_cast<const vec_t<Cols,Type>&>(data[index]);
181
181
  }
182
182
 
183
183
  CUDA_CALLABLE void set_row(int index, const vec_t<Cols,Type>& v)
184
184
  {
185
- (vec_t<Cols,Type>&)data[index] = v;
185
+ reinterpret_cast<vec_t<Cols,Type>&>(data[index]) = v;
186
186
  }
187
187
 
188
188
  CUDA_CALLABLE vec_t<Rows,Type> get_col(int index) const
@@ -204,7 +204,7 @@ struct mat_t
204
204
  }
205
205
 
206
206
  // row major storage assumed to be compatible with PyTorch
207
- Type data[Rows][Cols];
207
+ Type data[Rows < 1 ? 1 : Rows][Cols < 1 ? 1 : Cols];
208
208
  };
209
209
 
210
210
  template<typename Type>
@@ -477,6 +477,20 @@ template<unsigned Rows, unsigned Cols, typename Type>
477
477
  inline CUDA_CALLABLE vec_t<Cols,Type> extract(const mat_t<Rows,Cols,Type>& m, int row)
478
478
  {
479
479
  vec_t<Cols,Type> ret;
480
+
481
+ #ifndef NDEBUG
482
+ if (row < -(int)Rows || row >= (int)Rows)
483
+ {
484
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
485
+ assert(0);
486
+ }
487
+ #endif
488
+
489
+ if (row < 0)
490
+ {
491
+ row += Rows;
492
+ }
493
+
480
494
  for(unsigned i=0; i < Cols; ++i)
481
495
  {
482
496
  ret.c[i] = m.data[row][i];
@@ -488,31 +502,206 @@ template<unsigned Rows, unsigned Cols, typename Type>
488
502
  inline CUDA_CALLABLE Type extract(const mat_t<Rows,Cols,Type>& m, int row, int col)
489
503
  {
490
504
  #ifndef NDEBUG
491
- if (row < 0 || row >= Rows)
505
+ if (row < -(int)Rows || row >= (int)Rows)
492
506
  {
493
507
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
494
508
  assert(0);
495
509
  }
496
- if (col < 0 || col >= Cols)
510
+ if (col < -(int)Cols || col >= (int)Cols)
497
511
  {
498
512
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
499
513
  assert(0);
500
514
  }
501
515
  #endif
516
+
517
+ if (row < 0)
518
+ {
519
+ row += Rows;
520
+ }
521
+ if (col < 0)
522
+ {
523
+ col += Cols;
524
+ }
525
+
502
526
  return m.data[row][col];
503
527
  }
504
528
 
529
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
530
+ inline CUDA_CALLABLE mat_t<RowSliceLength, ColSliceLength, Type> extract(const mat_t<Rows,Cols,Type>& m, slice_t row_slice)
531
+ {
532
+ static_assert(
533
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
534
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
535
+ );
536
+
537
+ mat_t<RowSliceLength, ColSliceLength, Type> ret;
538
+
539
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
540
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
541
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
542
+ assert(slice_get_length(row_slice) == RowSliceLength);
543
+
544
+ bool is_row_reversed = row_slice.step < 0;
545
+
546
+ int ii = 0;
547
+ for (
548
+ int i = row_slice.start;
549
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
550
+ i += row_slice.step
551
+ )
552
+ {
553
+ for (int j = 0; j < Cols; ++j)
554
+ {
555
+ ret.data[ii][j] = m.data[i][j];
556
+ }
557
+
558
+ ++ii;
559
+ }
560
+
561
+ assert(ii == RowSliceLength);
562
+ return ret;
563
+ }
564
+
565
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
566
+ inline CUDA_CALLABLE vec_t<RowSliceLength, Type> extract(const mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col)
567
+ {
568
+ #ifndef NDEBUG
569
+ if (col < -(int)Cols || col >= (int)Cols)
570
+ {
571
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
572
+ assert(0);
573
+ }
574
+ #endif
575
+
576
+ vec_t<RowSliceLength, Type> ret;
577
+
578
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
579
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
580
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
581
+ assert(slice_get_length(row_slice) == RowSliceLength);
582
+
583
+ if (col < 0)
584
+ {
585
+ col += Cols;
586
+ }
587
+
588
+ bool is_row_reversed = row_slice.step < 0;
589
+
590
+ int ii = 0;
591
+ for (
592
+ int i = row_slice.start;
593
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
594
+ i += row_slice.step
595
+ )
596
+ {
597
+ ret.c[ii] = m.data[i][col];
598
+ ++ii;
599
+ }
600
+
601
+ assert(ii == RowSliceLength);
602
+ return ret;
603
+ }
604
+
605
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
606
+ inline CUDA_CALLABLE vec_t<ColSliceLength, Type> extract(const mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice)
607
+ {
608
+ #ifndef NDEBUG
609
+ if (row < -(int)Rows || row >= (int)Rows)
610
+ {
611
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
612
+ assert(0);
613
+ }
614
+ #endif
615
+
616
+ vec_t<ColSliceLength, Type> ret;
617
+
618
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
619
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
620
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
621
+ assert(slice_get_length(col_slice) == ColSliceLength);
622
+
623
+ if (row < 0)
624
+ {
625
+ row += Rows;
626
+ }
627
+
628
+ bool is_col_reversed = col_slice.step < 0;
629
+
630
+ int ii = 0;
631
+ for (
632
+ int i = col_slice.start;
633
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
634
+ i += col_slice.step
635
+ )
636
+ {
637
+ ret.c[ii] = m.data[row][i];
638
+ ++ii;
639
+ }
640
+
641
+ assert(ii == ColSliceLength);
642
+ return ret;
643
+ }
644
+
645
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
646
+ inline CUDA_CALLABLE mat_t<RowSliceLength, ColSliceLength, Type> extract(const mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice)
647
+ {
648
+ mat_t<RowSliceLength, ColSliceLength, Type> ret;
649
+
650
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
651
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
652
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
653
+ assert(slice_get_length(row_slice) == RowSliceLength);
654
+
655
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
656
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
657
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
658
+ assert(slice_get_length(col_slice) == ColSliceLength);
659
+
660
+ bool is_row_reversed = row_slice.step < 0;
661
+ bool is_col_reversed = col_slice.step < 0;
662
+
663
+ int ii = 0;
664
+ for (
665
+ int i = row_slice.start;
666
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
667
+ i += row_slice.step
668
+ )
669
+ {
670
+ int jj = 0;
671
+ for (
672
+ int j = col_slice.start;
673
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
674
+ j += col_slice.step
675
+ )
676
+ {
677
+ ret.data[ii][jj] = m.data[i][j];
678
+ ++jj;
679
+ }
680
+
681
+ assert(jj == ColSliceLength);
682
+ ++ii;
683
+ }
684
+
685
+ assert(ii == RowSliceLength);
686
+ return ret;
687
+ }
688
+
505
689
  template<unsigned Rows, unsigned Cols, typename Type>
506
690
  inline CUDA_CALLABLE vec_t<Cols, Type>* index(mat_t<Rows,Cols,Type>& m, int row)
507
691
  {
508
692
  #ifndef NDEBUG
509
- if (row < 0 || row >= Rows)
693
+ if (row < -(int)Rows || row >= (int)Rows)
510
694
  {
511
695
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
512
696
  assert(0);
513
697
  }
514
698
  #endif
515
699
 
700
+ if (row < 0)
701
+ {
702
+ row += Rows;
703
+ }
704
+
516
705
  return reinterpret_cast<vec_t<Cols, Type>*>(&m.data[row]);
517
706
  }
518
707
 
@@ -520,38 +709,46 @@ template<unsigned Rows, unsigned Cols, typename Type>
520
709
  inline CUDA_CALLABLE Type* index(mat_t<Rows,Cols,Type>& m, int row, int col)
521
710
  {
522
711
  #ifndef NDEBUG
523
- if (row < 0 || row >= Rows)
712
+ if (row < -(int)Rows || row >= (int)Rows)
524
713
  {
525
714
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
526
715
  assert(0);
527
716
  }
528
- if (col < 0 || col >= Cols)
717
+ if (col < -(int)Cols || col >= (int)Cols)
529
718
  {
530
719
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
531
720
  assert(0);
532
721
  }
533
722
  #endif
534
723
 
724
+ if (row < 0)
725
+ {
726
+ row += Rows;
727
+ }
728
+ if (col < 0)
729
+ {
730
+ col += Cols;
731
+ }
732
+
535
733
  return &m.data[row][col];
536
734
  }
537
735
 
538
736
  template<unsigned Rows, unsigned Cols, typename Type>
539
737
  inline CUDA_CALLABLE void adj_index(const mat_t<Rows,Cols,Type>& m, int row,
540
- const mat_t<Rows,Cols,Type>& adj_m, int adj_row, const vec_t<Cols, Type>& adj_value)
738
+ const mat_t<Rows,Cols,Type>& adj_m, int adj_row, const vec_t<Cols, Type>& adj_value)
541
739
  {
542
740
  // nop
543
741
  }
544
742
 
545
743
  template<unsigned Rows, unsigned Cols, typename Type>
546
744
  inline CUDA_CALLABLE void adj_index(const mat_t<Rows,Cols,Type>& m, int row, int col,
547
- const mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, Type adj_value)
745
+ const mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, Type adj_value)
548
746
  {
549
747
  // nop
550
748
  }
551
749
 
552
-
553
750
  template<unsigned Rows, unsigned Cols, typename Type>
554
- inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
751
+ inline CUDA_CALLABLE Type* indexref(mat_t<Rows,Cols,Type>* m, int row, int col)
555
752
  {
556
753
  #ifndef NDEBUG
557
754
  if (row < 0 || row >= Rows)
@@ -566,201 +763,1091 @@ inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, int row, int col
566
763
  }
567
764
  #endif
568
765
 
569
- m.data[row][col] += value;
766
+ return &(m->data)[row][col];
570
767
  }
571
768
 
572
-
573
769
  template<unsigned Rows, unsigned Cols, typename Type>
574
- inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
770
+ inline CUDA_CALLABLE void adj_indexref(mat_t<Rows,Cols,Type>* m, int row, int col,
771
+ mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, const Type& adj_value)
575
772
  {
576
- #ifndef NDEBUG
577
- if (row < 0 || row >= Rows)
578
- {
579
- printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
580
- assert(0);
581
- }
582
- #endif
583
-
584
- for(unsigned i=0; i < Cols; ++i)
585
- {
586
- m.data[row][i] += value[i];
587
- }
773
+ // nop
588
774
  }
589
775
 
590
776
 
591
777
  template<unsigned Rows, unsigned Cols, typename Type>
592
- inline CUDA_CALLABLE void adj_add_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
593
- mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, Type& adj_value)
778
+ inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
594
779
  {
595
780
  #ifndef NDEBUG
596
- if (row < 0 || row >= Rows)
781
+ if (row < -(int)Rows || row >= (int)Rows)
597
782
  {
598
783
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
599
784
  assert(0);
600
785
  }
601
- if (col < 0 || col >= Cols)
786
+ if (col < -(int)Cols || col >= (int)Cols)
602
787
  {
603
788
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
604
789
  assert(0);
605
790
  }
606
791
  #endif
607
792
 
608
- adj_value += adj_m.data[row][col];
793
+ if (row < 0)
794
+ {
795
+ row += Rows;
796
+ }
797
+ if (col < 0)
798
+ {
799
+ col += Cols;
800
+ }
801
+
802
+ m.data[row][col] += value;
609
803
  }
610
804
 
611
805
 
612
806
  template<unsigned Rows, unsigned Cols, typename Type>
613
- inline CUDA_CALLABLE void adj_add_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
614
- mat_t<Rows,Cols,Type>& adj_m, int adj_row, vec_t<Cols,Type>& adj_value)
807
+ inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
615
808
  {
616
809
  #ifndef NDEBUG
617
- if (row < 0 || row >= Rows)
810
+ if (row < -(int)Rows || row >= (int)Rows)
618
811
  {
619
812
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
620
813
  assert(0);
621
814
  }
622
815
  #endif
623
816
 
817
+ if (row < 0)
818
+ {
819
+ row += Rows;
820
+ }
821
+
624
822
  for(unsigned i=0; i < Cols; ++i)
625
823
  {
626
- adj_value[i] += adj_m.data[row][i];
824
+ m.data[row][i] += value[i];
627
825
  }
628
826
  }
629
827
 
630
828
 
631
- template<unsigned Rows, unsigned Cols, typename Type>
632
- inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
829
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
830
+ inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
633
831
  {
634
- #ifndef NDEBUG
635
- if (row < 0 || row >= Rows)
832
+ static_assert(
833
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
834
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
835
+ );
836
+
837
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
838
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
839
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
840
+ assert(slice_get_length(row_slice) == RowSliceLength);
841
+
842
+ bool is_row_reversed = row_slice.step < 0;
843
+
844
+ int ii = 0;
845
+ for (
846
+ int i = row_slice.start;
847
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
848
+ i += row_slice.step
849
+ )
636
850
  {
637
- printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
638
- assert(0);
851
+ for (int j = 0; j < Cols; ++j)
852
+ {
853
+ m.data[i][j] += value.data[ii][j];
854
+ }
855
+
856
+ ++ii;
639
857
  }
640
- if (col < 0 || col >= Cols)
858
+
859
+ assert(ii == RowSliceLength);
860
+ }
861
+
862
+
863
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
864
+ inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value)
865
+ {
866
+ #ifndef NDEBUG
867
+ if (col < -(int)Cols || col >= (int)Cols)
641
868
  {
642
869
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
643
870
  assert(0);
644
871
  }
645
872
  #endif
646
873
 
647
- m.data[row][col] -= value;
874
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
875
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
876
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
877
+ assert(slice_get_length(row_slice) == RowSliceLength);
878
+
879
+ if (col < 0)
880
+ {
881
+ col += Cols;
882
+ }
883
+
884
+ bool is_row_reversed = row_slice.step < 0;
885
+
886
+ int ii = 0;
887
+ for (
888
+ int i = row_slice.start;
889
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
890
+ i += row_slice.step
891
+ )
892
+ {
893
+ m.data[i][col] += value.c[ii];
894
+ ++ii;
895
+ }
896
+
897
+ assert(ii == RowSliceLength);
648
898
  }
649
899
 
650
900
 
651
- template<unsigned Rows, unsigned Cols, typename Type>
652
- inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
901
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
902
+ inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value)
653
903
  {
654
904
  #ifndef NDEBUG
655
- if (row < 0 || row >= Rows)
905
+ if (row < -(int)Rows || row >= (int)Rows)
656
906
  {
657
907
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
658
908
  assert(0);
659
909
  }
660
910
  #endif
661
911
 
662
- for(unsigned i=0; i < Cols; ++i)
912
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
913
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
914
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
915
+ assert(slice_get_length(col_slice) == ColSliceLength);
916
+
917
+ if (row < 0)
663
918
  {
664
- m.data[row][i] -= value[i];
919
+ row += Rows;
920
+ }
921
+
922
+ bool is_col_reversed = col_slice.step < 0;
923
+
924
+ int ii = 0;
925
+ for (
926
+ int i = col_slice.start;
927
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
928
+ i += col_slice.step
929
+ )
930
+ {
931
+ m.data[row][i] += value.c[ii];
932
+ ++ii;
933
+ }
934
+
935
+ assert(ii == ColSliceLength);
936
+ }
937
+
938
+
939
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
940
+ inline CUDA_CALLABLE void add_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
941
+ {
942
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
943
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
944
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
945
+ assert(slice_get_length(row_slice) == RowSliceLength);
946
+
947
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
948
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
949
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
950
+ assert(slice_get_length(col_slice) == ColSliceLength);
951
+
952
+ bool is_row_reversed = row_slice.step < 0;
953
+ bool is_col_reversed = col_slice.step < 0;
954
+
955
+ int ii = 0;
956
+ for (
957
+ int i = row_slice.start;
958
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
959
+ i += row_slice.step
960
+ )
961
+ {
962
+ int jj = 0;
963
+ for (
964
+ int j = col_slice.start;
965
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
966
+ j += col_slice.step
967
+ )
968
+ {
969
+ m.data[i][j] += value.data[ii][jj];
970
+ ++jj;
971
+ }
972
+
973
+ assert(jj == ColSliceLength);
974
+ ++ii;
665
975
  }
976
+
977
+ assert(ii == RowSliceLength);
666
978
  }
667
979
 
668
980
 
669
981
  template<unsigned Rows, unsigned Cols, typename Type>
670
- inline CUDA_CALLABLE void adj_sub_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
982
+ inline CUDA_CALLABLE void adj_add_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
671
983
  mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, Type& adj_value)
672
984
  {
673
985
  #ifndef NDEBUG
674
- if (row < 0 || row >= Rows)
986
+ if (row < -(int)Rows || row >= (int)Rows)
675
987
  {
676
988
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
677
989
  assert(0);
678
990
  }
679
- if (col < 0 || col >= Cols)
991
+ if (col < -(int)Cols || col >= (int)Cols)
680
992
  {
681
993
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
682
994
  assert(0);
683
995
  }
684
996
  #endif
685
997
 
686
- adj_value -= adj_m.data[row][col];
998
+ if (row < 0)
999
+ {
1000
+ row += Rows;
1001
+ }
1002
+ if (col < 0)
1003
+ {
1004
+ col += Cols;
1005
+ }
1006
+
1007
+ adj_value += adj_m.data[row][col];
687
1008
  }
688
1009
 
689
1010
 
690
1011
  template<unsigned Rows, unsigned Cols, typename Type>
691
- inline CUDA_CALLABLE void adj_sub_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
1012
+ inline CUDA_CALLABLE void adj_add_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
692
1013
  mat_t<Rows,Cols,Type>& adj_m, int adj_row, vec_t<Cols,Type>& adj_value)
693
1014
  {
694
1015
  #ifndef NDEBUG
695
- if (row < 0 || row >= Rows)
1016
+ if (row < -(int)Rows || row >= (int)Rows)
696
1017
  {
697
1018
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
698
1019
  assert(0);
699
1020
  }
700
1021
  #endif
701
1022
 
1023
+ if (row < 0)
1024
+ {
1025
+ row += Rows;
1026
+ }
1027
+
702
1028
  for(unsigned i=0; i < Cols; ++i)
703
1029
  {
704
- adj_value[i] -= adj_m.data[row][i];
1030
+ adj_value[i] += adj_m.data[row][i];
705
1031
  }
706
1032
  }
707
1033
 
708
1034
 
709
- template<unsigned Rows, unsigned Cols, typename Type>
710
- inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
1035
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1036
+ inline CUDA_CALLABLE void adj_add_inplace(
1037
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
1038
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
1039
+ )
711
1040
  {
712
- #ifndef NDEBUG
713
- if (row < 0 || row >= Rows)
714
- {
715
- printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
716
- assert(0);
717
- }
718
- if (col < 0 || col >= Cols)
1041
+ static_assert(
1042
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
1043
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
1044
+ );
1045
+
1046
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1047
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1048
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1049
+ assert(slice_get_length(row_slice) == RowSliceLength);
1050
+
1051
+ bool is_row_reversed = row_slice.step < 0;
1052
+
1053
+ int ii = 0;
1054
+ for (
1055
+ int i = row_slice.start;
1056
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1057
+ i += row_slice.step
1058
+ )
719
1059
  {
720
- printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
721
- assert(0);
1060
+ for (int j = 0; j < Cols; ++j)
1061
+ {
1062
+ adj_value.data[ii][j] += adj_m.data[i][j];
1063
+ }
1064
+
1065
+ ++ii;
722
1066
  }
723
- #endif
724
1067
 
725
- m.data[row][col] = value;
1068
+ assert(ii == RowSliceLength);
726
1069
  }
727
1070
 
728
1071
 
729
- template<unsigned Rows, unsigned Cols, typename Type>
730
- inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
1072
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
1073
+ inline CUDA_CALLABLE void adj_add_inplace(
1074
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value,
1075
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col, vec_t<RowSliceLength, Type>& adj_value
1076
+ )
731
1077
  {
732
1078
  #ifndef NDEBUG
733
- if (row < 0 || row >= Rows)
1079
+ if (col < -(int)Cols || col >= (int)Cols)
734
1080
  {
735
- printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1081
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
736
1082
  assert(0);
737
1083
  }
738
1084
  #endif
739
1085
 
740
- for(unsigned i=0; i < Cols; ++i)
1086
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1087
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1088
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1089
+ assert(slice_get_length(row_slice) == RowSliceLength);
1090
+
1091
+ if (col < 0)
741
1092
  {
742
- m.data[row][i] = value[i];
1093
+ col += Cols;
1094
+ }
1095
+
1096
+ bool is_row_reversed = row_slice.step < 0;
1097
+
1098
+ int ii = 0;
1099
+ for (
1100
+ int i = row_slice.start;
1101
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1102
+ i += row_slice.step
1103
+ )
1104
+ {
1105
+ adj_value.c[ii] += adj_m.data[i][col];
1106
+ ++ii;
743
1107
  }
1108
+
1109
+ assert(ii == RowSliceLength);
744
1110
  }
745
1111
 
746
1112
 
747
- template<unsigned Rows, unsigned Cols, typename Type>
748
- inline CUDA_CALLABLE void adj_assign_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
749
- mat_t<Rows,Cols,Type>& adj_m, int& adj_row, int& adj_col, Type& adj_value)
1113
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1114
+ inline CUDA_CALLABLE void adj_add_inplace(
1115
+ mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value,
1116
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, slice_t& adj_col_slice, vec_t<ColSliceLength, Type>& adj_value
1117
+ )
750
1118
  {
751
1119
  #ifndef NDEBUG
752
- if (row < 0 || row >= Rows)
1120
+ if (row < -(int)Rows || row >= (int)Rows)
753
1121
  {
754
1122
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
755
1123
  assert(0);
756
1124
  }
757
- if (col < 0 || col >= Cols)
1125
+ #endif
1126
+
1127
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1128
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1129
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1130
+ assert(slice_get_length(col_slice) == ColSliceLength);
1131
+
1132
+ if (row < 0)
1133
+ {
1134
+ row += Rows;
1135
+ }
1136
+
1137
+ bool is_col_reversed = col_slice.step < 0;
1138
+
1139
+ int ii = 0;
1140
+ for (
1141
+ int i = col_slice.start;
1142
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
1143
+ i += col_slice.step
1144
+ )
1145
+ {
1146
+ adj_value.c[ii] += adj_m.data[row][i];
1147
+ ++ii;
1148
+ }
1149
+
1150
+ assert(ii == ColSliceLength);
1151
+ }
1152
+
1153
+
1154
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1155
+ inline CUDA_CALLABLE void adj_add_inplace(
1156
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
1157
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, slice_t& adj_col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
1158
+ )
1159
+ {
1160
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1161
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1162
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1163
+ assert(slice_get_length(row_slice) == RowSliceLength);
1164
+
1165
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1166
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1167
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1168
+ assert(slice_get_length(col_slice) == ColSliceLength);
1169
+
1170
+ bool is_row_reversed = row_slice.step < 0;
1171
+ bool is_col_reversed = col_slice.step < 0;
1172
+
1173
+ int ii = 0;
1174
+ for (
1175
+ int i = row_slice.start;
1176
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1177
+ i += row_slice.step
1178
+ )
1179
+ {
1180
+ int jj = 0;
1181
+ for (
1182
+ int j = col_slice.start;
1183
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
1184
+ j += col_slice.step
1185
+ )
1186
+ {
1187
+ adj_value.data[ii][jj] += adj_m.data[i][j];
1188
+ ++jj;
1189
+ }
1190
+
1191
+ assert(jj == ColSliceLength);
1192
+ ++ii;
1193
+ }
1194
+
1195
+ assert(ii == RowSliceLength);
1196
+ }
1197
+
1198
+
1199
+ template<unsigned Rows, unsigned Cols, typename Type>
1200
+ inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
1201
+ {
1202
+ #ifndef NDEBUG
1203
+ if (row < -(int)Rows || row >= (int)Rows)
1204
+ {
1205
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1206
+ assert(0);
1207
+ }
1208
+ if (col < -(int)Cols || col >= (int)Cols)
1209
+ {
1210
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
1211
+ assert(0);
1212
+ }
1213
+ #endif
1214
+
1215
+ if (row < 0)
1216
+ {
1217
+ row += Rows;
1218
+ }
1219
+ if (col < 0)
1220
+ {
1221
+ col += Cols;
1222
+ }
1223
+
1224
+ m.data[row][col] -= value;
1225
+ }
1226
+
1227
+
1228
+ template<unsigned Rows, unsigned Cols, typename Type>
1229
+ inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
1230
+ {
1231
+ #ifndef NDEBUG
1232
+ if (row < -(int)Rows || row >= (int)Rows)
1233
+ {
1234
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1235
+ assert(0);
1236
+ }
1237
+ #endif
1238
+
1239
+ if (row < 0)
1240
+ {
1241
+ row += Rows;
1242
+ }
1243
+
1244
+ for(unsigned i=0; i < Cols; ++i)
1245
+ {
1246
+ m.data[row][i] -= value[i];
1247
+ }
1248
+ }
1249
+
1250
+
1251
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1252
+ inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
1253
+ {
1254
+ static_assert(
1255
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
1256
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
1257
+ );
1258
+
1259
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1260
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1261
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1262
+ assert(slice_get_length(row_slice) == RowSliceLength);
1263
+
1264
+ bool is_row_reversed = row_slice.step < 0;
1265
+
1266
+ int ii = 0;
1267
+ for (
1268
+ int i = row_slice.start;
1269
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1270
+ i += row_slice.step
1271
+ )
1272
+ {
1273
+ for (int j = 0; j < Cols; ++j)
1274
+ {
1275
+ m.data[i][j] -= value.data[ii][j];
1276
+ }
1277
+
1278
+ ++ii;
1279
+ }
1280
+
1281
+ assert(ii == RowSliceLength);
1282
+ }
1283
+
1284
+
1285
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
1286
+ inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value)
1287
+ {
1288
+ #ifndef NDEBUG
1289
+ if (col < -(int)Cols || col >= (int)Cols)
1290
+ {
1291
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
1292
+ assert(0);
1293
+ }
1294
+ #endif
1295
+
1296
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1297
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1298
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1299
+ assert(slice_get_length(row_slice) == RowSliceLength);
1300
+
1301
+ if (col < 0)
1302
+ {
1303
+ col += Cols;
1304
+ }
1305
+
1306
+ bool is_row_reversed = row_slice.step < 0;
1307
+
1308
+ int ii = 0;
1309
+ for (
1310
+ int i = row_slice.start;
1311
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1312
+ i += row_slice.step
1313
+ )
1314
+ {
1315
+ m.data[i][col] -= value.c[ii];
1316
+ ++ii;
1317
+ }
1318
+
1319
+ assert(ii == RowSliceLength);
1320
+ }
1321
+
1322
+
1323
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1324
+ inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value)
1325
+ {
1326
+ #ifndef NDEBUG
1327
+ if (row < -(int)Rows || row >= (int)Rows)
1328
+ {
1329
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1330
+ assert(0);
1331
+ }
1332
+ #endif
1333
+
1334
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1335
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1336
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1337
+ assert(slice_get_length(col_slice) == ColSliceLength);
1338
+
1339
+ if (row < 0)
1340
+ {
1341
+ row += Rows;
1342
+ }
1343
+
1344
+ bool is_col_reversed = col_slice.step < 0;
1345
+
1346
+ int ii = 0;
1347
+ for (
1348
+ int i = col_slice.start;
1349
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
1350
+ i += col_slice.step
1351
+ )
1352
+ {
1353
+ m.data[row][i] -= value.c[ii];
1354
+ ++ii;
1355
+ }
1356
+
1357
+ assert(ii == ColSliceLength);
1358
+ }
1359
+
1360
+
1361
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1362
+ inline CUDA_CALLABLE void sub_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
1363
+ {
1364
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1365
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1366
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1367
+ assert(slice_get_length(row_slice) == RowSliceLength);
1368
+
1369
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1370
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1371
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1372
+ assert(slice_get_length(col_slice) == ColSliceLength);
1373
+
1374
+ bool is_row_reversed = row_slice.step < 0;
1375
+ bool is_col_reversed = col_slice.step < 0;
1376
+
1377
+ int ii = 0;
1378
+ for (
1379
+ int i = row_slice.start;
1380
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1381
+ i += row_slice.step
1382
+ )
1383
+ {
1384
+ int jj = 0;
1385
+ for (
1386
+ int j = col_slice.start;
1387
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
1388
+ j += col_slice.step
1389
+ )
1390
+ {
1391
+ m.data[i][j] -= value.data[ii][jj];
1392
+ ++jj;
1393
+ }
1394
+
1395
+ assert(jj == ColSliceLength);
1396
+ ++ii;
1397
+ }
1398
+
1399
+ assert(ii == RowSliceLength);
1400
+ }
1401
+
1402
+
1403
+ template<unsigned Rows, unsigned Cols, typename Type>
1404
+ inline CUDA_CALLABLE void adj_sub_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
1405
+ mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, Type& adj_value)
1406
+ {
1407
+ #ifndef NDEBUG
1408
+ if (row < -(int)Rows || row >= (int)Rows)
1409
+ {
1410
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1411
+ assert(0);
1412
+ }
1413
+ if (col < -(int)Cols || col >= (int)Cols)
758
1414
  {
759
1415
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
760
1416
  assert(0);
761
1417
  }
762
1418
  #endif
763
1419
 
1420
+ if (row < 0)
1421
+ {
1422
+ row += Rows;
1423
+ }
1424
+ if (col < 0)
1425
+ {
1426
+ col += Cols;
1427
+ }
1428
+
1429
+ adj_value -= adj_m.data[row][col];
1430
+ }
1431
+
1432
+
1433
+ template<unsigned Rows, unsigned Cols, typename Type>
1434
+ inline CUDA_CALLABLE void adj_sub_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
1435
+ mat_t<Rows,Cols,Type>& adj_m, int adj_row, vec_t<Cols,Type>& adj_value)
1436
+ {
1437
+ #ifndef NDEBUG
1438
+ if (row < -(int)Rows || row >= (int)Rows)
1439
+ {
1440
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1441
+ assert(0);
1442
+ }
1443
+ #endif
1444
+
1445
+ if (row < 0)
1446
+ {
1447
+ row += Rows;
1448
+ }
1449
+
1450
+ for(unsigned i=0; i < Cols; ++i)
1451
+ {
1452
+ adj_value[i] -= adj_m.data[row][i];
1453
+ }
1454
+ }
1455
+
1456
+
1457
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1458
+ inline CUDA_CALLABLE void adj_sub_inplace(
1459
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
1460
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
1461
+ )
1462
+ {
1463
+ static_assert(
1464
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
1465
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
1466
+ );
1467
+
1468
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1469
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1470
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1471
+ assert(slice_get_length(row_slice) == RowSliceLength);
1472
+
1473
+ bool is_row_reversed = row_slice.step < 0;
1474
+
1475
+ int ii = 0;
1476
+ for (
1477
+ int i = row_slice.start;
1478
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1479
+ i += row_slice.step
1480
+ )
1481
+ {
1482
+ for (int j = 0; j < Cols; ++j)
1483
+ {
1484
+ adj_value.data[ii][j] -= adj_m.data[i][j];
1485
+ }
1486
+
1487
+ ++ii;
1488
+ }
1489
+
1490
+ assert(ii == RowSliceLength);
1491
+ }
1492
+
1493
+
1494
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
1495
+ inline CUDA_CALLABLE void adj_sub_inplace(
1496
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value,
1497
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col, vec_t<RowSliceLength, Type>& adj_value
1498
+ )
1499
+ {
1500
+ #ifndef NDEBUG
1501
+ if (col < -(int)Cols || col >= (int)Cols)
1502
+ {
1503
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
1504
+ assert(0);
1505
+ }
1506
+ #endif
1507
+
1508
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1509
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1510
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1511
+ assert(slice_get_length(row_slice) == RowSliceLength);
1512
+
1513
+ if (col < 0)
1514
+ {
1515
+ col += Cols;
1516
+ }
1517
+
1518
+ bool is_row_reversed = row_slice.step < 0;
1519
+
1520
+ int ii = 0;
1521
+ for (
1522
+ int i = row_slice.start;
1523
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1524
+ i += row_slice.step
1525
+ )
1526
+ {
1527
+ adj_value.c[ii] -= adj_m.data[i][col];
1528
+ ++ii;
1529
+ }
1530
+
1531
+ assert(ii == RowSliceLength);
1532
+ }
1533
+
1534
+
1535
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1536
+ inline CUDA_CALLABLE void adj_sub_inplace(
1537
+ mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value,
1538
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, slice_t& adj_col_slice, vec_t<ColSliceLength, Type>& adj_value
1539
+ )
1540
+ {
1541
+ #ifndef NDEBUG
1542
+ if (row < -(int)Rows || row >= (int)Rows)
1543
+ {
1544
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1545
+ assert(0);
1546
+ }
1547
+ #endif
1548
+
1549
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1550
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1551
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1552
+ assert(slice_get_length(col_slice) == ColSliceLength);
1553
+
1554
+ if (row < 0)
1555
+ {
1556
+ row += Rows;
1557
+ }
1558
+
1559
+ bool is_col_reversed = col_slice.step < 0;
1560
+
1561
+ int ii = 0;
1562
+ for (
1563
+ int i = col_slice.start;
1564
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
1565
+ i += col_slice.step
1566
+ )
1567
+ {
1568
+ adj_value.c[ii] -= adj_m.data[row][i];
1569
+ ++ii;
1570
+ }
1571
+
1572
+ assert(ii == ColSliceLength);
1573
+ }
1574
+
1575
+
1576
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1577
+ inline CUDA_CALLABLE void adj_sub_inplace(
1578
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
1579
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, slice_t& adj_col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
1580
+ )
1581
+ {
1582
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1583
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1584
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1585
+ assert(slice_get_length(row_slice) == RowSliceLength);
1586
+
1587
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1588
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1589
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1590
+ assert(slice_get_length(col_slice) == ColSliceLength);
1591
+
1592
+ bool is_row_reversed = row_slice.step < 0;
1593
+ bool is_col_reversed = col_slice.step < 0;
1594
+
1595
+ int ii = 0;
1596
+ for (
1597
+ int i = row_slice.start;
1598
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1599
+ i += row_slice.step
1600
+ )
1601
+ {
1602
+ int jj = 0;
1603
+ for (
1604
+ int j = col_slice.start;
1605
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
1606
+ j += col_slice.step
1607
+ )
1608
+ {
1609
+ adj_value.data[ii][jj] -= adj_m.data[i][j];
1610
+ ++jj;
1611
+ }
1612
+
1613
+ assert(jj == ColSliceLength);
1614
+ ++ii;
1615
+ }
1616
+
1617
+ assert(ii == RowSliceLength);
1618
+ }
1619
+
1620
+
1621
+ template<unsigned Rows, unsigned Cols, typename Type>
1622
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
1623
+ {
1624
+ #ifndef NDEBUG
1625
+ if (row < -(int)Rows || row >= (int)Rows)
1626
+ {
1627
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1628
+ assert(0);
1629
+ }
1630
+ if (col < -(int)Cols || col >= (int)Cols)
1631
+ {
1632
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
1633
+ assert(0);
1634
+ }
1635
+ #endif
1636
+
1637
+ if (row < 0)
1638
+ {
1639
+ row += Rows;
1640
+ }
1641
+ if (col < 0)
1642
+ {
1643
+ col += Cols;
1644
+ }
1645
+
1646
+ m.data[row][col] = value;
1647
+ }
1648
+
1649
+
1650
+ template<unsigned Rows, unsigned Cols, typename Type>
1651
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
1652
+ {
1653
+ #ifndef NDEBUG
1654
+ if (row < -(int)Rows || row >= (int)Rows)
1655
+ {
1656
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1657
+ assert(0);
1658
+ }
1659
+ #endif
1660
+
1661
+ if (row < 0)
1662
+ {
1663
+ row += Rows;
1664
+ }
1665
+
1666
+ for(unsigned i=0; i < Cols; ++i)
1667
+ {
1668
+ m.data[row][i] = value[i];
1669
+ }
1670
+ }
1671
+
1672
+
1673
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1674
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
1675
+ {
1676
+ static_assert(
1677
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
1678
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
1679
+ );
1680
+
1681
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1682
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1683
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1684
+ assert(slice_get_length(row_slice) == RowSliceLength);
1685
+
1686
+ bool is_row_reversed = row_slice.step < 0;
1687
+
1688
+ int ii = 0;
1689
+ for (
1690
+ int i = row_slice.start;
1691
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1692
+ i += row_slice.step
1693
+ )
1694
+ {
1695
+ for (int j = 0; j < Cols; ++j)
1696
+ {
1697
+ m.data[i][j] = value.data[ii][j];
1698
+ }
1699
+
1700
+ ++ii;
1701
+ }
1702
+
1703
+ assert(ii == RowSliceLength);
1704
+ }
1705
+
1706
+
1707
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
1708
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value)
1709
+ {
1710
+ #ifndef NDEBUG
1711
+ if (col < -(int)Cols || col >= (int)Cols)
1712
+ {
1713
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
1714
+ assert(0);
1715
+ }
1716
+ #endif
1717
+
1718
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1719
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1720
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1721
+ assert(slice_get_length(row_slice) == RowSliceLength);
1722
+
1723
+ if (col < 0)
1724
+ {
1725
+ col += Cols;
1726
+ }
1727
+
1728
+ bool is_row_reversed = row_slice.step < 0;
1729
+
1730
+ int ii = 0;
1731
+ for (
1732
+ int i = row_slice.start;
1733
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1734
+ i += row_slice.step
1735
+ )
1736
+ {
1737
+ m.data[i][col] = value.c[ii];
1738
+ ++ii;
1739
+ }
1740
+
1741
+ assert(ii == RowSliceLength);
1742
+ }
1743
+
1744
+
1745
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1746
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value)
1747
+ {
1748
+ #ifndef NDEBUG
1749
+ if (row < -(int)Rows || row >= (int)Rows)
1750
+ {
1751
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1752
+ assert(0);
1753
+ }
1754
+ #endif
1755
+
1756
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1757
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1758
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1759
+ assert(slice_get_length(col_slice) == ColSliceLength);
1760
+
1761
+ if (row < 0)
1762
+ {
1763
+ row += Rows;
1764
+ }
1765
+
1766
+ bool is_col_reversed = col_slice.step < 0;
1767
+
1768
+ int ii = 0;
1769
+ for (
1770
+ int i = col_slice.start;
1771
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
1772
+ i += col_slice.step
1773
+ )
1774
+ {
1775
+ m.data[row][i] = value.c[ii];
1776
+ ++ii;
1777
+ }
1778
+
1779
+ assert(ii == ColSliceLength);
1780
+ }
1781
+
1782
+
1783
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1784
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
1785
+ {
1786
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1787
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
1788
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
1789
+ assert(slice_get_length(row_slice) == RowSliceLength);
1790
+
1791
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
1792
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
1793
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
1794
+ assert(slice_get_length(col_slice) == ColSliceLength);
1795
+
1796
+ bool is_row_reversed = row_slice.step < 0;
1797
+ bool is_col_reversed = col_slice.step < 0;
1798
+
1799
+ int ii = 0;
1800
+ for (
1801
+ int i = row_slice.start;
1802
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
1803
+ i += row_slice.step
1804
+ )
1805
+ {
1806
+ int jj = 0;
1807
+ for (
1808
+ int j = col_slice.start;
1809
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
1810
+ j += col_slice.step
1811
+ )
1812
+ {
1813
+ m.data[i][j] = value.data[ii][jj];
1814
+ ++jj;
1815
+ }
1816
+
1817
+ assert(jj == ColSliceLength);
1818
+ ++ii;
1819
+ }
1820
+
1821
+ assert(ii == RowSliceLength);
1822
+ }
1823
+
1824
+
1825
+ template<unsigned Rows, unsigned Cols, typename Type>
1826
+ inline CUDA_CALLABLE void adj_assign_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
1827
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, int& adj_col, Type& adj_value)
1828
+ {
1829
+ #ifndef NDEBUG
1830
+ if (row < -(int)Rows || row >= (int)Rows)
1831
+ {
1832
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
1833
+ assert(0);
1834
+ }
1835
+ if (col < -(int)Cols || col >= (int)Cols)
1836
+ {
1837
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
1838
+ assert(0);
1839
+ }
1840
+ #endif
1841
+
1842
+ if (row < 0)
1843
+ {
1844
+ row += Rows;
1845
+ }
1846
+ if (col < 0)
1847
+ {
1848
+ col += Cols;
1849
+ }
1850
+
764
1851
  adj_value += adj_m.data[row][col];
765
1852
  }
766
1853
 
@@ -770,13 +1857,18 @@ inline CUDA_CALLABLE void adj_assign_inplace(mat_t<Rows,Cols,Type>& m, int row,
770
1857
  mat_t<Rows,Cols,Type>& adj_m, int& adj_row, vec_t<Cols,Type>& adj_value)
771
1858
  {
772
1859
  #ifndef NDEBUG
773
- if (row < 0 || row >= Rows)
1860
+ if (row < -(int)Rows || row >= (int)Rows)
774
1861
  {
775
1862
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
776
1863
  assert(0);
777
1864
  }
778
1865
  #endif
779
1866
 
1867
+ if (row < 0)
1868
+ {
1869
+ row += Rows;
1870
+ }
1871
+
780
1872
  for(unsigned i=0; i < Cols; ++i)
781
1873
  {
782
1874
  adj_value[i] += adj_m.data[row][i];
@@ -784,99 +1876,563 @@ inline CUDA_CALLABLE void adj_assign_inplace(mat_t<Rows,Cols,Type>& m, int row,
  }


+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_inplace(
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
+ )
+ {
+ static_assert(
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
+ );
+
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ bool is_row_reversed = row_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = row_slice.start;
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
+ i += row_slice.step
+ )
+ {
+ for (int j = 0; j < Cols; ++j)
+ {
+ adj_value.data[ii][j] += adj_m.data[i][j];
+ }
+
+ ++ii;
+ }
+
+ assert(ii == RowSliceLength);
+ }
+
+
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_inplace(
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col, vec_t<RowSliceLength, Type>& adj_value
+ )
+ {
+ #ifndef NDEBUG
+ if (col < -(int)Cols || col >= (int)Cols)
+ {
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
+ assert(0);
+ }
+ #endif
+
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ if (col < 0)
+ {
+ col += Cols;
+ }
+
+ bool is_row_reversed = row_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = row_slice.start;
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
+ i += row_slice.step
+ )
+ {
+ adj_value.c[ii] += adj_m.data[i][col];
+ ++ii;
+ }
+
+ assert(ii == RowSliceLength);
+ }
+
+
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_inplace(
+ mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, slice_t& adj_col_slice, vec_t<ColSliceLength, Type>& adj_value
+ )
+ {
+ #ifndef NDEBUG
+ if (row < -(int)Rows || row >= (int)Rows)
+ {
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
+ assert(0);
+ }
+ #endif
+
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
+ assert(slice_get_length(col_slice) == ColSliceLength);
+
+ if (row < 0)
+ {
+ row += Rows;
+ }
+
+ bool is_col_reversed = col_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = col_slice.start;
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
+ i += col_slice.step
+ )
+ {
+ adj_value.c[ii] += adj_m.data[row][i];
+ ++ii;
+ }
+
+ assert(ii == ColSliceLength);
+ }
+
+
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_inplace(
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, slice_t& adj_col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
+ )
+ {
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
+ assert(slice_get_length(col_slice) == ColSliceLength);
+
+ bool is_row_reversed = row_slice.step < 0;
+ bool is_col_reversed = col_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = row_slice.start;
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
+ i += row_slice.step
+ )
+ {
+ int jj = 0;
+ for (
+ int j = col_slice.start;
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
+ j += col_slice.step
+ )
+ {
+ adj_value.data[ii][jj] += adj_m.data[i][j];
+ ++jj;
+ }
+
+ assert(jj == ColSliceLength);
+ ++ii;
+ }
+
+ assert(ii == RowSliceLength);
+ }
+
+
+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
+ {
+ #ifndef NDEBUG
+ if (row < -(int)Rows || row >= (int)Rows)
+ {
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
+ assert(0);
+ }
+ if (col < -(int)Cols || col >= (int)Cols)
+ {
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
+ assert(0);
+ }
+ #endif
+
+ if (row < 0)
+ {
+ row += Rows;
+ }
+ if (col < 0)
+ {
+ col += Cols;
+ }
+
+ mat_t<Rows,Cols,Type> ret(m);
+ ret.data[row][col] = value;
+ return ret;
+ }
+
+
+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
+ {
+ #ifndef NDEBUG
+ if (row < -(int)Rows || row >= (int)Rows)
+ {
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
+ assert(0);
+ }
+ #endif
+
+ if (row < 0)
+ {
+ row += Rows;
+ }
+
+ mat_t<Rows,Cols,Type> ret(m);
+ for(unsigned i=0; i < Cols; ++i)
+ {
+ ret.data[row][i] = value[i];
+ }
+ return ret;
+ }
+
+
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
+ {
+ mat_t<Rows, Cols, Type> ret(m);
+ assign_inplace(ret, row_slice, value);
+ return ret;
+ }
+
+
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value)
+ {
+ mat_t<Rows, Cols, Type> ret(m);
+ assign_inplace(ret, row_slice, col, value);
+ return ret;
+ }
+
+
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value)
+ {
+ mat_t<Rows, Cols, Type> ret(m);
+ assign_inplace(ret, row, col_slice, value);
+ return ret;
+ }
+
+
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
+ {
+ mat_t<Rows, Cols, Type> ret(m);
+ assign_inplace(ret, row_slice, col_slice, value);
+ return ret;
+ }
+
+
  template<unsigned Rows, unsigned Cols, typename Type>
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
+ inline CUDA_CALLABLE void adj_assign_copy(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, int& adj_col, Type& adj_value, const mat_t<Rows,Cols,Type>& adj_ret)
  {
  #ifndef NDEBUG
- if (row < 0 || row >= Rows)
+ if (row < -(int)Rows || row >= (int)Rows)
  {
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
  assert(0);
  }
- if (col < 0 || col >= Cols)
+ if (col < -(int)Cols || col >= (int)Cols)
  {
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
  assert(0);
  }
  #endif

- mat_t<Rows,Cols,Type> ret(m);
- ret.data[row][col] = value;
- return ret;
+ if (row < 0)
+ {
+ row += Rows;
+ }
+ if (col < 0)
+ {
+ col += Cols;
+ }
+
+ adj_value += adj_ret.data[row][col];
+ for(unsigned i=0; i < Rows; ++i)
+ {
+ for(unsigned j=0; j < Cols; ++j)
+ {
+ if(i != row || j != col)
+ adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ }
  }


  template<unsigned Rows, unsigned Cols, typename Type>
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
+ inline CUDA_CALLABLE void adj_assign_copy(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, vec_t<Cols,Type>& adj_value, const mat_t<Rows,Cols,Type>& adj_ret)
  {
  #ifndef NDEBUG
- if (row < 0 || row >= Rows)
+ if (row < -(int)Rows || row >= (int)Rows)
  {
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
  assert(0);
  }
  #endif

- mat_t<Rows,Cols,Type> ret(m);
- for(unsigned i=0; i < Cols; ++i)
+ if (row < 0)
  {
- ret.data[row][i] = value[i];
+ row += Rows;
+ }
+
+ for(unsigned i=0; i < Rows; ++i)
+ {
+ for(unsigned j=0; j < Cols; ++j)
+ {
+ if (i==row)
+ adj_value[j] += adj_ret.data[i][j];
+ else
+ adj_m.data[i][j] += adj_ret.data[i][j];
+ }
  }
- return ret;
  }


- template<unsigned Rows, unsigned Cols, typename Type>
- inline CUDA_CALLABLE void adj_assign_copy(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
- mat_t<Rows,Cols,Type>& adj_m, int& adj_row, int& adj_col, Type& adj_value, const mat_t<Rows,Cols,Type>& adj_ret)
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_copy(
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value,
+ mat_t<Rows,Cols,Type>& adj_ret
+ )
  {
- #ifndef NDEBUG
- if (row < 0 || row >= Rows)
+ static_assert(
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
+ );
+
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ bool is_row_reversed = row_slice.step < 0;
+
+ int ii = 0;
+ for (int i = 0; i < Rows; ++i)
  {
- printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
- assert(0);
+ bool in_row_slice = is_row_reversed
+ ? (i <= row_slice.start && i > row_slice.stop && (row_slice.start - i) % (-row_slice.step) == 0)
+ : (i >= row_slice.start && i < row_slice.stop && (i - row_slice.start) % row_slice.step == 0);
+
+ if (!in_row_slice)
+ {
+ for (int j = 0; j < Cols; ++j)
+ {
+ adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ }
+ else
+ {
+ for (int j = 0; j < Cols; ++j)
+ {
+ adj_value.data[ii][j] += adj_ret.data[i][j];
+ }
+
+ ++ii;
+ }
  }
- if (col < 0 || col >= Cols)
+
+ assert(ii == RowSliceLength);
+ }
+
+
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_copy(
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col, vec_t<RowSliceLength, Type>& adj_value,
+ mat_t<Rows,Cols,Type>& adj_ret
+ )
+ {
+ #ifndef NDEBUG
+ if (col < -(int)Cols || col >= (int)Cols)
  {
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
  assert(0);
  }
  #endif

- adj_value += adj_ret.data[row][col];
- for(unsigned i=0; i < Rows; ++i)
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ if (col < 0)
  {
- for(unsigned j=0; j < Cols; ++j)
+ col += Cols;
+ }
+
+ bool is_row_reversed = row_slice.step < 0;
+
+ int ii = 0;
+ for (int i = 0; i < Rows; ++i)
+ {
+ bool in_row_slice = is_row_reversed
+ ? (i <= row_slice.start && i > row_slice.stop && (row_slice.start - i) % (-row_slice.step) == 0)
+ : (i >= row_slice.start && i < row_slice.stop && (i - row_slice.start) % row_slice.step == 0);
+
+ if (!in_row_slice)
  {
- if(i != row || j != col)
+ for (int j = 0; j < Cols; ++j)
+ {
  adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ }
+ else
+ {
+ for (int j = 0; j < Cols; ++j)
+ {
+ if (j != col)
+ {
+ adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ else
+ {
+ adj_value.c[ii] += adj_ret.data[i][j];
+ }
+ }
+
+ ++ii;
  }
  }
+
+ assert(ii == RowSliceLength);
  }


- template<unsigned Rows, unsigned Cols, typename Type>
- inline CUDA_CALLABLE void adj_assign_copy(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
- mat_t<Rows,Cols,Type>& adj_m, int& adj_row, vec_t<Cols,Type>& adj_value, const mat_t<Rows,Cols,Type>& adj_ret)
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_copy(
+ mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, slice_t& adj_col_slice, vec_t<ColSliceLength, Type>& adj_value,
+ mat_t<Rows,Cols,Type>& adj_ret
+ )
  {
  #ifndef NDEBUG
- if (row < 0 || row >= Rows)
+ if (row < -(int)Rows || row >= (int)Rows)
  {
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
  assert(0);
  }
  #endif

- for(unsigned i=0; i < Rows; ++i)
+ if (row < 0)
  {
- for(unsigned j=0; j < Cols; ++j)
+ row += Rows;
+ }
+
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
+ assert(slice_get_length(col_slice) == ColSliceLength);
+
+ bool is_col_reversed = col_slice.step < 0;
+
+ int ii = 0;
+ for (int i = 0; i < Rows; ++i)
+ {
+ if (i != row)
  {
- if (i==row)
- adj_value[j] += adj_ret.data[i][j];
- else
+ for (int j = 0; j < Cols; ++j)
+ {
+ adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ }
+ else
+ {
+ for (int j = 0; j < Cols; ++j)
+ {
+ bool in_col_slice = is_col_reversed
+ ? (j <= col_slice.start && j > col_slice.stop && (col_slice.start - j) % (-col_slice.step) == 0)
+ : (j >= col_slice.start && j < col_slice.stop && (j - col_slice.start) % col_slice.step == 0);
+
+ if (!in_col_slice)
+ {
+ adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ else
+ {
+ adj_value.c[ii] += adj_ret.data[i][j];
+ ++ii;
+ }
+ }
+ }
+ }
+
+ assert(ii == ColSliceLength);
+ }
+
+
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_assign_copy(
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, slice_t& adj_col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value,
+ mat_t<Rows,Cols,Type>& adj_ret
+ )
+ {
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ bool is_row_reversed = row_slice.step < 0;
+ bool is_col_reversed = col_slice.step < 0;
+
+ int ii = 0;
+ for (int i = 0; i < Rows; ++i)
+ {
+ bool in_row_slice = is_row_reversed
+ ? (i <= row_slice.start && i > row_slice.stop && (row_slice.start - i) % (-row_slice.step) == 0)
+ : (i >= row_slice.start && i < row_slice.stop && (i - row_slice.start) % row_slice.step == 0);
+
+ if (!in_row_slice)
+ {
+ for (int j = 0; j < Cols; ++j)
+ {
  adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ }
+ else
+ {
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
+ assert(slice_get_length(col_slice) == ColSliceLength);
+
+ int jj = 0;
+ for (int j = 0; j < Cols; ++j)
+ {
+ bool in_col_slice = is_col_reversed
+ ? (j <= col_slice.start && j > col_slice.stop && (col_slice.start - j) % (-col_slice.step) == 0)
+ : (j >= col_slice.start && j < col_slice.stop && (j - col_slice.start) % col_slice.step == 0);
+
+ if (!in_col_slice)
+ {
+ adj_m.data[i][j] += adj_ret.data[i][j];
+ }
+ else
+ {
+ adj_value.data[ii][jj] += adj_ret.data[i][j];
+ ++jj;
+ }
+ }
+
+ assert(jj == ColSliceLength);
+ ++ii;
  }
  }
+
+ assert(ii == RowSliceLength);
  }
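At the Python level, these `assign_inplace` / `assign_copy` slice overloads appear to back slice-style writes into matrix values inside kernels. A hypothetical sketch of such usage (the exact kernel-side slicing syntax is an assumption on my part, not something shown in this diff):

```python
import warp as wp

@wp.kernel
def write_block(out: wp.array(dtype=wp.mat44)):
    m = wp.identity(n=4, dtype=float)
    # write a 2-vector into rows 1:3 of column 0 (assign_inplace with a row slice)
    m[1:3, 0] = wp.vec2(5.0, 6.0)
    # write a 2-vector into row 0, columns 2:4 (assign_inplace with a column slice)
    m[0, 2:4] = wp.vec2(7.0, 8.0)
    out[0] = m
```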


@@ -940,6 +2496,21 @@ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> add(const mat_t<Rows,Cols,Type>& a, c
  return t;
  }

+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> add(Type a, const mat_t<Rows,Cols,Type>& b)
+ {
+ mat_t<Rows,Cols,Type> t;
+ for (unsigned i=0; i < Rows; ++i)
+ {
+ for (unsigned j=0; j < Cols; ++j)
+ {
+ t.data[i][j] = a + b.data[i][j];
+ }
+ }
+
+ return t;
+ }
+
  template<unsigned Rows, unsigned Cols, typename Type>
  inline CUDA_CALLABLE mat_t<Rows,Cols,Type> sub(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b)
  {
@@ -955,6 +2526,21 @@ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> sub(const mat_t<Rows,Cols,Type>& a, c
  return t;
  }

+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> sub(Type a, const mat_t<Rows,Cols,Type>& b)
+ {
+ mat_t<Rows,Cols,Type> t;
+ for (unsigned i=0; i < Rows; ++i)
+ {
+ for (unsigned j=0; j < Cols; ++j)
+ {
+ t.data[i][j] = a - b.data[i][j];
+ }
+ }
+
+ return t;
+ }
+
  template<unsigned Rows, unsigned Cols, typename Type>
  inline CUDA_CALLABLE mat_t<Rows,Cols,Type> div(const mat_t<Rows,Cols,Type>& a, Type b)
  {
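The new `add(Type, mat_t)` and `sub(Type, mat_t)` overloads in the two hunks above provide element-wise scalar-matrix arithmetic with the scalar on the left-hand side. A hedged sketch of the corresponding kernel-level usage (assuming these overloads are exposed through the usual `+` / `-` operators, which this diff itself does not show):

```python
import warp as wp

@wp.kernel
def offset(out: wp.array(dtype=wp.mat22)):
    m = wp.mat22(1.0, 2.0, 3.0, 4.0)
    out[0] = 10.0 - m  # each element becomes 10 - m[i, j], via sub(Type, mat_t)
```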
@@ -1469,20 +3055,194 @@ template<unsigned Rows, unsigned Cols, typename Type>
  inline void CUDA_CALLABLE adj_extract(const mat_t<Rows,Cols,Type>& m, int row, int col, mat_t<Rows,Cols,Type>& adj_m, int& adj_row, int& adj_col, Type adj_ret)
  {
  #ifndef NDEBUG
- if (row < 0 || row > Rows)
+ if (row < -(int)Rows || row >= (int)Rows)
  {
  printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
  assert(0);
  }
- if (col < 0 || col > Cols)
+ if (col < -(int)Cols || col >= (int)Cols)
  {
  printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
  assert(0);
  }
  #endif
+
+ if (row < 0)
+ {
+ row += Rows;
+ }
+ if (col < 0)
+ {
+ col += Cols;
+ }
+
  adj_m.data[row][col] += adj_ret;
  }

+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_extract(
+ const mat_t<Rows,Cols,Type>& m, slice_t row_slice,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice,
+ const mat_t<RowSliceLength, ColSliceLength, Type>& adj_ret
+ )
+ {
+ static_assert(
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
+ );
+
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ bool is_row_reversed = row_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = row_slice.start;
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
+ i += row_slice.step
+ )
+ {
+ for (int j = 0; j < Cols; ++j)
+ {
+ adj_m.data[i][j] += adj_ret.data[ii][j];
+ }
+
+ ++ii;
+ }
+
+ assert(ii == RowSliceLength);
+ }
+
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_extract(
+ const mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col,
+ const vec_t<RowSliceLength, Type>& adj_ret
+ )
+ {
+ #ifndef NDEBUG
+ if (col < -(int)Cols || col >= (int)Cols)
+ {
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
+ assert(0);
+ }
+ #endif
+
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ if (col < 0)
+ {
+ col += Cols;
+ }
+
+ bool is_row_reversed = row_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = row_slice.start;
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
+ i += row_slice.step
+ )
+ {
+ adj_m.data[i][col] += adj_ret.c[ii];
+ ++ii;
+ }
+
+ assert(ii == RowSliceLength);
+ }
+
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_extract(
+ const mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice,
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, slice_t& adj_col_slice,
+ const vec_t<ColSliceLength, Type>& adj_ret
+ )
+ {
+ #ifndef NDEBUG
+ if (row < -(int)Rows || row >= (int)Rows)
+ {
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
+ assert(0);
+ }
+ #endif
+
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
+ assert(slice_get_length(col_slice) == ColSliceLength);
+
+ if (row < 0)
+ {
+ row += Rows;
+ }
+
+ bool is_col_reversed = col_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = col_slice.start;
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
+ i += col_slice.step
+ )
+ {
+ adj_m.data[row][i] += adj_ret.c[ii];
+ ++ii;
+ }
+
+ assert(ii == ColSliceLength);
+ }
+
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_extract(
+ const mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice,
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, slice_t& adj_col_slice,
+ const mat_t<RowSliceLength, ColSliceLength, Type>& adj_ret
+ )
+ {
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
+ assert(slice_get_length(row_slice) == RowSliceLength);
+
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
+ assert(slice_get_length(col_slice) == ColSliceLength);
+
+ bool is_row_reversed = row_slice.step < 0;
+ bool is_col_reversed = col_slice.step < 0;
+
+ int ii = 0;
+ for (
+ int i = row_slice.start;
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
+ i += row_slice.step
+ )
+ {
+ int jj = 0;
+ for (
+ int j = col_slice.start;
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
+ j += col_slice.step
+ )
+ {
+ adj_m.data[i][j] += adj_ret.data[ii][jj];
+ ++jj;
+ }
+
+ assert(jj == ColSliceLength);
+ ++ii;
+ }
+
+ assert(ii == RowSliceLength);
+ }
+
  template<unsigned Rows, unsigned Cols, typename Type>
  inline CUDA_CALLABLE void adj_outer(const vec_t<Rows,Type>& a, const vec_t<Cols,Type>& b, vec_t<Rows,Type>& adj_a, vec_t<Cols,Type>& adj_b, const mat_t<Rows,Cols,Type>& adj_ret)
  {
@@ -1503,6 +3263,23 @@ inline CUDA_CALLABLE void adj_add(const mat_t<Rows,Cols,Type>& a, const mat_t<Ro
  }
  }

+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_add(
+ Type a, const mat_t<Rows,Cols,Type>& b,
+ Type& adj_a, mat_t<Rows,Cols,Type>& adj_b,
+ const mat_t<Rows,Cols,Type>& adj_ret
+ )
+ {
+ for (unsigned i=0; i < Rows; ++i)
+ {
+ for (unsigned j=0; j < Cols; ++j)
+ {
+ adj_a += adj_ret.data[i][j];
+ adj_b.data[i][j] += adj_ret.data[i][j];
+ }
+ }
+ }
+
  template<unsigned Rows, unsigned Cols, typename Type>
  inline CUDA_CALLABLE void adj_sub(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b, mat_t<Rows,Cols,Type>& adj_a, mat_t<Rows,Cols,Type>& adj_b, const mat_t<Rows,Cols,Type>& adj_ret)
  {
@@ -1516,6 +3293,23 @@ inline CUDA_CALLABLE void adj_sub(const mat_t<Rows,Cols,Type>& a, const mat_t<Ro
  }
  }

+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_sub(
+ Type a, const mat_t<Rows,Cols,Type>& b,
+ Type& adj_a, mat_t<Rows,Cols,Type>& adj_b,
+ const mat_t<Rows,Cols,Type>& adj_ret
+ )
+ {
+ for (unsigned i=0; i < Rows; ++i)
+ {
+ for (unsigned j=0; j < Cols; ++j)
+ {
+ adj_a += adj_ret.data[i][j];
+ adj_b.data[i][j] -= adj_ret.data[i][j];
+ }
+ }
+ }
+
  template<unsigned Rows, unsigned Cols, typename Type>
  inline CUDA_CALLABLE void adj_div(const mat_t<Rows,Cols,Type>& a, Type s, mat_t<Rows,Cols,Type>& adj_a, Type& adj_s, const mat_t<Rows,Cols,Type>& adj_ret)
  {
@@ -1533,13 +3327,13 @@ inline CUDA_CALLABLE void adj_div(const mat_t<Rows,Cols,Type>& a, Type s, mat_t<
  template<unsigned Rows, unsigned Cols, typename Type>
  inline CUDA_CALLABLE void adj_div(Type s, const mat_t<Rows,Cols,Type>& a, Type& adj_s, mat_t<Rows,Cols,Type>& adj_a, const mat_t<Rows,Cols,Type>& adj_ret)
  {
- adj_s -= tensordot(a , adj_ret)/ (s * s); // - a / s^2
-
  for (unsigned i=0; i < Rows; ++i)
  {
  for (unsigned j=0; j < Cols; ++j)
  {
- adj_a.data[i][j] += s / adj_ret.data[i][j];
+ Type inv = Type(1) / a.data[i][j];
+ adj_a.data[i][j] -= s * adj_ret.data[i][j] * inv * inv;
+ adj_s += adj_ret.data[i][j] * inv;
  }
  }
  }
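The rewritten `adj_div(Type s, const mat_t& a, ...)` above fixes the adjoint of the element-wise quotient s / a: the gradient with respect to each matrix element is -s / a_ij^2 and the gradient with respect to s accumulates 1 / a_ij, whereas the removed lines effectively used the formulas for a / s and divided by the incoming adjoint. A standalone NumPy check of the corrected formulas against finite differences (for illustration only, independent of Warp):

```python
import numpy as np

def scalar_div_adjoint(s, a, adj_ret):
    # mirrors the corrected adj_div: inv = 1 / a[i][j], then
    # adj_a -= s * adj_ret * inv * inv and adj_s += adj_ret * inv
    inv = 1.0 / a
    adj_a = -s * adj_ret * inv * inv
    adj_s = np.sum(adj_ret * inv)
    return adj_a, adj_s

rng = np.random.default_rng(0)
a = rng.uniform(0.5, 2.0, size=(3, 3))   # keep entries away from zero
s = 1.7
adj_ret = rng.normal(size=(3, 3))         # incoming gradient of the output matrix

adj_a, adj_s = scalar_div_adjoint(s, a, adj_ret)

# finite-difference checks of d<adj_ret, s / a>/ds and of one element of a
eps = 1e-6
fd_s = (np.sum((s + eps) / a * adj_ret) - np.sum((s - eps) / a * adj_ret)) / (2 * eps)
a_p, a_m = a.copy(), a.copy()
a_p[1, 2] += eps
a_m[1, 2] -= eps
fd_a12 = (np.sum(s / a_p * adj_ret) - np.sum(s / a_m * adj_ret)) / (2 * eps)

print(np.isclose(adj_s, fd_s), np.isclose(adj_a[1, 2], fd_a12))
```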