warp-lang 1.8.0-py3-none-manylinux_2_34_aarch64.whl → 1.9.0-py3-none-manylinux_2_34_aarch64.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (153)
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +482 -110
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +48 -63
  7. warp/builtins.py +955 -137
  8. warp/codegen.py +327 -209
  9. warp/config.py +1 -1
  10. warp/context.py +1363 -800
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/examples/interop/example_jax_callable.py +34 -4
  18. warp/examples/interop/example_jax_kernel.py +27 -1
  19. warp/fabric.py +1 -1
  20. warp/fem/cache.py +27 -19
  21. warp/fem/domain.py +2 -2
  22. warp/fem/field/nodal_field.py +2 -2
  23. warp/fem/field/virtual.py +266 -166
  24. warp/fem/geometry/geometry.py +5 -5
  25. warp/fem/integrate.py +200 -91
  26. warp/fem/space/restriction.py +4 -0
  27. warp/fem/space/shape/tet_shape_function.py +3 -10
  28. warp/jax_experimental/custom_call.py +1 -1
  29. warp/jax_experimental/ffi.py +203 -54
  30. warp/marching_cubes.py +708 -0
  31. warp/native/array.h +103 -8
  32. warp/native/builtin.h +90 -9
  33. warp/native/bvh.cpp +64 -28
  34. warp/native/bvh.cu +58 -58
  35. warp/native/bvh.h +2 -2
  36. warp/native/clang/clang.cpp +7 -7
  37. warp/native/coloring.cpp +13 -3
  38. warp/native/crt.cpp +2 -2
  39. warp/native/crt.h +3 -5
  40. warp/native/cuda_util.cpp +42 -11
  41. warp/native/cuda_util.h +10 -4
  42. warp/native/exports.h +1842 -1908
  43. warp/native/fabric.h +2 -1
  44. warp/native/hashgrid.cpp +37 -37
  45. warp/native/hashgrid.cu +2 -2
  46. warp/native/initializer_array.h +1 -1
  47. warp/native/intersect.h +4 -4
  48. warp/native/mat.h +1913 -119
  49. warp/native/mathdx.cpp +43 -43
  50. warp/native/mesh.cpp +24 -24
  51. warp/native/mesh.cu +26 -26
  52. warp/native/mesh.h +5 -3
  53. warp/native/nanovdb/GridHandle.h +179 -12
  54. warp/native/nanovdb/HostBuffer.h +8 -7
  55. warp/native/nanovdb/NanoVDB.h +517 -895
  56. warp/native/nanovdb/NodeManager.h +323 -0
  57. warp/native/nanovdb/PNanoVDB.h +2 -2
  58. warp/native/quat.h +337 -16
  59. warp/native/rand.h +7 -7
  60. warp/native/range.h +7 -1
  61. warp/native/reduce.cpp +10 -10
  62. warp/native/reduce.cu +13 -14
  63. warp/native/runlength_encode.cpp +2 -2
  64. warp/native/runlength_encode.cu +5 -5
  65. warp/native/scan.cpp +3 -3
  66. warp/native/scan.cu +4 -4
  67. warp/native/sort.cpp +10 -10
  68. warp/native/sort.cu +22 -22
  69. warp/native/sparse.cpp +8 -8
  70. warp/native/sparse.cu +14 -14
  71. warp/native/spatial.h +366 -17
  72. warp/native/svd.h +23 -8
  73. warp/native/temp_buffer.h +2 -2
  74. warp/native/tile.h +303 -70
  75. warp/native/tile_radix_sort.h +5 -1
  76. warp/native/tile_reduce.h +16 -25
  77. warp/native/tuple.h +2 -2
  78. warp/native/vec.h +385 -18
  79. warp/native/volume.cpp +54 -54
  80. warp/native/volume.cu +1 -1
  81. warp/native/volume.h +2 -1
  82. warp/native/volume_builder.cu +30 -37
  83. warp/native/warp.cpp +150 -149
  84. warp/native/warp.cu +337 -193
  85. warp/native/warp.h +227 -226
  86. warp/optim/linear.py +736 -271
  87. warp/render/imgui_manager.py +289 -0
  88. warp/render/render_opengl.py +137 -57
  89. warp/render/render_usd.py +0 -1
  90. warp/sim/collide.py +1 -2
  91. warp/sim/graph_coloring.py +2 -2
  92. warp/sim/integrator_vbd.py +10 -2
  93. warp/sparse.py +559 -176
  94. warp/tape.py +2 -0
  95. warp/tests/aux_test_module_aot.py +7 -0
  96. warp/tests/cuda/test_async.py +3 -3
  97. warp/tests/cuda/test_conditional_captures.py +101 -0
  98. warp/tests/geometry/test_marching_cubes.py +233 -12
  99. warp/tests/sim/test_cloth.py +89 -6
  100. warp/tests/sim/test_coloring.py +82 -7
  101. warp/tests/test_array.py +56 -5
  102. warp/tests/test_assert.py +53 -0
  103. warp/tests/test_atomic_cas.py +127 -114
  104. warp/tests/test_codegen.py +3 -2
  105. warp/tests/test_context.py +8 -15
  106. warp/tests/test_enum.py +136 -0
  107. warp/tests/test_examples.py +2 -2
  108. warp/tests/test_fem.py +45 -2
  109. warp/tests/test_fixedarray.py +229 -0
  110. warp/tests/test_func.py +18 -15
  111. warp/tests/test_future_annotations.py +7 -5
  112. warp/tests/test_linear_solvers.py +30 -0
  113. warp/tests/test_map.py +1 -1
  114. warp/tests/test_mat.py +1540 -378
  115. warp/tests/test_mat_assign_copy.py +178 -0
  116. warp/tests/test_mat_constructors.py +574 -0
  117. warp/tests/test_module_aot.py +287 -0
  118. warp/tests/test_print.py +69 -0
  119. warp/tests/test_quat.py +162 -34
  120. warp/tests/test_quat_assign_copy.py +145 -0
  121. warp/tests/test_reload.py +2 -1
  122. warp/tests/test_sparse.py +103 -0
  123. warp/tests/test_spatial.py +140 -34
  124. warp/tests/test_spatial_assign_copy.py +160 -0
  125. warp/tests/test_static.py +48 -0
  126. warp/tests/test_struct.py +43 -3
  127. warp/tests/test_tape.py +38 -0
  128. warp/tests/test_types.py +0 -20
  129. warp/tests/test_vec.py +216 -441
  130. warp/tests/test_vec_assign_copy.py +143 -0
  131. warp/tests/test_vec_constructors.py +325 -0
  132. warp/tests/tile/test_tile.py +206 -152
  133. warp/tests/tile/test_tile_cholesky.py +605 -0
  134. warp/tests/tile/test_tile_load.py +169 -0
  135. warp/tests/tile/test_tile_mathdx.py +2 -558
  136. warp/tests/tile/test_tile_matmul.py +179 -0
  137. warp/tests/tile/test_tile_mlp.py +1 -1
  138. warp/tests/tile/test_tile_reduce.py +100 -11
  139. warp/tests/tile/test_tile_shared_memory.py +16 -16
  140. warp/tests/tile/test_tile_sort.py +59 -55
  141. warp/tests/unittest_suites.py +16 -0
  142. warp/tests/walkthrough_debug.py +1 -1
  143. warp/thirdparty/unittest_parallel.py +108 -9
  144. warp/types.py +554 -264
  145. warp/utils.py +68 -86
  146. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
  147. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/RECORD +150 -138
  148. warp/native/marching.cpp +0 -19
  149. warp/native/marching.cu +0 -514
  150. warp/native/marching.h +0 -19
  151. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
  152. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
  153. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
warp/native/vec.h CHANGED
@@ -25,7 +25,7 @@ namespace wp
 template<unsigned Length, typename Type>
 struct vec_t
 {
-    Type c[Length];
+    Type c[Length < 1 ? 1 : Length];
 
     inline CUDA_CALLABLE vec_t()
         : c()
@@ -343,6 +343,17 @@ inline CUDA_CALLABLE vec_t<Length, Type> add(vec_t<Length, Type> a, vec_t<Length
     return ret;
 }
 
+template<unsigned Length, typename Type>
+inline CUDA_CALLABLE vec_t<Length, Type> add(Type a, vec_t<Length, Type> b)
+{
+    vec_t<Length, Type> ret;
+    for( unsigned i=0; i < Length; ++i )
+    {
+        ret[i] = a + b[i];
+    }
+    return ret;
+}
+
 template<typename Type>
 inline CUDA_CALLABLE vec_t<2, Type> add(vec_t<2, Type> a, vec_t<2, Type> b)
 {
@@ -367,6 +378,18 @@ inline CUDA_CALLABLE vec_t<Length, Type> sub(vec_t<Length, Type> a, vec_t<Length
     return ret;
 }
 
+template<unsigned Length, typename Type>
+inline CUDA_CALLABLE vec_t<Length, Type> sub(Type a, vec_t<Length, Type> b)
+{
+    vec_t<Length, Type> ret;
+    for (unsigned i=0; i < Length; ++i)
+    {
+        ret[i] = Type(a - b[i]);
+    }
+
+    return ret;
+}
+
 template<typename Type>
 inline CUDA_CALLABLE vec_t<2, Type> sub(vec_t<2, Type> a, vec_t<2, Type> b)
 {
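
The two overloads above give vectors scalar-on-the-left arithmetic, so expressions like `2.0 + v` and `2.0 - v` no longer need the scalar broadcast by hand. A minimal sketch of the kernel-side usage this presumably enables (illustrative, not taken from the package; it assumes Warp 1.9 lowers these expressions to the new add/sub overloads):

    import warp as wp

    @wp.kernel
    def scalar_vec_arithmetic(out: wp.array(dtype=wp.vec3)):
        tid = wp.tid()
        v = wp.vec3(1.0, 2.0, 3.0)
        a = 2.0 + v  # add(Type, vec_t): (3.0, 4.0, 5.0)
        b = 2.0 - v  # sub(Type, vec_t): (1.0, 0.0, -1.0)
        out[tid] = a + b

    wp.init()
    out = wp.zeros(1, dtype=wp.vec3)
    wp.launch(scalar_vec_arithmetic, dim=1, outputs=[out])
    print(out.numpy())  # expected [[4. 4. 4.]]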
@@ -440,27 +463,64 @@ template<unsigned Length, typename Type>
 inline CUDA_CALLABLE Type extract(const vec_t<Length, Type> & a, int idx)
 {
 #ifndef NDEBUG
-    if (idx < 0 || idx >= Length)
+    if (idx < -(int)Length || idx >= (int)Length)
     {
         printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += Length;
+    }
+
     return a[idx];
 }
 
+template<unsigned SliceLength, unsigned Length, typename Type>
+inline CUDA_CALLABLE vec_t<SliceLength, Type> extract(const vec_t<Length, Type> & a, slice_t slice)
+{
+    vec_t<SliceLength, Type> ret;
+
+    assert(slice.start >= 0 && slice.start <= (int)Length);
+    assert(slice.stop >= -1 && slice.stop <= (int)Length);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        ret[ii] = a[i];
+        ++ii;
+    }
+
+    assert(ii == SliceLength);
+    return ret;
+}
+
 template<unsigned Length, typename Type>
 inline CUDA_CALLABLE Type* index(vec_t<Length, Type>& v, int idx)
 {
 #ifndef NDEBUG
-    if (idx < 0 || idx >= Length)
+    if (idx < -(int)Length || idx >= (int)Length)
     {
         printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += Length;
+    }
+
     return &v[idx];
 }
 
@@ -468,13 +528,18 @@ template<unsigned Length, typename Type>
 inline CUDA_CALLABLE Type* indexref(vec_t<Length, Type>* v, int idx)
 {
 #ifndef NDEBUG
-    if (idx < 0 || idx >= Length)
+    if (idx < -(int)Length || idx >= (int)Length)
     {
         printf("vec store %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += Length;
+    }
+
     return &((*v)[idx]);
 }
 
@@ -498,120 +563,325 @@ template<unsigned Length, typename Type>
 inline CUDA_CALLABLE void add_inplace(vec_t<Length, Type>& v, int idx, Type value)
 {
 #ifndef NDEBUG
-    if (idx < 0 || idx >= Length)
+    if (idx < -(int)Length || idx >= (int)Length)
     {
         printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += Length;
+    }
+
     v[idx] += value;
 }
 
 
+template<unsigned SliceLength, unsigned Length, typename Type>
+inline CUDA_CALLABLE void add_inplace(vec_t<Length, Type>& v, slice_t slice, const vec_t<SliceLength, Type> &a)
+{
+    assert(slice.start >= 0 && slice.start <= (int)Length);
+    assert(slice.stop >= -1 && slice.stop <= (int)Length);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        v[i] += a[ii];
+        ++ii;
+    }
+
+    assert(ii == SliceLength);
+}
+
+
 template<unsigned Length, typename Type>
 inline CUDA_CALLABLE void adj_add_inplace(vec_t<Length, Type>& v, int idx, Type value,
                                           vec_t<Length, Type>& adj_v, int adj_idx, Type& adj_value)
 {
 #ifndef NDEBUG
-    if (idx < 0 || idx >= Length)
+    if (idx < -(int)Length || idx >= (int)Length)
     {
         printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += Length;
+    }
+
     adj_value += adj_v[idx];
 }
 
 
+template<unsigned SliceLength, unsigned Length, typename Type>
+inline CUDA_CALLABLE void adj_add_inplace(
+    const vec_t<Length, Type>& v, slice_t slice, const vec_t<SliceLength, Type> &a,
+    vec_t<Length, Type>& adj_v, slice_t& adj_slice, vec_t<SliceLength, Type>& adj_a
+)
+{
+    assert(slice.start >= 0 && slice.start <= (int)Length);
+    assert(slice.stop >= -1 && slice.stop <= (int)Length);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        adj_a[ii] += adj_v[i];
+        ++ii;
+    }
+
+    assert(ii == SliceLength);
+}
+
+
 template<unsigned Length, typename Type>
 inline CUDA_CALLABLE void sub_inplace(vec_t<Length, Type>& v, int idx, Type value)
 {
 #ifndef NDEBUG
-    if (idx < 0 || idx >= Length)
+    if (idx < -(int)Length || idx >= (int)Length)
     {
         printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += Length;
+    }
+
     v[idx] -= value;
 }
 
 
+template<unsigned SliceLength, unsigned Length, typename Type>
+inline CUDA_CALLABLE void sub_inplace(vec_t<Length, Type>& v, slice_t slice, const vec_t<SliceLength, Type> &a)
+{
+    assert(slice.start >= 0 && slice.start <= (int)Length);
+    assert(slice.stop >= -1 && slice.stop <= (int)Length);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        v[i] -= a[ii];
+        ++ii;
+    }
+
+    assert(ii == SliceLength);
+}
+
+
 template<unsigned Length, typename Type>
 inline CUDA_CALLABLE void adj_sub_inplace(vec_t<Length, Type>& v, int idx, Type value,
                                           vec_t<Length, Type>& adj_v, int adj_idx, Type& adj_value)
 {
 #ifndef NDEBUG
-    if (idx < 0 || idx >= Length)
+    if (idx < -(int)Length || idx >= (int)Length)
     {
         printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += Length;
+    }
+
     adj_value -= adj_v[idx];
 }
 
 
+template<unsigned SliceLength, unsigned Length, typename Type>
+inline CUDA_CALLABLE void adj_sub_inplace(
+    const vec_t<Length, Type>& v, slice_t slice, const vec_t<SliceLength, Type> &a,
+    vec_t<Length, Type>& adj_v, slice_t& adj_slice, vec_t<SliceLength, Type>& adj_a
+)
+{
+    assert(slice.start >= 0 && slice.start <= (int)Length);
+    assert(slice.stop >= -1 && slice.stop <= (int)Length);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        adj_a[ii] -= adj_v[i];
+        ++ii;
+    }
+
+    assert(ii == SliceLength);
+}
+
+
 template<unsigned Length, typename Type>
 inline CUDA_CALLABLE void assign_inplace(vec_t<Length, Type>& v, int idx, Type value)
 {
 #ifndef NDEBUG
-    if (idx < 0 || idx >= Length)
+    if (idx < -(int)Length || idx >= (int)Length)
     {
         printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += Length;
+    }
+
     v[idx] = value;
 }
 
+template<unsigned SliceLength, unsigned Length, typename Type>
+inline CUDA_CALLABLE void assign_inplace(vec_t<Length, Type>& v, slice_t slice, const vec_t<SliceLength, Type> &a)
+{
+    assert(slice.start >= 0 && slice.start <= (int)Length);
+    assert(slice.stop >= -1 && slice.stop <= (int)Length);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        v[i] = a[ii];
+        ++ii;
+    }
+
+    assert(ii == SliceLength);
+}
+
 template<unsigned Length, typename Type>
 inline CUDA_CALLABLE void adj_assign_inplace(vec_t<Length, Type>& v, int idx, Type value, vec_t<Length, Type>& adj_v, int& adj_idx, Type& adj_value)
 {
 #ifndef NDEBUG
-    if (idx < 0 || idx >= Length)
+    if (idx < -(int)Length || idx >= (int)Length)
    {
         printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += Length;
+    }
+
     adj_value += adj_v[idx];
 }
 
+template<unsigned SliceLength, unsigned Length, typename Type>
+inline CUDA_CALLABLE void adj_assign_inplace(
+    const vec_t<Length, Type>& v, slice_t slice, const vec_t<SliceLength, Type> &a,
+    vec_t<Length, Type>& adj_v, slice_t& adj_slice, vec_t<SliceLength, Type>& adj_a
+)
+{
+    assert(slice.start >= 0 && slice.start <= (int)Length);
+    assert(slice.stop >= -1 && slice.stop <= (int)Length);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        adj_a[ii] += adj_v[i];
+        ++ii;
+    }
+
+    assert(ii == SliceLength);
+}
+
 
 template<unsigned Length, typename Type>
 inline CUDA_CALLABLE vec_t<Length, Type> assign_copy(vec_t<Length, Type>& v, int idx, Type value)
 {
 #ifndef NDEBUG
-    if (idx < 0 || idx >= Length)
+    if (idx < -(int)Length || idx >= (int)Length)
     {
         printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += Length;
+    }
+
     vec_t<Length, Type> ret(v);
     ret[idx] = value;
     return ret;
 }
 
+template<unsigned SliceLength, unsigned Length, typename Type>
+inline CUDA_CALLABLE vec_t<Length, Type> assign_copy(vec_t<Length, Type>& v, slice_t slice, const vec_t<SliceLength, Type> &a)
+{
+    vec_t<Length, Type> ret(v);
+    assign_inplace<SliceLength>(ret, slice, a);
+    return ret;
+}
+
 template<unsigned Length, typename Type>
 inline CUDA_CALLABLE void adj_assign_copy(vec_t<Length, Type>& v, int idx, Type value, vec_t<Length, Type>& adj_v, int& adj_idx, Type& adj_value, const vec_t<Length, Type>& adj_ret)
 {
 #ifndef NDEBUG
-    if (idx < 0 || idx >= Length)
+    if (idx < -(int)Length || idx >= (int)Length)
     {
         printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += Length;
+    }
+
     adj_value += adj_ret[idx];
     for(unsigned i=0; i < Length; ++i)
     {
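
All three slice overloads above (assign_inplace, add_inplace, sub_inplace) walk the target with i while ii tracks the matching component of the right-hand side, so strided and reversed slices still consume the RHS in order. In kernel terms this presumably backs slice assignment and in-place slice updates on vectors; a hedged, illustrative sketch:

    import warp as wp

    @wp.kernel
    def vec_slice_updates(out: wp.array(dtype=wp.vec4)):
        tid = wp.tid()
        v = wp.vec4(0.0, 0.0, 0.0, 0.0)
        v[0:2] = wp.vec2(1.0, 2.0)   # assign_inplace -> (1.0, 2.0, 0.0, 0.0)
        v[2:4] += wp.vec2(3.0, 4.0)  # add_inplace    -> (1.0, 2.0, 3.0, 4.0)
        v[::2] -= wp.vec2(0.5, 0.5)  # sub_inplace, step 2 -> (0.5, 2.0, 2.5, 4.0)
        out[tid] = v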
@@ -620,6 +890,40 @@ inline CUDA_CALLABLE void adj_assign_copy(vec_t<Length, Type>& v, int idx, Type
     }
 }
 
+template<unsigned SliceLength, unsigned Length, typename Type>
+inline CUDA_CALLABLE void adj_assign_copy(
+    vec_t<Length, Type>& v, slice_t slice, const vec_t<SliceLength, Type> &a,
+    vec_t<Length, Type>& adj_v, slice_t& adj_slice, vec_t<SliceLength, Type>& adj_a,
+    const vec_t<Length, Type>& adj_ret)
+{
+    assert(slice.start >= 0 && slice.start <= (int)Length);
+    assert(slice.stop >= -1 && slice.stop <= (int)Length);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (int i = 0; i < Length; ++i)
+    {
+        bool in_slice = is_reversed
+            ? (i <= slice.start && i > slice.stop && (slice.start - i) % (-slice.step) == 0)
+            : (i >= slice.start && i < slice.stop && (i - slice.start) % slice.step == 0);
+
+        if (!in_slice)
+        {
+            adj_v[i] += adj_ret[i];
+        }
+        else
+        {
+            adj_a[ii] += adj_ret[i];
+            ++ii;
+        }
+    }
+
+    assert(ii == SliceLength);
+}
+
 template<unsigned Length, typename Type>
 inline CUDA_CALLABLE Type length(vec_t<Length, Type> a)
 {
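
Unlike the in-place adjoints, adj_assign_copy must split the adjoint of the returned copy between the untouched source components (routed to adj_v) and the overwritten ones (routed to adj_a), so it tests slice membership per component instead of walking the slice. The membership test restated in plain Python to make the modulo logic concrete (a sketch, not package code):

    def in_slice(i, start, stop, step):
        # mirrors the in_slice test in adj_assign_copy
        if step < 0:
            return stop < i <= start and (start - i) % (-step) == 0
        return start <= i < stop and (i - start) % step == 0

    # the reversed slice 3:-1:-2 over a length-4 vector covers indices 3 and 1
    assert [in_slice(i, 3, -1, -2) for i in range(4)] == [False, True, False, True]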
@@ -969,11 +1273,11 @@ template<unsigned Length, typename Type>
 inline CUDA_CALLABLE void adj_div(Type s, vec_t<Length, Type> a, Type& adj_s, vec_t<Length, Type>& adj_a, const vec_t<Length, Type>& adj_ret)
 {
 
-    adj_s -= dot(a , adj_ret)/ (s * s); // - a / s^2
-
-    for( unsigned i=0; i < Length; ++i )
+    for (unsigned i=0; i < Length; ++i)
     {
-        adj_a[i] += s / adj_ret[i];
+        Type inv = Type(1) / a[i];
+        adj_a[i] -= s * adj_ret[i] * inv * inv;
+        adj_s += adj_ret[i] * inv;
     }
 
 #if FP_CHECK
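
This hunk fixes the adjoint of scalar-by-vector division: for ret_i = s / a_i the correct partials are d(ret_i)/d(a_i) = -s / a_i^2 and d(ret_i)/d(s) = 1 / a_i, which is what the new loop accumulates. The removed lines applied the vector-by-scalar formula to adj_s and divided by the adjoint in adj_a. A quick gradient check through the usual wp.Tape workflow (hedged: it assumes `s / v` in a kernel lowers to this adjoint):

    import warp as wp

    @wp.kernel
    def scalar_div_vec(s: wp.array(dtype=float),
                       v: wp.array(dtype=wp.vec3),
                       out: wp.array(dtype=wp.vec3)):
        tid = wp.tid()
        out[tid] = s[tid] / v[tid]

    wp.init()
    s = wp.array([2.0], dtype=float, requires_grad=True)
    v = wp.array([wp.vec3(1.0, 2.0, 4.0)], dtype=wp.vec3, requires_grad=True)
    out = wp.zeros(1, dtype=wp.vec3, requires_grad=True)

    tape = wp.Tape()
    with tape:
        wp.launch(scalar_div_vec, dim=1, inputs=[s, v], outputs=[out])
    tape.backward(grads={out: wp.array([wp.vec3(1.0, 1.0, 1.0)], dtype=wp.vec3)})

    print(v.grad.numpy())  # -s / v**2 -> [[-2.0, -0.5, -0.125]]
    print(s.grad.numpy())  # sum(1 / v) = 1.0 + 0.5 + 0.25 -> [1.75]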
@@ -999,6 +1303,21 @@ inline CUDA_CALLABLE void adj_add(vec_t<Length, Type> a, vec_t<Length, Type> b,
     adj_b += adj_ret;
 }
 
+template<unsigned Length, typename Type>
+inline CUDA_CALLABLE void adj_add(
+    Type a, vec_t<Length, Type> b,
+    Type& adj_a, vec_t<Length, Type>& adj_b,
+    const vec_t<Length, Type>& adj_ret
+)
+{
+    for (unsigned i = 0; i < Length; ++i)
+    {
+        adj_a += adj_ret.c[i];
+    }
+
+    adj_b += adj_ret;
+}
+
 template<typename Type>
 inline CUDA_CALLABLE void adj_add(vec_t<2, Type> a, vec_t<2, Type> b, vec_t<2, Type>& adj_a, vec_t<2, Type>& adj_b, const vec_t<2, Type>& adj_ret)
 {
@@ -1026,6 +1345,21 @@ inline CUDA_CALLABLE void adj_sub(vec_t<Length, Type> a, vec_t<Length, Type> b,
     adj_b -= adj_ret;
 }
 
+template<unsigned Length, typename Type>
+inline CUDA_CALLABLE void adj_sub(
+    Type a, vec_t<Length, Type> b,
+    Type& adj_a, vec_t<Length, Type>& adj_b,
+    const vec_t<Length, Type>& adj_ret
+)
+{
+    for (unsigned i = 0; i < Length; ++i)
+    {
+        adj_a += adj_ret.c[i];
+    }
+
+    adj_b -= adj_ret;
+}
+
 template<typename Type>
 inline CUDA_CALLABLE void adj_sub(vec_t<2, Type> a, vec_t<2, Type> b, vec_t<2, Type>& adj_a, vec_t<2, Type>& adj_b, const vec_t<2, Type>& adj_ret)
 {
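
In both new adjoints the scalar operand was broadcast across every component in the forward pass, so the reverse pass reduces: adj_a accumulates the sum of all components of adj_ret, while adj_b receives adj_ret with the sign matching add or sub. A tiny numeric restatement of that reduction (plain Python):

    adj_ret = [0.5, -1.0, 2.0]         # incoming adjoint of a vec3 result
    adj_a = sum(adj_ret)               # scalar operand sums all lanes -> 1.5
    adj_b_add = adj_ret                # adj_add: adj_b += adj_ret
    adj_b_sub = [-g for g in adj_ret]  # adj_sub: adj_b -= adj_ret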
@@ -1106,16 +1440,49 @@ template<unsigned Length, typename Type>
 inline CUDA_CALLABLE void adj_extract(const vec_t<Length, Type> & a, int idx, vec_t<Length, Type> & adj_a, int & adj_idx, Type & adj_ret)
 {
 #ifndef NDEBUG
-    if (idx < 0 || idx > Length)
+    if (idx < -(int)Length || idx >= (int)Length)
     {
-        printf("Tvec2<Scalar> index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += Length;
+    }
+
     adj_a[idx] += adj_ret;
 }
 
+template<unsigned SliceLength, unsigned Length, typename Type>
+inline CUDA_CALLABLE void adj_extract(
+    const vec_t<Length, Type>& a, slice_t slice,
+    vec_t<Length, Type>& adj_a, slice_t& adj_slice,
+    const vec_t<SliceLength, Type>& adj_ret
+)
+{
+    assert(slice.start >= 0 && slice.start <= (int)Length);
+    assert(slice.stop >= -1 && slice.stop <= (int)Length);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        adj_a[i] += adj_ret[ii];
+        ++ii;
+    }
+
+    assert(ii == SliceLength);
+}
+
 template<unsigned Length, typename Type>
 inline CUDA_CALLABLE void adj_length(vec_t<Length, Type> a, Type ret, vec_t<Length, Type>& adj_a, const Type adj_ret)
 {
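
The slice overload of adj_extract mirrors the forward walk: the adjoint of each extracted component is scattered back to the source position it was read from, leaving unsliced lanes at zero. A short sketch of gradients flowing through a slice read, under the same assumptions as the earlier examples:

    import warp as wp

    @wp.kernel
    def slice_read_loss(v: wp.array(dtype=wp.vec4), loss: wp.array(dtype=float)):
        tid = wp.tid()
        u = v[tid]
        head = u[0:2]  # extract(a, slice) -> wp.vec2
        wp.atomic_add(loss, 0, head[0] + head[1])

    wp.init()
    v = wp.array([wp.vec4(1.0, 2.0, 3.0, 4.0)], dtype=wp.vec4, requires_grad=True)
    loss = wp.zeros(1, dtype=float, requires_grad=True)

    tape = wp.Tape()
    with tape:
        wp.launch(slice_read_loss, dim=1, inputs=[v], outputs=[loss])
    tape.backward(loss=loss)

    print(v.grad.numpy())  # expected [[1. 1. 0. 0.]]: only the sliced lanes receive gradient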