warp-lang 1.4.1__py3-none-manylinux2014_x86_64.whl → 1.5.0__py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (164)
  1. warp/__init__.py +4 -0
  2. warp/autograd.py +43 -8
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +21 -2
  6. warp/build_dll.py +23 -6
  7. warp/builtins.py +1920 -111
  8. warp/codegen.py +186 -62
  9. warp/config.py +2 -2
  10. warp/context.py +322 -73
  11. warp/examples/assets/pixel.jpg +0 -0
  12. warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
  13. warp/examples/benchmarks/benchmark_gemm.py +121 -0
  14. warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
  15. warp/examples/benchmarks/benchmark_tile.py +179 -0
  16. warp/examples/core/example_dem.py +2 -1
  17. warp/examples/core/example_mesh_intersect.py +3 -3
  18. warp/examples/fem/example_adaptive_grid.py +37 -10
  19. warp/examples/fem/example_apic_fluid.py +3 -2
  20. warp/examples/fem/example_convection_diffusion_dg.py +4 -5
  21. warp/examples/fem/example_deformed_geometry.py +1 -1
  22. warp/examples/fem/example_diffusion_3d.py +47 -4
  23. warp/examples/fem/example_distortion_energy.py +220 -0
  24. warp/examples/fem/example_magnetostatics.py +127 -85
  25. warp/examples/fem/example_nonconforming_contact.py +5 -5
  26. warp/examples/fem/example_stokes.py +3 -1
  27. warp/examples/fem/example_streamlines.py +12 -19
  28. warp/examples/fem/utils.py +38 -15
  29. warp/examples/optim/example_walker.py +2 -2
  30. warp/examples/sim/example_cloth.py +2 -25
  31. warp/examples/sim/example_jacobian_ik.py +6 -2
  32. warp/examples/sim/example_quadruped.py +2 -1
  33. warp/examples/tile/example_tile_convolution.py +58 -0
  34. warp/examples/tile/example_tile_fft.py +47 -0
  35. warp/examples/tile/example_tile_filtering.py +105 -0
  36. warp/examples/tile/example_tile_matmul.py +79 -0
  37. warp/examples/tile/example_tile_mlp.py +375 -0
  38. warp/fem/__init__.py +8 -0
  39. warp/fem/cache.py +16 -12
  40. warp/fem/dirichlet.py +1 -1
  41. warp/fem/domain.py +44 -1
  42. warp/fem/field/__init__.py +1 -2
  43. warp/fem/field/field.py +31 -19
  44. warp/fem/field/nodal_field.py +101 -49
  45. warp/fem/field/virtual.py +794 -0
  46. warp/fem/geometry/__init__.py +2 -2
  47. warp/fem/geometry/deformed_geometry.py +3 -105
  48. warp/fem/geometry/element.py +13 -0
  49. warp/fem/geometry/geometry.py +165 -5
  50. warp/fem/geometry/grid_2d.py +3 -6
  51. warp/fem/geometry/grid_3d.py +31 -28
  52. warp/fem/geometry/hexmesh.py +3 -46
  53. warp/fem/geometry/nanogrid.py +3 -2
  54. warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
  55. warp/fem/geometry/tetmesh.py +2 -43
  56. warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
  57. warp/fem/integrate.py +683 -261
  58. warp/fem/linalg.py +404 -0
  59. warp/fem/operator.py +101 -18
  60. warp/fem/polynomial.py +5 -5
  61. warp/fem/quadrature/quadrature.py +45 -21
  62. warp/fem/space/__init__.py +45 -11
  63. warp/fem/space/basis_function_space.py +451 -0
  64. warp/fem/space/basis_space.py +58 -11
  65. warp/fem/space/function_space.py +146 -5
  66. warp/fem/space/grid_2d_function_space.py +80 -66
  67. warp/fem/space/grid_3d_function_space.py +113 -68
  68. warp/fem/space/hexmesh_function_space.py +96 -108
  69. warp/fem/space/nanogrid_function_space.py +62 -110
  70. warp/fem/space/quadmesh_function_space.py +208 -0
  71. warp/fem/space/shape/__init__.py +45 -7
  72. warp/fem/space/shape/cube_shape_function.py +328 -54
  73. warp/fem/space/shape/shape_function.py +10 -1
  74. warp/fem/space/shape/square_shape_function.py +328 -60
  75. warp/fem/space/shape/tet_shape_function.py +269 -19
  76. warp/fem/space/shape/triangle_shape_function.py +238 -19
  77. warp/fem/space/tetmesh_function_space.py +69 -37
  78. warp/fem/space/topology.py +38 -0
  79. warp/fem/space/trimesh_function_space.py +179 -0
  80. warp/fem/utils.py +6 -331
  81. warp/jax_experimental.py +3 -1
  82. warp/native/array.h +55 -40
  83. warp/native/builtin.h +124 -43
  84. warp/native/bvh.h +4 -0
  85. warp/native/coloring.cpp +600 -0
  86. warp/native/cuda_util.cpp +14 -0
  87. warp/native/cuda_util.h +2 -1
  88. warp/native/fabric.h +8 -0
  89. warp/native/hashgrid.h +4 -0
  90. warp/native/marching.cu +8 -0
  91. warp/native/mat.h +14 -3
  92. warp/native/mathdx.cpp +59 -0
  93. warp/native/mesh.h +4 -0
  94. warp/native/range.h +13 -1
  95. warp/native/reduce.cpp +9 -1
  96. warp/native/reduce.cu +7 -0
  97. warp/native/runlength_encode.cpp +9 -1
  98. warp/native/runlength_encode.cu +7 -1
  99. warp/native/scan.cpp +8 -0
  100. warp/native/scan.cu +8 -0
  101. warp/native/scan.h +8 -1
  102. warp/native/sparse.cpp +8 -0
  103. warp/native/sparse.cu +8 -0
  104. warp/native/temp_buffer.h +7 -0
  105. warp/native/tile.h +1857 -0
  106. warp/native/tile_gemm.h +341 -0
  107. warp/native/tile_reduce.h +210 -0
  108. warp/native/volume_builder.cu +8 -0
  109. warp/native/volume_builder.h +8 -0
  110. warp/native/warp.cpp +10 -2
  111. warp/native/warp.cu +369 -15
  112. warp/native/warp.h +12 -2
  113. warp/optim/adam.py +39 -4
  114. warp/paddle.py +29 -12
  115. warp/render/render_opengl.py +137 -65
  116. warp/sim/graph_coloring.py +292 -0
  117. warp/sim/integrator_euler.py +4 -2
  118. warp/sim/integrator_featherstone.py +115 -44
  119. warp/sim/integrator_vbd.py +6 -0
  120. warp/sim/model.py +90 -17
  121. warp/stubs.py +651 -85
  122. warp/tape.py +12 -7
  123. warp/tests/assets/pixel.npy +0 -0
  124. warp/tests/aux_test_instancing_gc.py +18 -0
  125. warp/tests/test_array.py +207 -48
  126. warp/tests/test_closest_point_edge_edge.py +8 -8
  127. warp/tests/test_codegen.py +120 -1
  128. warp/tests/test_codegen_instancing.py +30 -0
  129. warp/tests/test_collision.py +110 -0
  130. warp/tests/test_coloring.py +241 -0
  131. warp/tests/test_context.py +34 -0
  132. warp/tests/test_examples.py +18 -4
  133. warp/tests/test_fabricarray.py +33 -0
  134. warp/tests/test_fem.py +453 -113
  135. warp/tests/test_func.py +48 -1
  136. warp/tests/test_generics.py +52 -0
  137. warp/tests/test_iter.py +68 -0
  138. warp/tests/test_mat_scalar_ops.py +1 -1
  139. warp/tests/test_mesh_query_point.py +5 -4
  140. warp/tests/test_module_hashing.py +23 -0
  141. warp/tests/test_paddle.py +27 -87
  142. warp/tests/test_print.py +191 -1
  143. warp/tests/test_spatial.py +1 -1
  144. warp/tests/test_tile.py +700 -0
  145. warp/tests/test_tile_mathdx.py +144 -0
  146. warp/tests/test_tile_mlp.py +383 -0
  147. warp/tests/test_tile_reduce.py +374 -0
  148. warp/tests/test_tile_shared_memory.py +190 -0
  149. warp/tests/test_vbd.py +12 -20
  150. warp/tests/test_volume.py +43 -0
  151. warp/tests/unittest_suites.py +23 -2
  152. warp/tests/unittest_utils.py +4 -0
  153. warp/types.py +339 -73
  154. warp/utils.py +22 -1
  155. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/METADATA +33 -7
  156. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/RECORD +159 -132
  157. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/WHEEL +1 -1
  158. warp/fem/field/test.py +0 -180
  159. warp/fem/field/trial.py +0 -183
  160. warp/fem/space/collocated_function_space.py +0 -102
  161. warp/fem/space/quadmesh_2d_function_space.py +0 -261
  162. warp/fem/space/trimesh_2d_function_space.py +0 -153
  163. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/LICENSE.md +0 -0
  164. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/top_level.txt +0 -0
warp/native/array.h CHANGED
@@ -1,3 +1,11 @@
1
+ /** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
2
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ * and proprietary rights in and to this software, related documentation
4
+ * and any modifications thereto. Any use, reproduction, disclosure or
5
+ * distribution of this software and related documentation without an express
6
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+ */
8
+
1
9
  #pragma once
2
10
 
3
11
  #include "builtin.h"
@@ -285,6 +293,13 @@ CUDA_CALLABLE inline size_t byte_offset(const array_t<T>& arr, int i)
285
293
  template <typename T>
286
294
  CUDA_CALLABLE inline size_t byte_offset(const array_t<T>& arr, int i, int j)
287
295
  {
296
+ // if (i < 0 || i >= arr.shape[0])
297
+ // printf("i: %d > arr.shape[0]: %d\n", i, arr.shape[0]);
298
+
299
+ // if (j < 0 || j >= arr.shape[1])
300
+ // printf("j: %d > arr.shape[1]: %d\n", j, arr.shape[1]);
301
+
302
+
288
303
  assert(i >= 0 && i < arr.shape[0]);
289
304
  assert(j >= 0 && j < arr.shape[1]);
290
305
 
@@ -811,7 +826,7 @@ CUDA_CALLABLE inline void adj_atomic_add(bool* buf, bool value) { }
811
826
 
812
827
  // only generate gradients for T types
813
828
  template<typename T>
814
- inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, const array_t<T>& adj_buf, int& adj_i, const T& adj_output)
829
+ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, const array_t<T>& adj_buf, int adj_i, const T& adj_output)
815
830
  {
816
831
  if (adj_buf.data)
817
832
  adj_atomic_add(&index(adj_buf, i), adj_output);
@@ -819,7 +834,7 @@ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, const array_
819
834
  adj_atomic_add(&index_grad(buf, i), adj_output);
820
835
  }
821
836
  template<typename T>
822
- inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, const array_t<T>& adj_buf, int& adj_i, int& adj_j, const T& adj_output)
837
+ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, const array_t<T>& adj_buf, int adj_i, int adj_j, const T& adj_output)
823
838
  {
824
839
  if (adj_buf.data)
825
840
  adj_atomic_add(&index(adj_buf, i, j), adj_output);
@@ -827,7 +842,7 @@ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, const
827
842
  adj_atomic_add(&index_grad(buf, i, j), adj_output);
828
843
  }
829
844
  template<typename T>
830
- inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, const T& adj_output)
845
+ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, const T& adj_output)
831
846
  {
832
847
  if (adj_buf.data)
833
848
  adj_atomic_add(&index(adj_buf, i, j, k), adj_output);
@@ -835,7 +850,7 @@ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k
835
850
  adj_atomic_add(&index_grad(buf, i, j, k), adj_output);
836
851
  }
837
852
  template<typename T>
838
- inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k, int l, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, const T& adj_output)
853
+ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k, int l, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, const T& adj_output)
839
854
  {
840
855
  if (adj_buf.data)
841
856
  adj_atomic_add(&index(adj_buf, i, j, k, l), adj_output);
@@ -844,7 +859,7 @@ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k
844
859
  }
845
860
 
846
861
  template<typename T>
847
- inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int& adj_i, T& adj_value)
862
+ inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int adj_i, T& adj_value)
848
863
  {
849
864
  if (adj_buf.data)
850
865
  adj_value += index(adj_buf, i);
@@ -854,7 +869,7 @@ inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, T value,
854
869
  FP_VERIFY_ADJ_1(value, adj_value)
855
870
  }
856
871
  template<typename T>
857
- inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value)
872
+ inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, T& adj_value)
858
873
  {
859
874
  if (adj_buf.data)
860
875
  adj_value += index(adj_buf, i, j);
@@ -864,7 +879,7 @@ inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, T
864
879
  FP_VERIFY_ADJ_2(value, adj_value)
865
880
  }
866
881
  template<typename T>
867
- inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value)
882
+ inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value)
868
883
  {
869
884
  if (adj_buf.data)
870
885
  adj_value += index(adj_buf, i, j, k);
@@ -874,7 +889,7 @@ inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, i
874
889
  FP_VERIFY_ADJ_3(value, adj_value)
875
890
  }
876
891
  template<typename T>
877
- inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value)
892
+ inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value)
878
893
  {
879
894
  if (adj_buf.data)
880
895
  adj_value += index(adj_buf, i, j, k, l);
@@ -898,7 +913,7 @@ inline CUDA_CALLABLE void adj_load(const T* address, const T& adj_address, T& ad
898
913
  }
899
914
 
900
915
  template<typename T>
901
- inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret)
916
+ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret)
902
917
  {
903
918
  if (adj_buf.data)
904
919
  adj_value += index(adj_buf, i);
@@ -908,7 +923,7 @@ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, T value,
908
923
  FP_VERIFY_ADJ_1(value, adj_value)
909
924
  }
910
925
  template<typename T>
911
- inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret)
926
+ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret)
912
927
  {
913
928
  if (adj_buf.data)
914
929
  adj_value += index(adj_buf, i, j);
@@ -918,7 +933,7 @@ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, T
918
933
  FP_VERIFY_ADJ_2(value, adj_value)
919
934
  }
920
935
  template<typename T>
921
- inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret)
936
+ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret)
922
937
  {
923
938
  if (adj_buf.data)
924
939
  adj_value += index(adj_buf, i, j, k);
@@ -928,7 +943,7 @@ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, in
928
943
  FP_VERIFY_ADJ_3(value, adj_value)
929
944
  }
930
945
  template<typename T>
931
- inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret)
946
+ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret)
932
947
  {
933
948
  if (adj_buf.data)
934
949
  adj_value += index(adj_buf, i, j, k, l);
@@ -939,7 +954,7 @@ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, in
939
954
  }
940
955
 
941
956
  template<typename T>
942
- inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret)
957
+ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret)
943
958
  {
944
959
  if (adj_buf.data)
945
960
  adj_value -= index(adj_buf, i);
@@ -949,7 +964,7 @@ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, T value,
949
964
  FP_VERIFY_ADJ_1(value, adj_value)
950
965
  }
951
966
  template<typename T>
952
- inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret)
967
+ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret)
953
968
  {
954
969
  if (adj_buf.data)
955
970
  adj_value -= index(adj_buf, i, j);
@@ -959,7 +974,7 @@ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, T
959
974
  FP_VERIFY_ADJ_2(value, adj_value)
960
975
  }
961
976
  template<typename T>
962
- inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret)
977
+ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret)
963
978
  {
964
979
  if (adj_buf.data)
965
980
  adj_value -= index(adj_buf, i, j, k);
@@ -969,7 +984,7 @@ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, in
969
984
  FP_VERIFY_ADJ_3(value, adj_value)
970
985
  }
971
986
  template<typename T>
972
- inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret)
987
+ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret)
973
988
  {
974
989
  if (adj_buf.data)
975
990
  adj_value -= index(adj_buf, i, j, k, l);
@@ -981,44 +996,44 @@ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, in
981
996
 
982
997
  // generic array types that do not support gradient computation (indexedarray, etc.)
983
998
  template<template<typename> class A1, template<typename> class A2, typename T>
984
- inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, const A2<T>& adj_buf, int& adj_i, const T& adj_output) {}
999
+ inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, const A2<T>& adj_buf, int adj_i, const T& adj_output) {}
985
1000
  template<template<typename> class A1, template<typename> class A2, typename T>
986
- inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, const A2<T>& adj_buf, int& adj_i, int& adj_j, const T& adj_output) {}
1001
+ inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, const A2<T>& adj_buf, int adj_i, int adj_j, const T& adj_output) {}
987
1002
  template<template<typename> class A1, template<typename> class A2, typename T>
988
- inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, int k, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, const T& adj_output) {}
1003
+ inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, int k, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, const T& adj_output) {}
989
1004
  template<template<typename> class A1, template<typename> class A2, typename T>
990
- inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, int k, int l, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, const T& adj_output) {}
1005
+ inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, int k, int l, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, const T& adj_output) {}
991
1006
 
992
1007
  template<template<typename> class A1, template<typename> class A2, typename T>
993
- inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value) {}
1008
+ inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value) {}
994
1009
  template<template<typename> class A1, template<typename> class A2, typename T>
995
- inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value) {}
1010
+ inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value) {}
996
1011
  template<template<typename> class A1, template<typename> class A2, typename T>
997
- inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value) {}
1012
+ inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value) {}
998
1013
  template<template<typename> class A1, template<typename> class A2, typename T>
999
- inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value) {}
1014
+ inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value) {}
1000
1015
 
1001
1016
  template<template<typename> class A1, template<typename> class A2, typename T>
1002
- inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) {}
1017
+ inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {}
1003
1018
  template<template<typename> class A1, template<typename> class A2, typename T>
1004
- inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) {}
1019
+ inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {}
1005
1020
  template<template<typename> class A1, template<typename> class A2, typename T>
1006
- inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) {}
1021
+ inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {}
1007
1022
  template<template<typename> class A1, template<typename> class A2, typename T>
1008
- inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) {}
1023
+ inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {}
1009
1024
 
1010
1025
  template<template<typename> class A1, template<typename> class A2, typename T>
1011
- inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) {}
1026
+ inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {}
1012
1027
  template<template<typename> class A1, template<typename> class A2, typename T>
1013
- inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) {}
1028
+ inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {}
1014
1029
  template<template<typename> class A1, template<typename> class A2, typename T>
1015
- inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) {}
1030
+ inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {}
1016
1031
  template<template<typename> class A1, template<typename> class A2, typename T>
1017
- inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) {}
1032
+ inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {}
1018
1033
 
1019
1034
  // generic handler for scalar values
1020
1035
  template<template<typename> class A1, template<typename> class A2, typename T>
1021
- inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) {
1036
+ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {
1022
1037
  if (adj_buf.data)
1023
1038
  adj_atomic_minmax(&index(buf, i), &index(adj_buf, i), value, adj_value);
1024
1039
  else if (buf.grad)
@@ -1027,7 +1042,7 @@ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, T value, const
1027
1042
  FP_VERIFY_ADJ_1(value, adj_value)
1028
1043
  }
1029
1044
  template<template<typename> class A1, template<typename> class A2, typename T>
1030
- inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) {
1045
+ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {
1031
1046
  if (adj_buf.data)
1032
1047
  adj_atomic_minmax(&index(buf, i, j), &index(adj_buf, i, j), value, adj_value);
1033
1048
  else if (buf.grad)
@@ -1036,7 +1051,7 @@ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, T value
1036
1051
  FP_VERIFY_ADJ_2(value, adj_value)
1037
1052
  }
1038
1053
  template<template<typename> class A1, template<typename> class A2, typename T>
1039
- inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) {
1054
+ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {
1040
1055
  if (adj_buf.data)
1041
1056
  adj_atomic_minmax(&index(buf, i, j, k), &index(adj_buf, i, j, k), value, adj_value);
1042
1057
  else if (buf.grad)
@@ -1045,7 +1060,7 @@ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k,
1045
1060
  FP_VERIFY_ADJ_3(value, adj_value)
1046
1061
  }
1047
1062
  template<template<typename> class A1, template<typename> class A2, typename T>
1048
- inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) {
1063
+ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {
1049
1064
  if (adj_buf.data)
1050
1065
  adj_atomic_minmax(&index(buf, i, j, k, l), &index(adj_buf, i, j, k, l), value, adj_value);
1051
1066
  else if (buf.grad)
@@ -1055,7 +1070,7 @@ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k,
1055
1070
  }
1056
1071
 
1057
1072
  template<template<typename> class A1, template<typename> class A2, typename T>
1058
- inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) {
1073
+ inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {
1059
1074
  if (adj_buf.data)
1060
1075
  adj_atomic_minmax(&index(buf, i), &index(adj_buf, i), value, adj_value);
1061
1076
  else if (buf.grad)
@@ -1064,7 +1079,7 @@ inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, T value, const
1064
1079
  FP_VERIFY_ADJ_1(value, adj_value)
1065
1080
  }
1066
1081
  template<template<typename> class A1, template<typename> class A2, typename T>
1067
- inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) {
1082
+ inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {
1068
1083
  if (adj_buf.data)
1069
1084
  adj_atomic_minmax(&index(buf, i, j), &index(adj_buf, i, j), value, adj_value);
1070
1085
  else if (buf.grad)
@@ -1073,7 +1088,7 @@ inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, T value
1073
1088
  FP_VERIFY_ADJ_2(value, adj_value)
1074
1089
  }
1075
1090
  template<template<typename> class A1, template<typename> class A2, typename T>
1076
- inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) {
1091
+ inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {
1077
1092
  if (adj_buf.data)
1078
1093
  adj_atomic_minmax(&index(buf, i, j, k), &index(adj_buf, i, j, k), value, adj_value);
1079
1094
  else if (buf.grad)
@@ -1082,7 +1097,7 @@ inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k,
1082
1097
  FP_VERIFY_ADJ_3(value, adj_value)
1083
1098
  }
1084
1099
  template<template<typename> class A1, template<typename> class A2, typename T>
1085
- inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) {
1100
+ inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {
1086
1101
  if (adj_buf.data)
1087
1102
  adj_atomic_minmax(&index(buf, i, j, k, l), &index(adj_buf, i, j, k, l), value, adj_value);
1088
1103
  else if (buf.grad)
warp/native/builtin.h CHANGED
@@ -1145,7 +1145,47 @@ struct launch_bounds_t
1145
1145
  size_t size; // total number of threads
1146
1146
  };
1147
1147
 
1148
- inline CUDA_CALLABLE int tid(size_t index)
1148
+ // represents coordinate in the launch grid
1149
+ struct launch_coord_t
1150
+ {
1151
+ int i;
1152
+ int j;
1153
+ int k;
1154
+ int l;
1155
+ };
1156
+
1157
+ // unravels a linear thread index to the corresponding launch grid coord (up to 4d)
1158
+ inline CUDA_CALLABLE launch_coord_t launch_coord(size_t linear, const launch_bounds_t& bounds)
1159
+ {
1160
+ launch_coord_t coord = {0, 0, 0, 0};
1161
+
1162
+ if (bounds.ndim > 3)
1163
+ {
1164
+ coord.l = linear%bounds.shape[3];
1165
+ linear /= bounds.shape[3];
1166
+ }
1167
+
1168
+ if (bounds.ndim > 2)
1169
+ {
1170
+ coord.k = linear%bounds.shape[2];
1171
+ linear /= bounds.shape[2];
1172
+ }
1173
+
1174
+ if (bounds.ndim > 1)
1175
+ {
1176
+ coord.j = linear%bounds.shape[1];
1177
+ linear /= bounds.shape[1];
1178
+ }
1179
+
1180
+ if (bounds.ndim > 0)
1181
+ {
1182
+ coord.i = linear;
1183
+ }
1184
+
1185
+ return coord;
1186
+ }
1187
+
1188
+ inline CUDA_CALLABLE int tid(size_t index, const launch_bounds_t& bounds)
1149
1189
  {
1150
1190
  // For the 1-D tid() we need to warn the user if we're about to provide a truncated index
1151
1191
  // Only do this in _DEBUG when called from device to avoid excessive register allocation
@@ -1154,40 +1194,33 @@ inline CUDA_CALLABLE int tid(size_t index)
1154
1194
  printf("Warp warning: tid() is returning an overflowed int\n");
1155
1195
  }
1156
1196
  #endif
1157
- return static_cast<int>(index);
1197
+
1198
+ launch_coord_t c = launch_coord(index, bounds);
1199
+ return static_cast<int>(c.i);
1158
1200
  }
1159
1201
 
1160
- inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, size_t index, const launch_bounds_t& launch_bounds)
1202
+ inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, size_t index, const launch_bounds_t& bounds)
1161
1203
  {
1162
- const size_t n = launch_bounds.shape[1];
1163
-
1164
- // convert to work item
1165
- i = index/n;
1166
- j = index%n;
1204
+ launch_coord_t c = launch_coord(index, bounds);
1205
+ i = c.i;
1206
+ j = c.j;
1167
1207
  }
1168
1208
 
1169
- inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, size_t index, const launch_bounds_t& launch_bounds)
1209
+ inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, size_t index, const launch_bounds_t& bounds)
1170
1210
  {
1171
- const size_t n = launch_bounds.shape[1];
1172
- const size_t o = launch_bounds.shape[2];
1173
-
1174
- // convert to work item
1175
- i = index/(n*o);
1176
- j = index%(n*o)/o;
1177
- k = index%o;
1211
+ launch_coord_t c = launch_coord(index, bounds);
1212
+ i = c.i;
1213
+ j = c.j;
1214
+ k = c.k;
1178
1215
  }
1179
1216
 
1180
- inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, int& l, size_t index, const launch_bounds_t& launch_bounds)
1217
+ inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, int& l, size_t index, const launch_bounds_t& bounds)
1181
1218
  {
1182
- const size_t n = launch_bounds.shape[1];
1183
- const size_t o = launch_bounds.shape[2];
1184
- const size_t p = launch_bounds.shape[3];
1185
-
1186
- // convert to work item
1187
- i = index/(n*o*p);
1188
- j = index%(n*o*p)/(o*p);
1189
- k = index%(o*p)/p;
1190
- l = index%p;
1219
+ launch_coord_t c = launch_coord(index, bounds);
1220
+ i = c.i;
1221
+ j = c.j;
1222
+ k = c.k;
1223
+ l = c.l;
1191
1224
  }
1192
1225
 
1193
1226
  template<typename T>
@@ -1575,32 +1608,73 @@ inline CUDA_CALLABLE void print(transform_t<Type> t)
1575
1608
  printf("(%g %g %g) (%g %g %g %g)\n", float(t.p[0]), float(t.p[1]), float(t.p[2]), float(t.q.x), float(t.q.y), float(t.q.z), float(t.q.w));
1576
1609
  }
1577
1610
 
1578
- inline CUDA_CALLABLE void adj_print(int i, int adj_i) { printf("%d adj: %d\n", i, adj_i); }
1579
- inline CUDA_CALLABLE void adj_print(float f, float adj_f) { printf("%g adj: %g\n", f, adj_f); }
1580
- inline CUDA_CALLABLE void adj_print(short f, short adj_f) { printf("%hd adj: %hd\n", f, adj_f); }
1581
- inline CUDA_CALLABLE void adj_print(long f, long adj_f) { printf("%ld adj: %ld\n", f, adj_f); }
1582
- inline CUDA_CALLABLE void adj_print(long long f, long long adj_f) { printf("%lld adj: %lld\n", f, adj_f); }
1583
- inline CUDA_CALLABLE void adj_print(unsigned f, unsigned adj_f) { printf("%u adj: %u\n", f, adj_f); }
1584
- inline CUDA_CALLABLE void adj_print(unsigned short f, unsigned short adj_f) { printf("%hu adj: %hu\n", f, adj_f); }
1585
- inline CUDA_CALLABLE void adj_print(unsigned long f, unsigned long adj_f) { printf("%lu adj: %lu\n", f, adj_f); }
1586
- inline CUDA_CALLABLE void adj_print(unsigned long long f, unsigned long long adj_f) { printf("%llu adj: %llu\n", f, adj_f); }
1587
- inline CUDA_CALLABLE void adj_print(half h, half adj_h) { printf("%g adj: %g\n", half_to_float(h), half_to_float(adj_h)); }
1588
- inline CUDA_CALLABLE void adj_print(double f, double adj_f) { printf("%g adj: %g\n", f, adj_f); }
1611
+ template<typename T>
1612
+ inline CUDA_CALLABLE void adj_print(const T& x, const T& adj_x)
1613
+ {
1614
+ printf("adj: <type without print implementation>\n");
1615
+ }
1616
+
1617
+ // note: adj_print() only prints the adjoint value, since the value itself gets printed in replay print()
1618
+ inline CUDA_CALLABLE void adj_print(half x, half adj_x) { printf("adj: %g\n", half_to_float(adj_x)); }
1619
+ inline CUDA_CALLABLE void adj_print(float x, float adj_x) { printf("adj: %g\n", adj_x); }
1620
+ inline CUDA_CALLABLE void adj_print(double x, double adj_x) { printf("adj: %g\n", adj_x); }
1621
+
1622
+ inline CUDA_CALLABLE void adj_print(signed char x, signed char adj_x) { printf("adj: %d\n", adj_x); }
1623
+ inline CUDA_CALLABLE void adj_print(short x, short adj_x) { printf("adj: %d\n", adj_x); }
1624
+ inline CUDA_CALLABLE void adj_print(int x, int adj_x) { printf("adj: %d\n", adj_x); }
1625
+ inline CUDA_CALLABLE void adj_print(long x, long adj_x) { printf("adj: %ld\n", adj_x); }
1626
+ inline CUDA_CALLABLE void adj_print(long long x, long long adj_x) { printf("adj: %lld\n", adj_x); }
1627
+
1628
+ inline CUDA_CALLABLE void adj_print(unsigned char x, unsigned char adj_x) { printf("adj: %u\n", adj_x); }
1629
+ inline CUDA_CALLABLE void adj_print(unsigned short x, unsigned short adj_x) { printf("adj: %u\n", adj_x); }
1630
+ inline CUDA_CALLABLE void adj_print(unsigned x, unsigned adj_x) { printf("adj: %u\n", adj_x); }
1631
+ inline CUDA_CALLABLE void adj_print(unsigned long x, unsigned long adj_x) { printf("adj: %lu\n", adj_x); }
1632
+ inline CUDA_CALLABLE void adj_print(unsigned long long x, unsigned long long adj_x) { printf("adj: %llu\n", adj_x); }
1633
+
1634
+ inline CUDA_CALLABLE void adj_print(bool x, bool adj_x) { printf("adj: %s\n", (adj_x ? "True" : "False")); }
1589
1635
 
1590
1636
  template<unsigned Length, typename Type>
1591
- inline CUDA_CALLABLE void adj_print(vec_t<Length, Type> v, vec_t<Length, Type>& adj_v) { printf("%g %g adj: %g %g \n", v[0], v[1], adj_v[0], adj_v[1]); }
1637
+ inline CUDA_CALLABLE void adj_print(const vec_t<Length, Type>& v, const vec_t<Length, Type>& adj_v)
1638
+ {
1639
+ printf("adj:");
1640
+ for (unsigned i = 0; i < Length; i++)
1641
+ printf(" %g", float(adj_v[i]));
1642
+ printf("\n");
1643
+ }
1592
1644
 
1593
1645
  template<unsigned Rows, unsigned Cols, typename Type>
1594
- inline CUDA_CALLABLE void adj_print(mat_t<Rows, Cols, Type> m, mat_t<Rows, Cols, Type>& adj_m) { }
1646
+ inline CUDA_CALLABLE void adj_print(const mat_t<Rows, Cols, Type>& m, const mat_t<Rows, Cols, Type>& adj_m)
1647
+ {
1648
+ for (unsigned i = 0; i < Rows; i++)
1649
+ {
1650
+ if (i == 0)
1651
+ printf("adj:");
1652
+ else
1653
+ printf(" ");
1654
+ for (unsigned j = 0; j < Cols; j++)
1655
+ printf(" %g", float(adj_m.data[i][j]));
1656
+ printf("\n");
1657
+ }
1658
+ }
1595
1659
 
1596
1660
  template<typename Type>
1597
- inline CUDA_CALLABLE void adj_print(quat_t<Type> q, quat_t<Type>& adj_q) { printf("%g %g %g %g adj: %g %g %g %g\n", q.x, q.y, q.z, q.w, adj_q.x, adj_q.y, adj_q.z, adj_q.w); }
1661
+ inline CUDA_CALLABLE void adj_print(const quat_t<Type>& q, const quat_t<Type>& adj_q)
1662
+ {
1663
+ printf("adj: %g %g %g %g\n", float(adj_q.x), float(adj_q.y), float(adj_q.z), float(adj_q.w));
1664
+ }
1598
1665
 
1599
1666
  template<typename Type>
1600
- inline CUDA_CALLABLE void adj_print(transform_t<Type> t, transform_t<Type>& adj_t) {}
1601
-
1602
- inline CUDA_CALLABLE void adj_print(str t, str& adj_t) {}
1667
+ inline CUDA_CALLABLE void adj_print(const transform_t<Type>& t, const transform_t<Type>& adj_t)
1668
+ {
1669
+ printf("adj: (%g %g %g) (%g %g %g %g)\n",
1670
+ float(adj_t.p[0]), float(adj_t.p[1]), float(adj_t.p[2]),
1671
+ float(adj_t.q.x), float(adj_t.q.y), float(adj_t.q.z), float(adj_t.q.w));
1672
+ }
1603
1673
 
1674
+ inline CUDA_CALLABLE void adj_print(str t, str& adj_t)
1675
+ {
1676
+ printf("adj: %s\n", t);
1677
+ }
1604
1678
 
1605
1679
  template <typename T>
1606
1680
  inline CUDA_CALLABLE void expect_eq(const T& actual, const T& expected)
@@ -1683,3 +1757,10 @@ inline CUDA_CALLABLE void adj_expect_near(const vec3& actual, const vec3& expect
1683
1757
  #include "rand.h"
1684
1758
  #include "noise.h"
1685
1759
  #include "matnn.h"
1760
+
1761
+ // only include in kernels for now
1762
+ #if defined(__CUDACC_RTC__)
1763
+ #include "tile.h"
1764
+ #include "tile_gemm.h"
1765
+ #include "tile_reduce.h"
1766
+ #endif
warp/native/bvh.h CHANGED
@@ -404,6 +404,10 @@ CUDA_CALLABLE inline bvh_query_t iter_reverse(const bvh_query_t& query)
404
404
  return query;
405
405
  }
406
406
 
407
+ CUDA_CALLABLE inline void adj_iter_reverse(const bvh_query_t& query, bvh_query_t& adj_query, bvh_query_t& adj_ret)
408
+ {
409
+ }
410
+
407
411
 
408
412
  // stub
409
413
  CUDA_CALLABLE inline void adj_bvh_query_next(bvh_query_t& query, int& index, bvh_query_t&, int&, bool&)