warp-lang 1.4.1__py3-none-manylinux2014_x86_64.whl → 1.5.0__py3-none-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +4 -0
- warp/autograd.py +43 -8
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +21 -2
- warp/build_dll.py +23 -6
- warp/builtins.py +1920 -111
- warp/codegen.py +186 -62
- warp/config.py +2 -2
- warp/context.py +322 -73
- warp/examples/assets/pixel.jpg +0 -0
- warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
- warp/examples/benchmarks/benchmark_gemm.py +121 -0
- warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
- warp/examples/benchmarks/benchmark_tile.py +179 -0
- warp/examples/core/example_dem.py +2 -1
- warp/examples/core/example_mesh_intersect.py +3 -3
- warp/examples/fem/example_adaptive_grid.py +37 -10
- warp/examples/fem/example_apic_fluid.py +3 -2
- warp/examples/fem/example_convection_diffusion_dg.py +4 -5
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_diffusion_3d.py +47 -4
- warp/examples/fem/example_distortion_energy.py +220 -0
- warp/examples/fem/example_magnetostatics.py +127 -85
- warp/examples/fem/example_nonconforming_contact.py +5 -5
- warp/examples/fem/example_stokes.py +3 -1
- warp/examples/fem/example_streamlines.py +12 -19
- warp/examples/fem/utils.py +38 -15
- warp/examples/optim/example_walker.py +2 -2
- warp/examples/sim/example_cloth.py +2 -25
- warp/examples/sim/example_jacobian_ik.py +6 -2
- warp/examples/sim/example_quadruped.py +2 -1
- warp/examples/tile/example_tile_convolution.py +58 -0
- warp/examples/tile/example_tile_fft.py +47 -0
- warp/examples/tile/example_tile_filtering.py +105 -0
- warp/examples/tile/example_tile_matmul.py +79 -0
- warp/examples/tile/example_tile_mlp.py +375 -0
- warp/fem/__init__.py +8 -0
- warp/fem/cache.py +16 -12
- warp/fem/dirichlet.py +1 -1
- warp/fem/domain.py +44 -1
- warp/fem/field/__init__.py +1 -2
- warp/fem/field/field.py +31 -19
- warp/fem/field/nodal_field.py +101 -49
- warp/fem/field/virtual.py +794 -0
- warp/fem/geometry/__init__.py +2 -2
- warp/fem/geometry/deformed_geometry.py +3 -105
- warp/fem/geometry/element.py +13 -0
- warp/fem/geometry/geometry.py +165 -5
- warp/fem/geometry/grid_2d.py +3 -6
- warp/fem/geometry/grid_3d.py +31 -28
- warp/fem/geometry/hexmesh.py +3 -46
- warp/fem/geometry/nanogrid.py +3 -2
- warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
- warp/fem/geometry/tetmesh.py +2 -43
- warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
- warp/fem/integrate.py +683 -261
- warp/fem/linalg.py +404 -0
- warp/fem/operator.py +101 -18
- warp/fem/polynomial.py +5 -5
- warp/fem/quadrature/quadrature.py +45 -21
- warp/fem/space/__init__.py +45 -11
- warp/fem/space/basis_function_space.py +451 -0
- warp/fem/space/basis_space.py +58 -11
- warp/fem/space/function_space.py +146 -5
- warp/fem/space/grid_2d_function_space.py +80 -66
- warp/fem/space/grid_3d_function_space.py +113 -68
- warp/fem/space/hexmesh_function_space.py +96 -108
- warp/fem/space/nanogrid_function_space.py +62 -110
- warp/fem/space/quadmesh_function_space.py +208 -0
- warp/fem/space/shape/__init__.py +45 -7
- warp/fem/space/shape/cube_shape_function.py +328 -54
- warp/fem/space/shape/shape_function.py +10 -1
- warp/fem/space/shape/square_shape_function.py +328 -60
- warp/fem/space/shape/tet_shape_function.py +269 -19
- warp/fem/space/shape/triangle_shape_function.py +238 -19
- warp/fem/space/tetmesh_function_space.py +69 -37
- warp/fem/space/topology.py +38 -0
- warp/fem/space/trimesh_function_space.py +179 -0
- warp/fem/utils.py +6 -331
- warp/jax_experimental.py +3 -1
- warp/native/array.h +55 -40
- warp/native/builtin.h +124 -43
- warp/native/bvh.h +4 -0
- warp/native/coloring.cpp +600 -0
- warp/native/cuda_util.cpp +14 -0
- warp/native/cuda_util.h +2 -1
- warp/native/fabric.h +8 -0
- warp/native/hashgrid.h +4 -0
- warp/native/marching.cu +8 -0
- warp/native/mat.h +14 -3
- warp/native/mathdx.cpp +59 -0
- warp/native/mesh.h +4 -0
- warp/native/range.h +13 -1
- warp/native/reduce.cpp +9 -1
- warp/native/reduce.cu +7 -0
- warp/native/runlength_encode.cpp +9 -1
- warp/native/runlength_encode.cu +7 -1
- warp/native/scan.cpp +8 -0
- warp/native/scan.cu +8 -0
- warp/native/scan.h +8 -1
- warp/native/sparse.cpp +8 -0
- warp/native/sparse.cu +8 -0
- warp/native/temp_buffer.h +7 -0
- warp/native/tile.h +1857 -0
- warp/native/tile_gemm.h +341 -0
- warp/native/tile_reduce.h +210 -0
- warp/native/volume_builder.cu +8 -0
- warp/native/volume_builder.h +8 -0
- warp/native/warp.cpp +10 -2
- warp/native/warp.cu +369 -15
- warp/native/warp.h +12 -2
- warp/optim/adam.py +39 -4
- warp/paddle.py +29 -12
- warp/render/render_opengl.py +137 -65
- warp/sim/graph_coloring.py +292 -0
- warp/sim/integrator_euler.py +4 -2
- warp/sim/integrator_featherstone.py +115 -44
- warp/sim/integrator_vbd.py +6 -0
- warp/sim/model.py +90 -17
- warp/stubs.py +651 -85
- warp/tape.py +12 -7
- warp/tests/assets/pixel.npy +0 -0
- warp/tests/aux_test_instancing_gc.py +18 -0
- warp/tests/test_array.py +207 -48
- warp/tests/test_closest_point_edge_edge.py +8 -8
- warp/tests/test_codegen.py +120 -1
- warp/tests/test_codegen_instancing.py +30 -0
- warp/tests/test_collision.py +110 -0
- warp/tests/test_coloring.py +241 -0
- warp/tests/test_context.py +34 -0
- warp/tests/test_examples.py +18 -4
- warp/tests/test_fabricarray.py +33 -0
- warp/tests/test_fem.py +453 -113
- warp/tests/test_func.py +48 -1
- warp/tests/test_generics.py +52 -0
- warp/tests/test_iter.py +68 -0
- warp/tests/test_mat_scalar_ops.py +1 -1
- warp/tests/test_mesh_query_point.py +5 -4
- warp/tests/test_module_hashing.py +23 -0
- warp/tests/test_paddle.py +27 -87
- warp/tests/test_print.py +191 -1
- warp/tests/test_spatial.py +1 -1
- warp/tests/test_tile.py +700 -0
- warp/tests/test_tile_mathdx.py +144 -0
- warp/tests/test_tile_mlp.py +383 -0
- warp/tests/test_tile_reduce.py +374 -0
- warp/tests/test_tile_shared_memory.py +190 -0
- warp/tests/test_vbd.py +12 -20
- warp/tests/test_volume.py +43 -0
- warp/tests/unittest_suites.py +23 -2
- warp/tests/unittest_utils.py +4 -0
- warp/types.py +339 -73
- warp/utils.py +22 -1
- {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/METADATA +33 -7
- {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/RECORD +159 -132
- {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/WHEEL +1 -1
- warp/fem/field/test.py +0 -180
- warp/fem/field/trial.py +0 -183
- warp/fem/space/collocated_function_space.py +0 -102
- warp/fem/space/quadmesh_2d_function_space.py +0 -261
- warp/fem/space/trimesh_2d_function_space.py +0 -153
- {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/top_level.txt +0 -0
warp/native/array.h
CHANGED
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
/** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
* NVIDIA CORPORATION and its licensors retain all intellectual property
|
|
3
|
+
* and proprietary rights in and to this software, related documentation
|
|
4
|
+
* and any modifications thereto. Any use, reproduction, disclosure or
|
|
5
|
+
* distribution of this software and related documentation without an express
|
|
6
|
+
* license agreement from NVIDIA CORPORATION is strictly prohibited.
|
|
7
|
+
*/
|
|
8
|
+
|
|
1
9
|
#pragma once
|
|
2
10
|
|
|
3
11
|
#include "builtin.h"
|
|
@@ -285,6 +293,13 @@ CUDA_CALLABLE inline size_t byte_offset(const array_t<T>& arr, int i)
|
|
|
285
293
|
template <typename T>
|
|
286
294
|
CUDA_CALLABLE inline size_t byte_offset(const array_t<T>& arr, int i, int j)
|
|
287
295
|
{
|
|
296
|
+
// if (i < 0 || i >= arr.shape[0])
|
|
297
|
+
// printf("i: %d > arr.shape[0]: %d\n", i, arr.shape[0]);
|
|
298
|
+
|
|
299
|
+
// if (j < 0 || j >= arr.shape[1])
|
|
300
|
+
// printf("j: %d > arr.shape[1]: %d\n", j, arr.shape[1]);
|
|
301
|
+
|
|
302
|
+
|
|
288
303
|
assert(i >= 0 && i < arr.shape[0]);
|
|
289
304
|
assert(j >= 0 && j < arr.shape[1]);
|
|
290
305
|
|
|
@@ -811,7 +826,7 @@ CUDA_CALLABLE inline void adj_atomic_add(bool* buf, bool value) { }
|
|
|
811
826
|
|
|
812
827
|
// only generate gradients for T types
|
|
813
828
|
template<typename T>
|
|
814
|
-
inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, const array_t<T>& adj_buf, int
|
|
829
|
+
inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, const array_t<T>& adj_buf, int adj_i, const T& adj_output)
|
|
815
830
|
{
|
|
816
831
|
if (adj_buf.data)
|
|
817
832
|
adj_atomic_add(&index(adj_buf, i), adj_output);
|
|
@@ -819,7 +834,7 @@ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, const array_
|
|
|
819
834
|
adj_atomic_add(&index_grad(buf, i), adj_output);
|
|
820
835
|
}
|
|
821
836
|
template<typename T>
|
|
822
|
-
inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, const array_t<T>& adj_buf, int
|
|
837
|
+
inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, const array_t<T>& adj_buf, int adj_i, int adj_j, const T& adj_output)
|
|
823
838
|
{
|
|
824
839
|
if (adj_buf.data)
|
|
825
840
|
adj_atomic_add(&index(adj_buf, i, j), adj_output);
|
|
@@ -827,7 +842,7 @@ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, const
|
|
|
827
842
|
adj_atomic_add(&index_grad(buf, i, j), adj_output);
|
|
828
843
|
}
|
|
829
844
|
template<typename T>
|
|
830
|
-
inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k, const array_t<T>& adj_buf, int
|
|
845
|
+
inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, const T& adj_output)
|
|
831
846
|
{
|
|
832
847
|
if (adj_buf.data)
|
|
833
848
|
adj_atomic_add(&index(adj_buf, i, j, k), adj_output);
|
|
@@ -835,7 +850,7 @@ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k
|
|
|
835
850
|
adj_atomic_add(&index_grad(buf, i, j, k), adj_output);
|
|
836
851
|
}
|
|
837
852
|
template<typename T>
|
|
838
|
-
inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k, int l, const array_t<T>& adj_buf, int
|
|
853
|
+
inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k, int l, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, const T& adj_output)
|
|
839
854
|
{
|
|
840
855
|
if (adj_buf.data)
|
|
841
856
|
adj_atomic_add(&index(adj_buf, i, j, k, l), adj_output);
|
|
@@ -844,7 +859,7 @@ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k
|
|
|
844
859
|
}
|
|
845
860
|
|
|
846
861
|
template<typename T>
|
|
847
|
-
inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int
|
|
862
|
+
inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int adj_i, T& adj_value)
|
|
848
863
|
{
|
|
849
864
|
if (adj_buf.data)
|
|
850
865
|
adj_value += index(adj_buf, i);
|
|
@@ -854,7 +869,7 @@ inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, T value,
|
|
|
854
869
|
FP_VERIFY_ADJ_1(value, adj_value)
|
|
855
870
|
}
|
|
856
871
|
template<typename T>
|
|
857
|
-
inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int
|
|
872
|
+
inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, T& adj_value)
|
|
858
873
|
{
|
|
859
874
|
if (adj_buf.data)
|
|
860
875
|
adj_value += index(adj_buf, i, j);
|
|
@@ -864,7 +879,7 @@ inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, T
|
|
|
864
879
|
FP_VERIFY_ADJ_2(value, adj_value)
|
|
865
880
|
}
|
|
866
881
|
template<typename T>
|
|
867
|
-
inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int
|
|
882
|
+
inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value)
|
|
868
883
|
{
|
|
869
884
|
if (adj_buf.data)
|
|
870
885
|
adj_value += index(adj_buf, i, j, k);
|
|
@@ -874,7 +889,7 @@ inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, i
|
|
|
874
889
|
FP_VERIFY_ADJ_3(value, adj_value)
|
|
875
890
|
}
|
|
876
891
|
template<typename T>
|
|
877
|
-
inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int
|
|
892
|
+
inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value)
|
|
878
893
|
{
|
|
879
894
|
if (adj_buf.data)
|
|
880
895
|
adj_value += index(adj_buf, i, j, k, l);
|
|
@@ -898,7 +913,7 @@ inline CUDA_CALLABLE void adj_load(const T* address, const T& adj_address, T& ad
|
|
|
898
913
|
}
|
|
899
914
|
|
|
900
915
|
template<typename T>
|
|
901
|
-
inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int
|
|
916
|
+
inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret)
|
|
902
917
|
{
|
|
903
918
|
if (adj_buf.data)
|
|
904
919
|
adj_value += index(adj_buf, i);
|
|
@@ -908,7 +923,7 @@ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, T value,
|
|
|
908
923
|
FP_VERIFY_ADJ_1(value, adj_value)
|
|
909
924
|
}
|
|
910
925
|
template<typename T>
|
|
911
|
-
inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int
|
|
926
|
+
inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret)
|
|
912
927
|
{
|
|
913
928
|
if (adj_buf.data)
|
|
914
929
|
adj_value += index(adj_buf, i, j);
|
|
@@ -918,7 +933,7 @@ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, T
|
|
|
918
933
|
FP_VERIFY_ADJ_2(value, adj_value)
|
|
919
934
|
}
|
|
920
935
|
template<typename T>
|
|
921
|
-
inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int
|
|
936
|
+
inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret)
|
|
922
937
|
{
|
|
923
938
|
if (adj_buf.data)
|
|
924
939
|
adj_value += index(adj_buf, i, j, k);
|
|
@@ -928,7 +943,7 @@ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, in
|
|
|
928
943
|
FP_VERIFY_ADJ_3(value, adj_value)
|
|
929
944
|
}
|
|
930
945
|
template<typename T>
|
|
931
|
-
inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int
|
|
946
|
+
inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret)
|
|
932
947
|
{
|
|
933
948
|
if (adj_buf.data)
|
|
934
949
|
adj_value += index(adj_buf, i, j, k, l);
|
|
@@ -939,7 +954,7 @@ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, in
|
|
|
939
954
|
}
|
|
940
955
|
|
|
941
956
|
template<typename T>
|
|
942
|
-
inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int
|
|
957
|
+
inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret)
|
|
943
958
|
{
|
|
944
959
|
if (adj_buf.data)
|
|
945
960
|
adj_value -= index(adj_buf, i);
|
|
@@ -949,7 +964,7 @@ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, T value,
|
|
|
949
964
|
FP_VERIFY_ADJ_1(value, adj_value)
|
|
950
965
|
}
|
|
951
966
|
template<typename T>
|
|
952
|
-
inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int
|
|
967
|
+
inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret)
|
|
953
968
|
{
|
|
954
969
|
if (adj_buf.data)
|
|
955
970
|
adj_value -= index(adj_buf, i, j);
|
|
@@ -959,7 +974,7 @@ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, T
|
|
|
959
974
|
FP_VERIFY_ADJ_2(value, adj_value)
|
|
960
975
|
}
|
|
961
976
|
template<typename T>
|
|
962
|
-
inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int
|
|
977
|
+
inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret)
|
|
963
978
|
{
|
|
964
979
|
if (adj_buf.data)
|
|
965
980
|
adj_value -= index(adj_buf, i, j, k);
|
|
@@ -969,7 +984,7 @@ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, in
|
|
|
969
984
|
FP_VERIFY_ADJ_3(value, adj_value)
|
|
970
985
|
}
|
|
971
986
|
template<typename T>
|
|
972
|
-
inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int
|
|
987
|
+
inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret)
|
|
973
988
|
{
|
|
974
989
|
if (adj_buf.data)
|
|
975
990
|
adj_value -= index(adj_buf, i, j, k, l);
|
|
@@ -981,44 +996,44 @@ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, in
|
|
|
981
996
|
|
|
982
997
|
// generic array types that do not support gradient computation (indexedarray, etc.)
|
|
983
998
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
984
|
-
inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, const A2<T>& adj_buf, int
|
|
999
|
+
inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, const A2<T>& adj_buf, int adj_i, const T& adj_output) {}
|
|
985
1000
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
986
|
-
inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, const A2<T>& adj_buf, int
|
|
1001
|
+
inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, const A2<T>& adj_buf, int adj_i, int adj_j, const T& adj_output) {}
|
|
987
1002
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
988
|
-
inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, int k, const A2<T>& adj_buf, int
|
|
1003
|
+
inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, int k, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, const T& adj_output) {}
|
|
989
1004
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
990
|
-
inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, int k, int l, const A2<T>& adj_buf, int
|
|
1005
|
+
inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, int k, int l, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, const T& adj_output) {}
|
|
991
1006
|
|
|
992
1007
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
993
|
-
inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int
|
|
1008
|
+
inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value) {}
|
|
994
1009
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
995
|
-
inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int
|
|
1010
|
+
inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value) {}
|
|
996
1011
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
997
|
-
inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int
|
|
1012
|
+
inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value) {}
|
|
998
1013
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
999
|
-
inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int
|
|
1014
|
+
inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value) {}
|
|
1000
1015
|
|
|
1001
1016
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1002
|
-
inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int
|
|
1017
|
+
inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {}
|
|
1003
1018
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1004
|
-
inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int
|
|
1019
|
+
inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {}
|
|
1005
1020
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1006
|
-
inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int
|
|
1021
|
+
inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {}
|
|
1007
1022
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1008
|
-
inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int
|
|
1023
|
+
inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {}
|
|
1009
1024
|
|
|
1010
1025
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1011
|
-
inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int
|
|
1026
|
+
inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {}
|
|
1012
1027
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1013
|
-
inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int
|
|
1028
|
+
inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {}
|
|
1014
1029
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1015
|
-
inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int
|
|
1030
|
+
inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {}
|
|
1016
1031
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1017
|
-
inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int
|
|
1032
|
+
inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {}
|
|
1018
1033
|
|
|
1019
1034
|
// generic handler for scalar values
|
|
1020
1035
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1021
|
-
inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int
|
|
1036
|
+
inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {
|
|
1022
1037
|
if (adj_buf.data)
|
|
1023
1038
|
adj_atomic_minmax(&index(buf, i), &index(adj_buf, i), value, adj_value);
|
|
1024
1039
|
else if (buf.grad)
|
|
@@ -1027,7 +1042,7 @@ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, T value, const
|
|
|
1027
1042
|
FP_VERIFY_ADJ_1(value, adj_value)
|
|
1028
1043
|
}
|
|
1029
1044
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1030
|
-
inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int
|
|
1045
|
+
inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {
|
|
1031
1046
|
if (adj_buf.data)
|
|
1032
1047
|
adj_atomic_minmax(&index(buf, i, j), &index(adj_buf, i, j), value, adj_value);
|
|
1033
1048
|
else if (buf.grad)
|
|
@@ -1036,7 +1051,7 @@ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, T value
|
|
|
1036
1051
|
FP_VERIFY_ADJ_2(value, adj_value)
|
|
1037
1052
|
}
|
|
1038
1053
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1039
|
-
inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int
|
|
1054
|
+
inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {
|
|
1040
1055
|
if (adj_buf.data)
|
|
1041
1056
|
adj_atomic_minmax(&index(buf, i, j, k), &index(adj_buf, i, j, k), value, adj_value);
|
|
1042
1057
|
else if (buf.grad)
|
|
@@ -1045,7 +1060,7 @@ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k,
|
|
|
1045
1060
|
FP_VERIFY_ADJ_3(value, adj_value)
|
|
1046
1061
|
}
|
|
1047
1062
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1048
|
-
inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int
|
|
1063
|
+
inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {
|
|
1049
1064
|
if (adj_buf.data)
|
|
1050
1065
|
adj_atomic_minmax(&index(buf, i, j, k, l), &index(adj_buf, i, j, k, l), value, adj_value);
|
|
1051
1066
|
else if (buf.grad)
|
|
@@ -1055,7 +1070,7 @@ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k,
|
|
|
1055
1070
|
}
|
|
1056
1071
|
|
|
1057
1072
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1058
|
-
inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int
|
|
1073
|
+
inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {
|
|
1059
1074
|
if (adj_buf.data)
|
|
1060
1075
|
adj_atomic_minmax(&index(buf, i), &index(adj_buf, i), value, adj_value);
|
|
1061
1076
|
else if (buf.grad)
|
|
@@ -1064,7 +1079,7 @@ inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, T value, const
|
|
|
1064
1079
|
FP_VERIFY_ADJ_1(value, adj_value)
|
|
1065
1080
|
}
|
|
1066
1081
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1067
|
-
inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int
|
|
1082
|
+
inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {
|
|
1068
1083
|
if (adj_buf.data)
|
|
1069
1084
|
adj_atomic_minmax(&index(buf, i, j), &index(adj_buf, i, j), value, adj_value);
|
|
1070
1085
|
else if (buf.grad)
|
|
@@ -1073,7 +1088,7 @@ inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, T value
|
|
|
1073
1088
|
FP_VERIFY_ADJ_2(value, adj_value)
|
|
1074
1089
|
}
|
|
1075
1090
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1076
|
-
inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int
|
|
1091
|
+
inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {
|
|
1077
1092
|
if (adj_buf.data)
|
|
1078
1093
|
adj_atomic_minmax(&index(buf, i, j, k), &index(adj_buf, i, j, k), value, adj_value);
|
|
1079
1094
|
else if (buf.grad)
|
|
@@ -1082,7 +1097,7 @@ inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k,
|
|
|
1082
1097
|
FP_VERIFY_ADJ_3(value, adj_value)
|
|
1083
1098
|
}
|
|
1084
1099
|
template<template<typename> class A1, template<typename> class A2, typename T>
|
|
1085
|
-
inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int
|
|
1100
|
+
inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {
|
|
1086
1101
|
if (adj_buf.data)
|
|
1087
1102
|
adj_atomic_minmax(&index(buf, i, j, k, l), &index(adj_buf, i, j, k, l), value, adj_value);
|
|
1088
1103
|
else if (buf.grad)
|
warp/native/builtin.h
CHANGED
|
@@ -1145,7 +1145,47 @@ struct launch_bounds_t
|
|
|
1145
1145
|
size_t size; // total number of threads
|
|
1146
1146
|
};
|
|
1147
1147
|
|
|
1148
|
-
|
|
1148
|
+
// represents coordinate in the launch grid
|
|
1149
|
+
struct launch_coord_t
|
|
1150
|
+
{
|
|
1151
|
+
int i;
|
|
1152
|
+
int j;
|
|
1153
|
+
int k;
|
|
1154
|
+
int l;
|
|
1155
|
+
};
|
|
1156
|
+
|
|
1157
|
+
// unravels a linear thread index to the corresponding launch grid coord (up to 4d)
|
|
1158
|
+
inline CUDA_CALLABLE launch_coord_t launch_coord(size_t linear, const launch_bounds_t& bounds)
|
|
1159
|
+
{
|
|
1160
|
+
launch_coord_t coord = {0, 0, 0, 0};
|
|
1161
|
+
|
|
1162
|
+
if (bounds.ndim > 3)
|
|
1163
|
+
{
|
|
1164
|
+
coord.l = linear%bounds.shape[3];
|
|
1165
|
+
linear /= bounds.shape[3];
|
|
1166
|
+
}
|
|
1167
|
+
|
|
1168
|
+
if (bounds.ndim > 2)
|
|
1169
|
+
{
|
|
1170
|
+
coord.k = linear%bounds.shape[2];
|
|
1171
|
+
linear /= bounds.shape[2];
|
|
1172
|
+
}
|
|
1173
|
+
|
|
1174
|
+
if (bounds.ndim > 1)
|
|
1175
|
+
{
|
|
1176
|
+
coord.j = linear%bounds.shape[1];
|
|
1177
|
+
linear /= bounds.shape[1];
|
|
1178
|
+
}
|
|
1179
|
+
|
|
1180
|
+
if (bounds.ndim > 0)
|
|
1181
|
+
{
|
|
1182
|
+
coord.i = linear;
|
|
1183
|
+
}
|
|
1184
|
+
|
|
1185
|
+
return coord;
|
|
1186
|
+
}
|
|
1187
|
+
|
|
1188
|
+
inline CUDA_CALLABLE int tid(size_t index, const launch_bounds_t& bounds)
|
|
1149
1189
|
{
|
|
1150
1190
|
// For the 1-D tid() we need to warn the user if we're about to provide a truncated index
|
|
1151
1191
|
// Only do this in _DEBUG when called from device to avoid excessive register allocation
|
|
@@ -1154,40 +1194,33 @@ inline CUDA_CALLABLE int tid(size_t index)
|
|
|
1154
1194
|
printf("Warp warning: tid() is returning an overflowed int\n");
|
|
1155
1195
|
}
|
|
1156
1196
|
#endif
|
|
1157
|
-
|
|
1197
|
+
|
|
1198
|
+
launch_coord_t c = launch_coord(index, bounds);
|
|
1199
|
+
return static_cast<int>(c.i);
|
|
1158
1200
|
}
|
|
1159
1201
|
|
|
1160
|
-
inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, size_t index, const launch_bounds_t&
|
|
1202
|
+
// 2-D thread index: decodes `index` with launch_coord() and writes the first
// two coordinates to `i` and `j`.
inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, size_t index, const launch_bounds_t& bounds)
{
    const launch_coord_t coord = launch_coord(index, bounds);

    i = coord.i;
    j = coord.j;
}
|
|
1168
1208
|
|
|
1169
|
-
inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, size_t index, const launch_bounds_t&
|
|
1209
|
+
// 3-D thread index: decodes `index` with launch_coord() and writes the first
// three coordinates to `i`, `j` and `k`.
inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, size_t index, const launch_bounds_t& bounds)
{
    const launch_coord_t coord = launch_coord(index, bounds);

    i = coord.i;
    j = coord.j;
    k = coord.k;
}
|
|
1179
1216
|
|
|
1180
|
-
inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, int& l, size_t index, const launch_bounds_t&
|
|
1217
|
+
// 4-D thread index: decodes `index` with launch_coord() and writes all four
// coordinates to `i`, `j`, `k` and `l`.
inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, int& l, size_t index, const launch_bounds_t& bounds)
{
    const launch_coord_t coord = launch_coord(index, bounds);

    i = coord.i;
    j = coord.j;
    k = coord.k;
    l = coord.l;
}
|
|
1192
1225
|
|
|
1193
1226
|
template<typename T>
|
|
@@ -1575,32 +1608,73 @@ inline CUDA_CALLABLE void print(transform_t<Type> t)
|
|
|
1575
1608
|
printf("(%g %g %g) (%g %g %g %g)\n", float(t.p[0]), float(t.p[1]), float(t.p[2]), float(t.q.x), float(t.q.y), float(t.q.z), float(t.q.w));
|
|
1576
1609
|
}
|
|
1577
1610
|
|
|
1578
|
-
|
|
1579
|
-
inline CUDA_CALLABLE void adj_print(
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
inline CUDA_CALLABLE void adj_print(
|
|
1586
|
-
inline CUDA_CALLABLE void adj_print(
|
|
1587
|
-
inline CUDA_CALLABLE void adj_print(
|
|
1588
|
-
|
|
1611
|
+
template<typename T>
|
|
1612
|
+
inline CUDA_CALLABLE void adj_print(const T& x, const T& adj_x)
|
|
1613
|
+
{
|
|
1614
|
+
printf("adj: <type without print implementation>\n");
|
|
1615
|
+
}
|
|
1616
|
+
|
|
1617
|
+
// note: adj_print() only prints the adjoint value, since the value itself gets printed in replay print()
|
|
1618
|
+
// Prints the adjoint of a half scalar after converting it with half_to_float();
// the primal value `x` is not read.
inline CUDA_CALLABLE void adj_print(half x, half adj_x)
{
    printf("adj: %g\n", half_to_float(adj_x));
}
|
|
1619
|
+
// Prints the adjoint of a float scalar; the primal value `x` is not read.
inline CUDA_CALLABLE void adj_print(float x, float adj_x)
{
    printf("adj: %g\n", adj_x);
}
|
|
1620
|
+
// Prints the adjoint of a double scalar; the primal value `x` is not read.
inline CUDA_CALLABLE void adj_print(double x, double adj_x)
{
    printf("adj: %g\n", adj_x);
}
|
|
1621
|
+
|
|
1622
|
+
// Prints the adjoint of a signed char scalar as a decimal integer; the primal
// value `x` is not read.
inline CUDA_CALLABLE void adj_print(signed char x, signed char adj_x)
{
    printf("adj: %d\n", adj_x);
}
|
|
1623
|
+
// Prints the adjoint of a short scalar as a decimal integer; the primal value
// `x` is not read.
inline CUDA_CALLABLE void adj_print(short x, short adj_x)
{
    printf("adj: %d\n", adj_x);
}
|
|
1624
|
+
// Prints the adjoint of an int scalar; the primal value `x` is not read.
inline CUDA_CALLABLE void adj_print(int x, int adj_x)
{
    printf("adj: %d\n", adj_x);
}
|
|
1625
|
+
// Prints the adjoint of a long scalar; the primal value `x` is not read.
inline CUDA_CALLABLE void adj_print(long x, long adj_x)
{
    printf("adj: %ld\n", adj_x);
}
|
|
1626
|
+
// Prints the adjoint of a long long scalar; the primal value `x` is not read.
inline CUDA_CALLABLE void adj_print(long long x, long long adj_x)
{
    printf("adj: %lld\n", adj_x);
}
|
|
1627
|
+
|
|
1628
|
+
// Prints the adjoint of an unsigned char scalar as an unsigned decimal integer;
// the primal value `x` is not read.
inline CUDA_CALLABLE void adj_print(unsigned char x, unsigned char adj_x)
{
    // In a variadic call an unsigned char is promoted to int, whereas %u consumes
    // an unsigned int; convert explicitly so argument and specifier agree.
    printf("adj: %u\n", static_cast<unsigned>(adj_x));
}
|
|
1629
|
+
// Prints the adjoint of an unsigned short scalar as an unsigned decimal integer;
// the primal value `x` is not read.
inline CUDA_CALLABLE void adj_print(unsigned short x, unsigned short adj_x)
{
    // In a variadic call an unsigned short is promoted before being passed,
    // whereas %u consumes an unsigned int; convert explicitly so argument and
    // specifier agree.
    printf("adj: %u\n", static_cast<unsigned>(adj_x));
}
|
|
1630
|
+
// Prints the adjoint of an unsigned int scalar; the primal value `x` is not read.
inline CUDA_CALLABLE void adj_print(unsigned x, unsigned adj_x)
{
    printf("adj: %u\n", adj_x);
}
|
|
1631
|
+
// Prints the adjoint of an unsigned long scalar; the primal value `x` is not read.
inline CUDA_CALLABLE void adj_print(unsigned long x, unsigned long adj_x)
{
    printf("adj: %lu\n", adj_x);
}
|
|
1632
|
+
// Prints the adjoint of an unsigned long long scalar; the primal value `x` is
// not read.
inline CUDA_CALLABLE void adj_print(unsigned long long x, unsigned long long adj_x)
{
    printf("adj: %llu\n", adj_x);
}
|
|
1633
|
+
|
|
1634
|
+
// Prints the adjoint of a bool as the word "True" or "False"; the primal value
// `x` is not read.
inline CUDA_CALLABLE void adj_print(bool x, bool adj_x)
{
    const char* label = adj_x ? "True" : "False";
    printf("adj: %s\n", label);
}
|
|
1589
1635
|
|
|
1590
1636
|
template<unsigned Length, typename Type>
|
|
1591
|
-
inline CUDA_CALLABLE void adj_print(vec_t<Length, Type
|
|
1637
|
+
inline CUDA_CALLABLE void adj_print(const vec_t<Length, Type>& v, const vec_t<Length, Type>& adj_v)
|
|
1638
|
+
{
|
|
1639
|
+
printf("adj:");
|
|
1640
|
+
for (unsigned i = 0; i < Length; i++)
|
|
1641
|
+
printf(" %g", float(adj_v[i]));
|
|
1642
|
+
printf("\n");
|
|
1643
|
+
}
|
|
1592
1644
|
|
|
1593
1645
|
template<unsigned Rows, unsigned Cols, typename Type>
|
|
1594
|
-
inline CUDA_CALLABLE void adj_print(mat_t<Rows, Cols, Type
|
|
1646
|
+
inline CUDA_CALLABLE void adj_print(const mat_t<Rows, Cols, Type>& m, const mat_t<Rows, Cols, Type>& adj_m)
|
|
1647
|
+
{
|
|
1648
|
+
for (unsigned i = 0; i < Rows; i++)
|
|
1649
|
+
{
|
|
1650
|
+
if (i == 0)
|
|
1651
|
+
printf("adj:");
|
|
1652
|
+
else
|
|
1653
|
+
printf(" ");
|
|
1654
|
+
for (unsigned j = 0; j < Cols; j++)
|
|
1655
|
+
printf(" %g", float(adj_m.data[i][j]));
|
|
1656
|
+
printf("\n");
|
|
1657
|
+
}
|
|
1658
|
+
}
|
|
1595
1659
|
|
|
1596
1660
|
template<typename Type>
|
|
1597
|
-
inline CUDA_CALLABLE void adj_print(quat_t<Type
|
|
1661
|
+
inline CUDA_CALLABLE void adj_print(const quat_t<Type>& q, const quat_t<Type>& adj_q)
|
|
1662
|
+
{
|
|
1663
|
+
printf("adj: %g %g %g %g\n", float(adj_q.x), float(adj_q.y), float(adj_q.z), float(adj_q.w));
|
|
1664
|
+
}
|
|
1598
1665
|
|
|
1599
1666
|
template<typename Type>
|
|
1600
|
-
inline CUDA_CALLABLE void adj_print(transform_t<Type
|
|
1601
|
-
|
|
1602
|
-
|
|
1667
|
+
inline CUDA_CALLABLE void adj_print(const transform_t<Type>& t, const transform_t<Type>& adj_t)
|
|
1668
|
+
{
|
|
1669
|
+
printf("adj: (%g %g %g) (%g %g %g %g)\n",
|
|
1670
|
+
float(adj_t.p[0]), float(adj_t.p[1]), float(adj_t.p[2]),
|
|
1671
|
+
float(adj_t.q.x), float(adj_t.q.y), float(adj_t.q.z), float(adj_t.q.w));
|
|
1672
|
+
}
|
|
1603
1673
|
|
|
1674
|
+
// adj_print() overload for strings. Unlike the numeric overloads it prints the
// string `t` itself after the "adj: " label; `adj_t` is never read.
inline CUDA_CALLABLE void adj_print(str t, str& adj_t)
{
    printf("adj: %s\n", t);
}
|
|
1604
1678
|
|
|
1605
1679
|
template <typename T>
|
|
1606
1680
|
inline CUDA_CALLABLE void expect_eq(const T& actual, const T& expected)
|
|
@@ -1683,3 +1757,10 @@ inline CUDA_CALLABLE void adj_expect_near(const vec3& actual, const vec3& expect
|
|
|
1683
1757
|
#include "rand.h"
|
|
1684
1758
|
#include "noise.h"
|
|
1685
1759
|
#include "matnn.h"
|
|
1760
|
+
|
|
1761
|
+
// only include in kernels for now
|
|
1762
|
+
#if defined(__CUDACC_RTC__)
|
|
1763
|
+
#include "tile.h"
|
|
1764
|
+
#include "tile_gemm.h"
|
|
1765
|
+
#include "tile_reduce.h"
|
|
1766
|
+
#endif
|
warp/native/bvh.h
CHANGED
|
@@ -404,6 +404,10 @@ CUDA_CALLABLE inline bvh_query_t iter_reverse(const bvh_query_t& query)
|
|
|
404
404
|
return query;
|
|
405
405
|
}
|
|
406
406
|
|
|
407
|
+
// Adjoint counterpart of iter_reverse() for BVH queries.
// The body is empty: none of the arguments is read or modified.
CUDA_CALLABLE inline void adj_iter_reverse(const bvh_query_t& query, bvh_query_t& adj_query, bvh_query_t& adj_ret)
{
    // nothing to do
}
|
|
410
|
+
|
|
407
411
|
|
|
408
412
|
// stub
|
|
409
413
|
CUDA_CALLABLE inline void adj_bvh_query_next(bvh_query_t& query, int& index, bvh_query_t&, int&, bool&)
|