warp-lang 1.4.1-py3-none-manylinux2014_aarch64.whl → 1.5.0-py3-none-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of warp-lang has been flagged as possibly problematic.

Files changed (164)
  1. warp/__init__.py +4 -0
  2. warp/autograd.py +43 -8
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +21 -2
  6. warp/build_dll.py +23 -6
  7. warp/builtins.py +1920 -111
  8. warp/codegen.py +186 -62
  9. warp/config.py +2 -2
  10. warp/context.py +322 -73
  11. warp/examples/assets/pixel.jpg +0 -0
  12. warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
  13. warp/examples/benchmarks/benchmark_gemm.py +121 -0
  14. warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
  15. warp/examples/benchmarks/benchmark_tile.py +179 -0
  16. warp/examples/core/example_dem.py +2 -1
  17. warp/examples/core/example_mesh_intersect.py +3 -3
  18. warp/examples/fem/example_adaptive_grid.py +37 -10
  19. warp/examples/fem/example_apic_fluid.py +3 -2
  20. warp/examples/fem/example_convection_diffusion_dg.py +4 -5
  21. warp/examples/fem/example_deformed_geometry.py +1 -1
  22. warp/examples/fem/example_diffusion_3d.py +47 -4
  23. warp/examples/fem/example_distortion_energy.py +220 -0
  24. warp/examples/fem/example_magnetostatics.py +127 -85
  25. warp/examples/fem/example_nonconforming_contact.py +5 -5
  26. warp/examples/fem/example_stokes.py +3 -1
  27. warp/examples/fem/example_streamlines.py +12 -19
  28. warp/examples/fem/utils.py +38 -15
  29. warp/examples/optim/example_walker.py +2 -2
  30. warp/examples/sim/example_cloth.py +2 -25
  31. warp/examples/sim/example_jacobian_ik.py +6 -2
  32. warp/examples/sim/example_quadruped.py +2 -1
  33. warp/examples/tile/example_tile_convolution.py +58 -0
  34. warp/examples/tile/example_tile_fft.py +47 -0
  35. warp/examples/tile/example_tile_filtering.py +105 -0
  36. warp/examples/tile/example_tile_matmul.py +79 -0
  37. warp/examples/tile/example_tile_mlp.py +375 -0
  38. warp/fem/__init__.py +8 -0
  39. warp/fem/cache.py +16 -12
  40. warp/fem/dirichlet.py +1 -1
  41. warp/fem/domain.py +44 -1
  42. warp/fem/field/__init__.py +1 -2
  43. warp/fem/field/field.py +31 -19
  44. warp/fem/field/nodal_field.py +101 -49
  45. warp/fem/field/virtual.py +794 -0
  46. warp/fem/geometry/__init__.py +2 -2
  47. warp/fem/geometry/deformed_geometry.py +3 -105
  48. warp/fem/geometry/element.py +13 -0
  49. warp/fem/geometry/geometry.py +165 -5
  50. warp/fem/geometry/grid_2d.py +3 -6
  51. warp/fem/geometry/grid_3d.py +31 -28
  52. warp/fem/geometry/hexmesh.py +3 -46
  53. warp/fem/geometry/nanogrid.py +3 -2
  54. warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
  55. warp/fem/geometry/tetmesh.py +2 -43
  56. warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
  57. warp/fem/integrate.py +683 -261
  58. warp/fem/linalg.py +404 -0
  59. warp/fem/operator.py +101 -18
  60. warp/fem/polynomial.py +5 -5
  61. warp/fem/quadrature/quadrature.py +45 -21
  62. warp/fem/space/__init__.py +45 -11
  63. warp/fem/space/basis_function_space.py +451 -0
  64. warp/fem/space/basis_space.py +58 -11
  65. warp/fem/space/function_space.py +146 -5
  66. warp/fem/space/grid_2d_function_space.py +80 -66
  67. warp/fem/space/grid_3d_function_space.py +113 -68
  68. warp/fem/space/hexmesh_function_space.py +96 -108
  69. warp/fem/space/nanogrid_function_space.py +62 -110
  70. warp/fem/space/quadmesh_function_space.py +208 -0
  71. warp/fem/space/shape/__init__.py +45 -7
  72. warp/fem/space/shape/cube_shape_function.py +328 -54
  73. warp/fem/space/shape/shape_function.py +10 -1
  74. warp/fem/space/shape/square_shape_function.py +328 -60
  75. warp/fem/space/shape/tet_shape_function.py +269 -19
  76. warp/fem/space/shape/triangle_shape_function.py +238 -19
  77. warp/fem/space/tetmesh_function_space.py +69 -37
  78. warp/fem/space/topology.py +38 -0
  79. warp/fem/space/trimesh_function_space.py +179 -0
  80. warp/fem/utils.py +6 -331
  81. warp/jax_experimental.py +3 -1
  82. warp/native/array.h +55 -40
  83. warp/native/builtin.h +124 -43
  84. warp/native/bvh.h +4 -0
  85. warp/native/coloring.cpp +600 -0
  86. warp/native/cuda_util.cpp +14 -0
  87. warp/native/cuda_util.h +2 -1
  88. warp/native/fabric.h +8 -0
  89. warp/native/hashgrid.h +4 -0
  90. warp/native/marching.cu +8 -0
  91. warp/native/mat.h +14 -3
  92. warp/native/mathdx.cpp +59 -0
  93. warp/native/mesh.h +4 -0
  94. warp/native/range.h +13 -1
  95. warp/native/reduce.cpp +9 -1
  96. warp/native/reduce.cu +7 -0
  97. warp/native/runlength_encode.cpp +9 -1
  98. warp/native/runlength_encode.cu +7 -1
  99. warp/native/scan.cpp +8 -0
  100. warp/native/scan.cu +8 -0
  101. warp/native/scan.h +8 -1
  102. warp/native/sparse.cpp +8 -0
  103. warp/native/sparse.cu +8 -0
  104. warp/native/temp_buffer.h +7 -0
  105. warp/native/tile.h +1857 -0
  106. warp/native/tile_gemm.h +341 -0
  107. warp/native/tile_reduce.h +210 -0
  108. warp/native/volume_builder.cu +8 -0
  109. warp/native/volume_builder.h +8 -0
  110. warp/native/warp.cpp +10 -2
  111. warp/native/warp.cu +369 -15
  112. warp/native/warp.h +12 -2
  113. warp/optim/adam.py +39 -4
  114. warp/paddle.py +29 -12
  115. warp/render/render_opengl.py +137 -65
  116. warp/sim/graph_coloring.py +292 -0
  117. warp/sim/integrator_euler.py +4 -2
  118. warp/sim/integrator_featherstone.py +115 -44
  119. warp/sim/integrator_vbd.py +6 -0
  120. warp/sim/model.py +90 -17
  121. warp/stubs.py +651 -85
  122. warp/tape.py +12 -7
  123. warp/tests/assets/pixel.npy +0 -0
  124. warp/tests/aux_test_instancing_gc.py +18 -0
  125. warp/tests/test_array.py +207 -48
  126. warp/tests/test_closest_point_edge_edge.py +8 -8
  127. warp/tests/test_codegen.py +120 -1
  128. warp/tests/test_codegen_instancing.py +30 -0
  129. warp/tests/test_collision.py +110 -0
  130. warp/tests/test_coloring.py +241 -0
  131. warp/tests/test_context.py +34 -0
  132. warp/tests/test_examples.py +18 -4
  133. warp/tests/test_fabricarray.py +33 -0
  134. warp/tests/test_fem.py +453 -113
  135. warp/tests/test_func.py +48 -1
  136. warp/tests/test_generics.py +52 -0
  137. warp/tests/test_iter.py +68 -0
  138. warp/tests/test_mat_scalar_ops.py +1 -1
  139. warp/tests/test_mesh_query_point.py +5 -4
  140. warp/tests/test_module_hashing.py +23 -0
  141. warp/tests/test_paddle.py +27 -87
  142. warp/tests/test_print.py +191 -1
  143. warp/tests/test_spatial.py +1 -1
  144. warp/tests/test_tile.py +700 -0
  145. warp/tests/test_tile_mathdx.py +144 -0
  146. warp/tests/test_tile_mlp.py +383 -0
  147. warp/tests/test_tile_reduce.py +374 -0
  148. warp/tests/test_tile_shared_memory.py +190 -0
  149. warp/tests/test_vbd.py +12 -20
  150. warp/tests/test_volume.py +43 -0
  151. warp/tests/unittest_suites.py +23 -2
  152. warp/tests/unittest_utils.py +4 -0
  153. warp/types.py +339 -73
  154. warp/utils.py +22 -1
  155. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/METADATA +33 -7
  156. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/RECORD +159 -132
  157. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/WHEEL +1 -1
  158. warp/fem/field/test.py +0 -180
  159. warp/fem/field/trial.py +0 -183
  160. warp/fem/space/collocated_function_space.py +0 -102
  161. warp/fem/space/quadmesh_2d_function_space.py +0 -261
  162. warp/fem/space/trimesh_2d_function_space.py +0 -153
  163. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/LICENSE.md +0 -0
  164. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/top_level.txt +0 -0
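
The dominant theme of this release is the new tile programming model: the warp/native/tile.h, tile_gemm.h, and tile_reduce.h headers below back a set of cooperative, block-wide primitives, with Python-side demos under warp/examples/tile/. As a minimal sketch of how the model surfaces in Python — the wp.tile_load/wp.tile_store/wp.launch_tiled signatures are assumed from the new example files and may differ in detail:

import warp as wp

TILE_M, TILE_N = 8, 4    # tile shape (compile-time constants)
TILE_THREADS = 64        # threads cooperating on each tile

@wp.kernel
def tile_copy(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
    # one block of TILE_THREADS threads per output tile
    i, j = wp.tid()
    t = wp.tile_load(a, i, j, m=TILE_M, n=TILE_N)  # cooperative load
    wp.tile_store(b, i, j, t)                      # cooperative store

wp.init()
a = wp.ones((32, 32), dtype=float)
b = wp.zeros((32, 32), dtype=float)
wp.launch_tiled(tile_copy, dim=[32 // TILE_M, 32 // TILE_N], inputs=[a, b], block_dim=TILE_THREADS)

The first of the new native headers is shown below.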
warp/native/tile_gemm.h ADDED
@@ -0,0 +1,341 @@
+ /** Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+  * NVIDIA CORPORATION and its licensors retain all intellectual property
+  * and proprietary rights in and to this software, related documentation
+  * and any modifications thereto. Any use, reproduction, disclosure or
+  * distribution of this software and related documentation without an express
+  * license agreement from NVIDIA CORPORATION is strictly prohibited.
+  */
+
+ #pragma once
+
+ #include "builtin.h"
+
+ #define USE_CUTE 0
+
+ #if USE_CUTE
+ #include "cutlass/include/cute/tensor.hpp"
+ #include "cutlass/include/cute/algorithm/cooperative_gemm.hpp"
+ #endif // USE_CUTE
+
+ namespace wp
+ {
+
+ /*
+ // 2D tile zero
+ template <typename T, int M, int N, int Index>
+ inline CUDA_CALLABLE array_t<T> tile_zeros()
+ {
+     const int length = M*N;
+
+     WP_TILE_SHARED __align__(16) T data[length];
+
+     WP_PRAGMA_UNROLL
+     for (int t=threadIdx.x; t < length; t += blockDim.x)
+     {
+         data[t] = T(0.0);
+     }
+
+     return array_t<T>(data, M, N, nullptr);
+ }
+
+ // 2D tile load
+ template <typename T, int M, int N, int Index>
+ inline CUDA_CALLABLE array_t<T> tile_load(const array_t<T>& src, int i, int j)
+ {
+     const int length = M*N;
+
+     WP_TILE_SHARED __align__(16) T data[length];
+
+     //---------------
+     // naive-synchronous load
+     //
+     // WP_PRAGMA_UNROLL
+     // for (int t=threadIdx.x; t < length; t += blockDim.x)
+     // {
+     //     data[t] = index(src, i*M + t/N, j*N + t%N);
+     // }
+
+     //---------------
+     // async 128 bit loads (assumes row-major i.e.: stride 1 on y axis and 4-element alignment on dimension)
+     const int s = 4;
+
+     WP_PRAGMA_UNROLL
+     for (int t=threadIdx.x*s; t < length; t += blockDim.x*s)
+     {
+         __pipeline_memcpy_async(&data[t],
+                                 &index(src, i*M + t/N, j*N + t%N),
+                                 sizeof(T)*s);
+     }
+
+     __pipeline_commit();
+
+     return array_t<T>(data, M, N, nullptr);
+ }
+
+ // 2D tile store
+ template <typename T>
+ inline CUDA_CALLABLE void tile_store(array_t<T>& dest, int i, int j, const array_t<T>& src)
+ {
+     const int M = src.shape[0];
+     const int N = src.shape[1];
+
+     const int length = M*N;
+
+     // cooperatively store the tile, using a block-stride iterator
+     WP_PRAGMA_UNROLL
+     for (int t=threadIdx.x; t < length; t += blockDim.x)
+     {
+         index(dest, i*M + t/N, j*N + t%N) = src.data[t];
+     }
+ }
+ */
+
+ template <typename T>
+ inline CUDA_CALLABLE const T& index(const T* __restrict__ p, int i, int j, int stride)
+ {
+     return p[i*stride + j];
+ }
+
+ template <typename T>
+ inline CUDA_CALLABLE T& index(T* __restrict__ p, int i, int j, int stride)
+ {
+     return p[i*stride + j];
+ }
+
+ template <unsigned M, unsigned N, typename T>
+ struct partition_t
+ {
+     inline partition_t(array_t<T> A)
+     {
+         data = A;
+
+         // todo: do ceil div for non-multiples of M,N
+         shape[0] = A.shape[0]/M;
+         shape[1] = A.shape[1]/N;
+     }
+
+     // underlying data
+     array_t<T> data;
+
+     // partition dimensions
+     int shape[2];
+ };
+
+ template <unsigned M, unsigned N, typename T>
+ inline int partition_size(const partition_t<M, N, T>& tile)
+ {
+     return tile.shape[0]*tile.shape[1];
+ }
+
+ // returns the x, y coordinates of a tile given a linear index
+ template <unsigned M, unsigned N, typename T>
+ inline void partition_coord(const partition_t<M, N, T>& tile, const int t, int& i, int& j)
+ {
+     i = t/tile.shape[1];
+     j = t%tile.shape[1];
+ }
+
+ template <unsigned M, unsigned N, typename T>
+ inline mat_t<M, N, T> partition_load(const partition_t<M, N, T>& tile, int i, int j)
+ {
+     mat_t<M, N, T> out;
+
+     const int tile_i = i*M;
+     const int tile_j = j*N;
+
+     WP_PRAGMA_UNROLL
+     for (int i=0; i < M; ++i)
+     {
+         WP_PRAGMA_UNROLL
+         for (int j=0; j < N; ++j)
+         {
+             out.data[i][j] = index(tile.data, tile_i + i, tile_j + j);
+         }
+     }
+
+     return out;
+ }
+
+ template <unsigned M, unsigned N, typename T>
+ inline void partition_store(const partition_t<M, N, T>& tile, int i, int j, const mat_t<M, N, T>& value)
+ {
+     mat_t<M, N, T> out;
+
+     const int tile_i = M*i;
+     const int tile_j = N*j;
+
+     WP_PRAGMA_UNROLL
+     for (int i=0; i < M; ++i)
+     {
+         WP_PRAGMA_UNROLL
+         for (int j=0; j < N; ++j)
+         {
+             index(tile.data, tile_i + i, tile_j + j) = value.data[i][j];
+         }
+     }
+ }
+
+ #if !USE_CUTE
+
+ template <typename T>
+ inline CUDA_CALLABLE void gemm(const array_t<T>& A, const array_t<T>& B, const array_t<T>& out)
+ {
+     const int TILE_M = 4;
+     const int TILE_N = 4;
+     const int TILE_K = 4;
+
+     partition_t A_tile = partition_t<TILE_M, TILE_K, T>(A);
+     partition_t B_tile = partition_t<TILE_K, TILE_N, T>(B);
+     partition_t C_tile = partition_t<TILE_M, TILE_N, T>(out);
+
+     const int length = partition_size(C_tile);
+
+     __pipeline_wait_prior(0);
+
+     WP_TILE_SYNC();
+
+     for (int t=threadIdx.x; t < length; t += blockDim.x)
+     {
+         int i, j;
+         partition_coord(C_tile, t, i, j);
+
+         // accumulator
+         mat_t<TILE_M, TILE_N, T> sum = partition_load(C_tile, i, j);
+
+         WP_PRAGMA_UNROLL
+         for (int k=0; k < A_tile.shape[1]; k++)
+         {
+             const mat_t<TILE_M, TILE_K, T> a = partition_load(A_tile, i, k);
+             const mat_t<TILE_K, TILE_N, T> b = partition_load(B_tile, k, j);
+
+             sum += mul(a, b);
+         }
+
+         partition_store(C_tile, i, j, sum);
+     }
+
+     WP_TILE_SYNC();
+ }
+
+ // 2D gemm accumulate out += A*B
+ template <typename TileA, typename TileB, typename TileC>
+ inline CUDA_CALLABLE void tile_matmul_scalar(const TileA& A,
+                                              const TileB& B,
+                                              TileC& out)
+ {
+     const int length = tile_size(out);
+
+     WP_TILE_SYNC();
+
+     using T = typename TileA::Type;
+
+     WP_PRAGMA_UNROLL
+     for (int t=threadIdx.x; t < length; t += WP_TILE_BLOCK_DIM)
+     {
+         // compute output index
+         const int i = t/out.N;
+         const int j = t%out.N;
+
+         T sum(0.0);
+
+         WP_PRAGMA_UNROLL
+         for (int k=0; k < A.N; ++k)
+         {
+             T a = A(i,k);
+             T b = B(k,j);
+
+             sum += a*b; // todo: use fmaf()
+         }
+
+         out(i,j) += sum;
+     }
+
+     WP_TILE_SYNC();
+ }
+
+ #else
+
+ template <typename T>
+ inline CUDA_CALLABLE void tile_matmul(const array_t<T>& A, const array_t<T>& B, const array_t<T>& out)
+ {
+     using namespace cute;
+
+     __pipeline_wait_prior(0);
+
+     // ensure smem tile is ready
+     WP_TILE_SYNC();
+
+     // Define CTA matrix size (static)
+     auto bM = Int<64>{};
+     auto bN = Int<64>{};
+     auto bK = Int<8>{};
+
+     // Define the smem layouts (static)
+     auto sA = make_layout(make_shape(bM, bK), LayoutRight{});
+     auto sB = make_layout(make_shape(bN, bK));
+     auto sC = make_layout(make_shape(bM, bN), LayoutRight{});
+
+     Tensor s_a_tensor = make_tensor(make_smem_ptr<float>(A.data), sA);
+     Tensor s_b_tensor = make_tensor(make_smem_ptr<float>(B.data), sB);
+     Tensor s_c_tensor = make_tensor(make_smem_ptr<float>(out.data), sC);
+
+     // TiledMMA tiled_mma = make_tiled_mma(UniversalFMA<float,float,float>{},
+     //                                     Layout<Shape<_16,_8,_1>>{}); // 16x8x1 UniversalFMA, assumes blockDim=128
+
+     // TiledMMA tiled_mma = make_tiled_mma(UniversalFMA<float,float,float>{},
+     //                                     Layout<Shape<_8,_16>,Stride<_16,_1>>{}); // 8x16x1 UniversalFMA, assumes blockDim=128
+
+     TiledMMA tiled_mma = make_tiled_mma(UniversalFMA<float,float,float>{},
+                                         Layout<Shape<_2,_64>,Stride<_64,_1>>{}); // 8x16x1 UniversalFMA, assumes blockDim=128
+
+     cooperative_gemm<AutoVectorizingCopyWithAssumedAlignment<sizeof_bits_v<float>>,
+                      AutoVectorizingCopyWithAssumedAlignment<sizeof_bits_v<float>>,
+                      AutoVectorizingCopyWithAssumedAlignment<sizeof_bits_v<float>>
+     >(
+         threadIdx.x, tiled_mma,
+         1.0f, s_a_tensor, s_b_tensor, 1.0f, s_c_tensor,
+         cute::identity(), cute::identity(), cute::identity(), cute::identity()
+     );
+
+     WP_TILE_SYNC();
+ }
+
+ #endif // USE_CUTE
+
+ #if 0
+
+ template <typename TileA, typename TileB, typename TileC>
+ void tile_matmul(TileA& a, TileB& b, TileC& c)
+ {
+     static_assert(wp::is_same<typename TileA::Type, typename TileB::Type>::value, "Error, tile datatypes must match");
+     static_assert(TileA::N == TileB::M, "Error, inner dimensions must match");
+     static_assert(TileC::M == TileA::M, "Error, first output dimension must match");
+     static_assert(TileC::N == TileB::N, "Error, second output dimension must match");
+
+     tile_matmul_scalar(a, b, c);
+ }
+
+ template <typename TileA, typename TileB, typename TileC,
+           typename AdjTileA, typename AdjTileB, typename AdjTileC>
+ void adj_tile_matmul(TileA& a, TileB& b, TileC& c,
+                      AdjTileA& adj_a, AdjTileB& adj_b, AdjTileC& adj_c)
+ {
+     tile_matmul_scalar(adj_c, wp::tile_transpose(b), adj_a);
+     tile_matmul_scalar(wp::tile_transpose(a), adj_c, adj_b);
+ }
+
+ #endif // 0
+
+ } // namespace wp
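
tile_gemm.h implements the cooperative GEMM that backs the new wp.tile_matmul() builtin: each block owns one output tile and iterates over K-tiles, multiplying per-thread sub-matrices and accumulating into the output (tile_matmul_scalar above). A hedged Python-level sketch, adapted from the new warp/examples/tile/example_tile_matmul.py — exact signatures may differ from this release:

import warp as wp

TILE_M, TILE_N, TILE_K = 8, 4, 8
TILE_THREADS = 64

@wp.kernel
def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
    i, j = wp.tid()                                       # output tile coordinates
    sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
    count = int(A.shape[1] / TILE_K)                      # number of K-tiles
    for k in range(count):
        a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)
        b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)
        wp.tile_matmul(a, b, sum)                         # sum += a*b, block-cooperative
    wp.tile_store(C, i, j, sum)

wp.init()
M, N, K = 64, 64, 64
A = wp.ones((M, K), dtype=float)
B = wp.ones((K, N), dtype=float)
C = wp.zeros((M, N), dtype=float)
wp.launch_tiled(tile_gemm, dim=[M // TILE_M, N // TILE_N], inputs=[A, B, C], block_dim=TILE_THREADS)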
warp/native/tile_reduce.h ADDED
@@ -0,0 +1,210 @@
+ /** Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+  * NVIDIA CORPORATION and its licensors retain all intellectual property
+  * and proprietary rights in and to this software, related documentation
+  * and any modifications thereto. Any use, reproduction, disclosure or
+  * distribution of this software and related documentation without an express
+  * license agreement from NVIDIA CORPORATION is strictly prohibited.
+  */
+
+ #pragma once
+
+ #include "tile.h"
+
+ #define WP_TILE_WARP_SIZE 32
+
+ namespace wp
+ {
+
+ template <typename T>
+ inline CUDA_CALLABLE T warp_shuffle_down(T val, int offset, int mask)
+ {
+     typedef unsigned int Word;
+
+     union
+     {
+         T output;
+         Word output_storage;
+     };
+
+     union
+     {
+         T input;
+         Word input_storage;
+     };
+
+     input = val;
+
+     Word* dest = reinterpret_cast<Word*>(&output);
+     Word* src = reinterpret_cast<Word*>(&input);
+
+     unsigned int shuffle_word;
+
+     constexpr int word_count = (sizeof(T) + sizeof(Word) - 1) / sizeof(Word);
+
+     WP_PRAGMA_UNROLL
+     for (int i=0; i < word_count; ++i)
+     {
+         shuffle_word = __shfl_down_sync(mask, src[i], offset, WP_TILE_WARP_SIZE);
+         dest[i] = shuffle_word;
+     }
+
+     return output;
+ }
+
+ template <typename T, typename Op>
+ inline CUDA_CALLABLE T warp_reduce(T val, Op f, unsigned int mask)
+ {
+     T sum = val;
+
+     if (mask == 0xFFFFFFFF)
+     {
+         // handle case where entire warp is active
+         for (int offset=WP_TILE_WARP_SIZE/2; offset > 0; offset /= 2)
+         {
+             sum = f(sum, warp_shuffle_down(sum, offset, mask));
+         }
+     }
+     else
+     {
+         // handle partial warp case
+         for (int offset=WP_TILE_WARP_SIZE/2; offset > 0; offset /= 2)
+         {
+             T shfl_val = warp_shuffle_down(sum, offset, mask);
+             if ((mask & (1 << ((threadIdx.x + offset)%WP_TILE_WARP_SIZE))) != 0)
+                 sum = f(sum, shfl_val);
+         }
+     }
+
+     return sum;
+ }
+
+ // non-axis version which computes sum
+ // across the entire tile using the whole block
+ template <typename Tile, typename Op>
+ auto tile_reduce_impl(Op f, Tile& t)
+ {
+     using T = typename Tile::Type;
+
+     auto input = t.copy_to_register();
+     auto output = tile_register_t<T, 1, 1>();
+
+     const int warp_count = (WP_TILE_BLOCK_DIM + WP_TILE_WARP_SIZE - 1)/WP_TILE_WARP_SIZE;
+     const int warp_index = threadIdx.x/WP_TILE_WARP_SIZE;
+     const int lane_index = threadIdx.x%WP_TILE_WARP_SIZE;
+
+     T thread_sum = input.data[0];
+
+     // thread reduction
+     WP_PRAGMA_UNROLL
+     for (int i=1; i < input.NumRegs; ++i)
+     {
+         int linear = t.index(i);
+         if (!Tile::Aligned && linear >= Tile::Size)
+             break;
+
+         thread_sum = f(thread_sum, input.data[i]);
+     }
+
+     // ensure that only threads with at least one valid item participate in the reduction
+     unsigned int mask = __ballot_sync(__activemask(), t.index(0) < Tile::Size);
+
+     // warp reduction
+     T warp_sum = warp_reduce(thread_sum, f, mask);
+
+     // fixed size scratch pad for partial results in shared memory
+     WP_TILE_SHARED T partials[warp_count];
+
+     // count of active warps
+     WP_TILE_SHARED int active_warps;
+     if (threadIdx.x == 0)
+         active_warps = 0;
+
+     // ensure active_warps is initialized
+     WP_TILE_SYNC();
+
+     if (lane_index == 0)
+     {
+         partials[warp_index] = warp_sum;
+         atomicAdd(&active_warps, 1);
+     }
+
+     // ensure partials are ready
+     WP_TILE_SYNC();
+
+     // reduce across block, todo: use warp_reduce() here
+     if (threadIdx.x == 0)
+     {
+         T block_sum = partials[0];
+
+         WP_PRAGMA_UNROLL
+         for (int i=1; i < active_warps; ++i)
+             block_sum = f(block_sum, partials[i]);
+
+         output.data[0] = block_sum;
+     }
+
+     return output;
+ }
+
+ void adj_tile_reduce_impl()
+ {
+     // todo: general purpose reduction gradients not implemented
+ }
+
+ // entry point for Python code-gen, wraps op in a lambda to perform overload resolution
+ #define tile_reduce(op, t) tile_reduce_impl([](auto x, auto y) { return op(x, y);}, t)
+ #define adj_tile_reduce(op, a, adj_op, adj_a, adj_ret) adj_tile_reduce_impl()
+
+ // convenience methods for specific reductions
+
+ template <typename Tile>
+ auto tile_sum(Tile& t)
+ {
+     return tile_reduce(add, t);
+ }
+
+ // special case adjoint for summation
+ template <typename Tile, typename AdjTile>
+ void adj_tile_sum(Tile& t, Tile& adj_t, AdjTile& adj_ret)
+ {
+     using T = typename Tile::Type;
+
+     // broadcast incoming adjoint to block
+     WP_TILE_SHARED T scratch;
+     if (threadIdx.x == 0)
+         scratch = adj_ret.data[0];
+
+     WP_TILE_SYNC();
+
+     // broadcast scalar across input dimensions (note zero strides)
+     auto adj_ret_reg = tile_shared_t<T, Tile::M, Tile::N, 0, 0>(&scratch, NULL).copy_to_register();
+     adj_t.grad_add(adj_ret_reg);
+ }
+
+ template <typename Tile>
+ auto tile_max(Tile& t)
+ {
+     return tile_reduce(max, t);
+ }
+
+ template <typename Tile, typename AdjTile>
+ void adj_tile_max(Tile& t, Tile& adj_t, AdjTile& adj_ret)
+ {
+     // todo: not implemented
+ }
+
+ template <typename Tile>
+ auto tile_min(Tile& t)
+ {
+     return tile_reduce(min, t);
+ }
+
+ template <typename Tile, typename AdjTile>
+ void adj_tile_min(Tile& t, Tile& adj_t, AdjTile& adj_ret)
+ {
+     // todo: not implemented
+ }
+
+ } // namespace wp
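
tile_reduce.h implements a three-level reduction — per-thread accumulation over register fragments, a warp-level __shfl_down_sync() combine, then a shared-memory pass over per-warp partials — surfaced in Python as wp.tile_sum(), wp.tile_max(), and wp.tile_min(). A hedged sketch of the Python side, with signatures assumed from the new tile examples in this release:

import warp as wp

TILE_SIZE = 256     # row length, loaded as one tile
TILE_THREADS = 64   # threads cooperating on each reduction

@wp.kernel
def row_sum(a: wp.array2d(dtype=float), out: wp.array(dtype=float)):
    i = wp.tid()                            # one block per row
    t = wp.tile_load(a[i], 0, TILE_SIZE)    # cooperative load of row i
    s = wp.tile_sum(t)                      # block-wide reduction to a 1x1 tile
    wp.tile_store(out, i, s)

wp.init()
a = wp.ones((128, TILE_SIZE), dtype=float)
out = wp.zeros(128, dtype=float)
wp.launch_tiled(row_sum, dim=[128], inputs=[a, out], block_dim=TILE_THREADS)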
warp/native/volume_builder.cu CHANGED
@@ -1,3 +1,11 @@
+ /** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
+  * NVIDIA CORPORATION and its licensors retain all intellectual property
+  * and proprietary rights in and to this software, related documentation
+  * and any modifications thereto. Any use, reproduction, disclosure or
+  * distribution of this software and related documentation without an express
+  * license agreement from NVIDIA CORPORATION is strictly prohibited.
+  */
+
  #include "volume_builder.h"

  #include <nanovdb/tools/cuda/PointsToGrid.cuh>
warp/native/volume_builder.h CHANGED
@@ -1,3 +1,11 @@
+ /** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
+  * NVIDIA CORPORATION and its licensors retain all intellectual property
+  * and proprietary rights in and to this software, related documentation
+  * and any modifications thereto. Any use, reproduction, disclosure or
+  * distribution of this software and related documentation without an express
+  * license agreement from NVIDIA CORPORATION is strictly prohibited.
+  */
+
  #pragma once

  #include <nanovdb/NanoVDB.h>
warp/native/warp.cpp CHANGED
@@ -147,6 +147,11 @@ int is_cutlass_enabled()
      return int(WP_ENABLE_CUTLASS);
  }

+ int is_mathdx_enabled()
+ {
+     return int(WP_ENABLE_MATHDX);
+ }
+
  int is_debug_enabled()
  {
      return int(WP_ENABLE_DEBUG);
@@ -1033,12 +1038,15 @@ WP_API bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret
  WP_API bool cuda_graph_launch(void* graph, void* stream) { return false; }
  WP_API bool cuda_graph_destroy(void* context, void* graph) { return false; }

- WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_file) { return 0; }
+ WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes) { return 0; }

  WP_API void* cuda_load_module(void* context, const char* ptx) { return NULL; }
  WP_API void cuda_unload_module(void* context, void* module) {}
  WP_API void* cuda_get_kernel(void* context, void* module, const char* name) { return NULL; }
- WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, void** args, void* stream) { return 0; }
+ WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, int block_dim, int shared_memory_bytes, void** args, void* stream) { return 0; }
+
+ WP_API int cuda_get_max_shared_memory(void* context) { return 0; }
+ WP_API bool cuda_configure_kernel_shared_memory(void* kernel, int size) { return false; }

  WP_API void cuda_set_context_restore_policy(bool always_restore) {}
  WP_API int cuda_get_context_restore_policy() { return false; }
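
Note the extended launch ABI in these CPU stubs: cuda_launch_kernel() now takes an explicit block_dim and shared_memory_bytes, and the new cuda_get_max_shared_memory()/cuda_configure_kernel_shared_memory() entry points let tile kernels size their shared-memory scratch. On the Python side this appears to surface as a per-launch block dimension; a minimal sketch, assuming wp.launch() accepts block_dim in this release:

import warp as wp

@wp.kernel
def saxpy(x: wp.array(dtype=float), y: wp.array(dtype=float), alpha: float):
    i = wp.tid()
    y[i] = alpha * x[i] + y[i]

wp.init()
x = wp.ones(1024, dtype=float)
y = wp.zeros(1024, dtype=float)

# block_dim is forwarded to the new cuda_launch_kernel() parameter
wp.launch(saxpy, dim=1024, inputs=[x, y, 2.0], block_dim=128)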