warp-lang 1.4.2__py3-none-macosx_10_13_universal2.whl → 1.5.0__py3-none-macosx_10_13_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warp/__init__.py +4 -0
- warp/autograd.py +43 -8
- warp/bin/libwarp-clang.dylib +0 -0
- warp/bin/libwarp.dylib +0 -0
- warp/build.py +21 -2
- warp/build_dll.py +23 -6
- warp/builtins.py +1783 -2
- warp/codegen.py +177 -45
- warp/config.py +2 -2
- warp/context.py +321 -73
- warp/examples/assets/pixel.jpg +0 -0
- warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
- warp/examples/benchmarks/benchmark_gemm.py +121 -0
- warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
- warp/examples/benchmarks/benchmark_tile.py +179 -0
- warp/examples/fem/example_adaptive_grid.py +37 -10
- warp/examples/fem/example_apic_fluid.py +3 -2
- warp/examples/fem/example_convection_diffusion_dg.py +4 -5
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_diffusion_3d.py +47 -4
- warp/examples/fem/example_distortion_energy.py +220 -0
- warp/examples/fem/example_magnetostatics.py +127 -85
- warp/examples/fem/example_nonconforming_contact.py +5 -5
- warp/examples/fem/example_stokes.py +3 -1
- warp/examples/fem/example_streamlines.py +12 -19
- warp/examples/fem/utils.py +38 -15
- warp/examples/sim/example_cloth.py +2 -25
- warp/examples/sim/example_quadruped.py +2 -1
- warp/examples/tile/example_tile_convolution.py +58 -0
- warp/examples/tile/example_tile_fft.py +47 -0
- warp/examples/tile/example_tile_filtering.py +105 -0
- warp/examples/tile/example_tile_matmul.py +79 -0
- warp/examples/tile/example_tile_mlp.py +375 -0
- warp/fem/__init__.py +8 -0
- warp/fem/cache.py +16 -12
- warp/fem/dirichlet.py +1 -1
- warp/fem/domain.py +44 -1
- warp/fem/field/__init__.py +1 -2
- warp/fem/field/field.py +31 -19
- warp/fem/field/nodal_field.py +101 -49
- warp/fem/field/virtual.py +794 -0
- warp/fem/geometry/__init__.py +2 -2
- warp/fem/geometry/deformed_geometry.py +3 -105
- warp/fem/geometry/element.py +13 -0
- warp/fem/geometry/geometry.py +165 -5
- warp/fem/geometry/grid_2d.py +3 -6
- warp/fem/geometry/grid_3d.py +31 -28
- warp/fem/geometry/hexmesh.py +3 -46
- warp/fem/geometry/nanogrid.py +3 -2
- warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
- warp/fem/geometry/tetmesh.py +2 -43
- warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
- warp/fem/integrate.py +683 -261
- warp/fem/linalg.py +404 -0
- warp/fem/operator.py +101 -18
- warp/fem/polynomial.py +5 -5
- warp/fem/quadrature/quadrature.py +45 -21
- warp/fem/space/__init__.py +45 -11
- warp/fem/space/basis_function_space.py +451 -0
- warp/fem/space/basis_space.py +58 -11
- warp/fem/space/function_space.py +146 -5
- warp/fem/space/grid_2d_function_space.py +80 -66
- warp/fem/space/grid_3d_function_space.py +113 -68
- warp/fem/space/hexmesh_function_space.py +96 -108
- warp/fem/space/nanogrid_function_space.py +62 -110
- warp/fem/space/quadmesh_function_space.py +208 -0
- warp/fem/space/shape/__init__.py +45 -7
- warp/fem/space/shape/cube_shape_function.py +328 -54
- warp/fem/space/shape/shape_function.py +10 -1
- warp/fem/space/shape/square_shape_function.py +328 -60
- warp/fem/space/shape/tet_shape_function.py +269 -19
- warp/fem/space/shape/triangle_shape_function.py +238 -19
- warp/fem/space/tetmesh_function_space.py +69 -37
- warp/fem/space/topology.py +38 -0
- warp/fem/space/trimesh_function_space.py +179 -0
- warp/fem/utils.py +6 -331
- warp/jax_experimental.py +3 -1
- warp/native/array.h +15 -0
- warp/native/builtin.h +66 -26
- warp/native/bvh.h +4 -0
- warp/native/coloring.cpp +600 -0
- warp/native/cuda_util.cpp +14 -0
- warp/native/cuda_util.h +2 -1
- warp/native/fabric.h +8 -0
- warp/native/hashgrid.h +4 -0
- warp/native/marching.cu +8 -0
- warp/native/mat.h +14 -3
- warp/native/mathdx.cpp +59 -0
- warp/native/mesh.h +4 -0
- warp/native/range.h +13 -1
- warp/native/reduce.cpp +9 -1
- warp/native/reduce.cu +7 -0
- warp/native/runlength_encode.cpp +9 -1
- warp/native/runlength_encode.cu +7 -1
- warp/native/scan.cpp +8 -0
- warp/native/scan.cu +8 -0
- warp/native/scan.h +8 -1
- warp/native/sparse.cpp +8 -0
- warp/native/sparse.cu +8 -0
- warp/native/temp_buffer.h +7 -0
- warp/native/tile.h +1857 -0
- warp/native/tile_gemm.h +341 -0
- warp/native/tile_reduce.h +210 -0
- warp/native/volume_builder.cu +8 -0
- warp/native/volume_builder.h +8 -0
- warp/native/warp.cpp +10 -2
- warp/native/warp.cu +369 -15
- warp/native/warp.h +12 -2
- warp/optim/adam.py +39 -4
- warp/paddle.py +29 -12
- warp/render/render_opengl.py +137 -65
- warp/sim/graph_coloring.py +292 -0
- warp/sim/integrator_euler.py +4 -2
- warp/sim/integrator_featherstone.py +115 -44
- warp/sim/integrator_vbd.py +6 -0
- warp/sim/model.py +88 -15
- warp/stubs.py +569 -4
- warp/tape.py +12 -7
- warp/tests/assets/pixel.npy +0 -0
- warp/tests/aux_test_instancing_gc.py +18 -0
- warp/tests/test_array.py +39 -0
- warp/tests/test_codegen.py +81 -1
- warp/tests/test_codegen_instancing.py +30 -0
- warp/tests/test_collision.py +110 -0
- warp/tests/test_coloring.py +241 -0
- warp/tests/test_context.py +34 -0
- warp/tests/test_examples.py +18 -4
- warp/tests/test_fem.py +453 -113
- warp/tests/test_func.py +13 -0
- warp/tests/test_generics.py +52 -0
- warp/tests/test_iter.py +68 -0
- warp/tests/test_mat_scalar_ops.py +1 -1
- warp/tests/test_mesh_query_point.py +1 -1
- warp/tests/test_module_hashing.py +23 -0
- warp/tests/test_paddle.py +27 -87
- warp/tests/test_print.py +56 -1
- warp/tests/test_spatial.py +1 -1
- warp/tests/test_tile.py +700 -0
- warp/tests/test_tile_mathdx.py +144 -0
- warp/tests/test_tile_mlp.py +383 -0
- warp/tests/test_tile_reduce.py +374 -0
- warp/tests/test_tile_shared_memory.py +190 -0
- warp/tests/test_vbd.py +12 -20
- warp/tests/test_volume.py +43 -0
- warp/tests/unittest_suites.py +19 -2
- warp/tests/unittest_utils.py +4 -0
- warp/types.py +338 -72
- warp/utils.py +22 -1
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/METADATA +33 -7
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/RECORD +153 -126
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/WHEEL +1 -1
- warp/fem/field/test.py +0 -180
- warp/fem/field/trial.py +0 -183
- warp/fem/space/collocated_function_space.py +0 -102
- warp/fem/space/quadmesh_2d_function_space.py +0 -261
- warp/fem/space/trimesh_2d_function_space.py +0 -153
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/top_level.txt +0 -0
warp/native/tile_gemm.h
ADDED
@@ -0,0 +1,341 @@
+/** Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+#pragma once
+
+#include "builtin.h"
+
+#define USE_CUTE 0
+
+#if USE_CUTE
+#include "cutlass/include/cute/tensor.hpp"
+#include "cutlass/include/cute/algorithm/cooperative_gemm.hpp"
+#endif // USE_CUTE
+
+namespace wp
+{
+
+/*
+// 2D tile zero
+template <typename T, int M, int N, int Index>
+inline CUDA_CALLABLE array_t<T> tile_zeros()
+{
+    const int length = M*N;
+
+    WP_TILE_SHARED __align__(16) T data[length];
+
+    WP_PRAGMA_UNROLL
+    for (int t=threadIdx.x; t < length; t += blockDim.x)
+    {
+        data[t] = T(0.0);
+    }
+
+    return array_t<T>(data, M, N, nullptr);
+}
+
+// 2D tile load
+template <typename T, int M, int N, int Index>
+inline CUDA_CALLABLE array_t<T> tile_load(const array_t<T>& src, int i, int j)
+{
+    const int length = M*N;
+
+    WP_TILE_SHARED __align__(16) T data[length];
+
+    //---------------
+    // naive-synchronous load
+    //
+    // WP_PRAGMA_UNROLL
+    // for (int t=threadIdx.x; t < length; t += blockDim.x)
+    // {
+    //     data[t] = index(src, i*M + t/N, j*N + t%N);
+    // }
+
+    //---------------
+    // async 128 bit loads (assumes row-major i.e.: stride 1 on y axis and 4-element alignment on dimension)
+    const int s = 4;
+
+    WP_PRAGMA_UNROLL
+    for (int t=threadIdx.x*s; t < length; t += blockDim.x*s)
+    {
+        __pipeline_memcpy_async(&data[t],
+                                &index(src, i*M + t/N, j*N + t%N),
+                                sizeof(T)*s);
+    }
+
+    __pipeline_commit();
+
+
+    return array_t<T>(data, M, N, nullptr);
+}
+
+// 2D tile store
+template <typename T>
+inline CUDA_CALLABLE void tile_store(array_t<T>& dest, int i, int j, const array_t<T>& src)
+{
+    const int M = src.shape[0];
+    const int N = src.shape[1];
+
+    const int length = M*N;
+
+    // cooperatively store the tile, using a block-stride iterator
+    WP_PRAGMA_UNROLL
+    for (int t=threadIdx.x; t < length; t += blockDim.x)
+    {
+        index(dest, i*M + t/N, j*N + t%N) = src.data[t];
+    }
+}
+*/
+
+template <typename T>
+inline CUDA_CALLABLE const T& index(const T* __restrict__ p, int i, int j, int stride)
+{
+    return p[i*stride + j];
+}
+
+template <typename T>
+inline CUDA_CALLABLE T& index(T* __restrict__ p, int i, int j, int stride)
+{
+    return p[i*stride + j];
+}
+
+template <unsigned M, unsigned N, typename T>
+struct partition_t
+{
+    inline partition_t(array_t<T> A)
+    {
+        data = A;
+
+        // todo: do ceil div for non-multiples of M,N
+        shape[0] = A.shape[0]/M;
+        shape[1] = A.shape[1]/N;
+    }
+
+    // underlying data
+    array_t<T> data;
+
+    // partition dimensions
+    int shape[2];
+};
+
+template <unsigned M, unsigned N, typename T>
+inline int partition_size(const partition_t<M, N, T>& tile)
+{
+    return tile.shape[0]*tile.shape[1];
+}
+
+// returns the x, y coordinates of a tile given a linear index
+template <unsigned M, unsigned N, typename T>
+inline void partition_coord(const partition_t<M, N, T>& tile, const int t, int& i, int& j)
+{
+    i = t/tile.shape[1];
+    j = t%tile.shape[1];
+}
+
+template <unsigned M, unsigned N, typename T>
+inline mat_t<M, N, T> partition_load(const partition_t<M, N, T>& tile, int i, int j)
+{
+    mat_t<M, N, T> out;
+
+    const int tile_i = i*M;
+    const int tile_j = j*N;
+
+    WP_PRAGMA_UNROLL
+    for (int i=0; i < M; ++i)
+    {
+        WP_PRAGMA_UNROLL
+        for (int j=0; j < N; ++j)
+        {
+            out.data[i][j] = index(tile.data, tile_i + i, tile_j + j);
+        }
+    }
+
+    return out;
+}
+
+template <unsigned M, unsigned N, typename T>
+inline void partition_store(const partition_t<M, N, T>& tile, int i, int j, const mat_t<M, N, T>& value)
+{
+    mat_t<M, N, T> out;
+
+    const int tile_i = M*i;
+    const int tile_j = N*j;
+
+    WP_PRAGMA_UNROLL
+    for (int i=0; i < M; ++i)
+    {
+        WP_PRAGMA_UNROLL
+        for (int j=0; j < N; ++j)
+        {
+            index(tile.data, tile_i + i, tile_j + j) = value.data[i][j];
+        }
+    }
+}
+
+
+#if !USE_CUTE
+
+template <typename T>
+inline CUDA_CALLABLE void gemm(const array_t<T>& A, const array_t<T>& B, const array_t<T>& out)
+{
+    const int TILE_M = 4;
+    const int TILE_N = 4;
+    const int TILE_K = 4;
+
+    partition_t A_tile = partition_t<TILE_M, TILE_K, T>(A);
+    partition_t B_tile = partition_t<TILE_K, TILE_N, T>(B);
+    partition_t C_tile = partition_t<TILE_M, TILE_N, T>(out);
+
+    const int length = partition_size(C_tile);
+
+    __pipeline_wait_prior(0);
+
+    WP_TILE_SYNC();
+
+    for (int t=threadIdx.x; t < length; t += blockDim.x)
+    {
+        int i, j;
+        partition_coord(C_tile, t, i, j);
+
+        // accumulator
+        mat_t<TILE_M, TILE_N, T> sum = partition_load(C_tile, i, j);
+
+        WP_PRAGMA_UNROLL
+        for (int k=0; k < A_tile.shape[1]; k++)
+        {
+            const mat_t<TILE_M, TILE_K, T> a = partition_load(A_tile, i, k);
+            const mat_t<TILE_K, TILE_N, T> b = partition_load(B_tile, k, j);
+
+            sum += mul(a, b);
+        }
+
+        partition_store(C_tile, i, j, sum);
+    }
+
+    WP_TILE_SYNC();
+}
+
+
+// 2D gemm accumulate out += A*B
+template <typename TileA, typename TileB, typename TileC>
+inline CUDA_CALLABLE void tile_matmul_scalar(const TileA& A,
+                                             const TileB& B,
+                                             TileC& out)
+{
+    const int length = tile_size(out);
+
+    WP_TILE_SYNC();
+
+    using T = typename TileA::Type;
+
+    WP_PRAGMA_UNROLL
+    for (int t=threadIdx.x; t < length; t += WP_TILE_BLOCK_DIM)
+    {
+        // compute output index
+        const int i = t/out.N;
+        const int j = t%out.N;
+
+        T sum(0.0);
+
+        WP_PRAGMA_UNROLL
+        for (int k=0; k < A.N; ++k)
+        {
+            T a = A(i,k);
+            T b = B(k,j);
+
+            sum += a*b; // todo: use fmaf()
+        }
+
+        out(i,j) += sum;
+    }
+
+    WP_TILE_SYNC();
+}
+
+#else
+
+
+template <typename T>
+inline CUDA_CALLABLE void tile_matmul(const array_t<T>& A, const array_t<T>& B, const array_t<T>& out)
+{
+    using namespace cute;
+
+    __pipeline_wait_prior(0);
+
+    // ensure smem tile is ready
+    WP_TILE_SYNC();
+
+    // Define CTA matrix size (static)
+    auto bM = Int<64>{};
+    auto bN = Int<64>{};
+    auto bK = Int<8>{};
+
+    // Define the smem layouts (static)
+    auto sA = make_layout(make_shape(bM, bK), LayoutRight{});
+    auto sB = make_layout(make_shape(bN, bK));
+    auto sC = make_layout(make_shape(bM, bN), LayoutRight{});
+
+    Tensor s_a_tensor = make_tensor(make_smem_ptr<float>(A.data), sA);
+    Tensor s_b_tensor = make_tensor(make_smem_ptr<float>(B.data), sB);
+    Tensor s_c_tensor = make_tensor(make_smem_ptr<float>(out.data), sC);
+
+
+    // TiledMMA tiled_mma = make_tiled_mma(UniversalFMA<float,float,float>{},
+    //                                     Layout<Shape<_16,_8,_1>>{}); // 16x8x1 UniversalFMA, assumes blockDim=128
+
+
+    // TiledMMA tiled_mma = make_tiled_mma(UniversalFMA<float,float,float>{},
+    //                                     Layout<Shape<_8,_16>,Stride<_16,_1>>{}); // 8x16x1 UniversalFMA, assumes blockDim=128
+
+
+
+    TiledMMA tiled_mma = make_tiled_mma(UniversalFMA<float,float,float>{},
+                                        Layout<Shape<_2,_64>,Stride<_64,_1>>{}); // 8x16x1 UniversalFMA, assumes blockDim=128
+
+
+    cooperative_gemm<AutoVectorizingCopyWithAssumedAlignment<sizeof_bits_v<float>>,
+                     AutoVectorizingCopyWithAssumedAlignment<sizeof_bits_v<float>>,
+                     AutoVectorizingCopyWithAssumedAlignment<sizeof_bits_v<float>>
+                    >(
+        threadIdx.x, tiled_mma,
+        1.0f, s_a_tensor, s_b_tensor, 1.0f, s_c_tensor,
+        cute::identity(), cute::identity(), cute::identity(), cute::identity()
+    );
+
+    WP_TILE_SYNC();
+
+}
+
+#endif // USE_CUTE
+
+
+#if 0
+
+template <typename TileA, typename TileB, typename TileC>
+void tile_matmul(TileA& a, TileB& b, TileC& c)
+{
+    static_assert(wp::is_same<typename TileA::Type, typename TileB::Type>::value, "Error, tile datatypes must match");
+    static_assert(TileA::N == TileB::M, "Error, inner dimensions must match");
+    static_assert(TileC::M == TileA::M, "Error, first output dimension must match");
+    static_assert(TileC::N == TileB::N, "Error, second output dimension must match");
+
+    tile_matmul_scalar(a, b, c);
+}
+
+
+template <typename TileA, typename TileB, typename TileC,
+          typename AdjTileA, typename AdjTileB, typename AdjTileC>
+void adj_tile_matmul(TileA& a, TileB& b, TileC& c,
+                     AdjTileA& adj_a, AdjTileB& adj_b, AdjTileC& adj_c)
+{
+    tile_matmul_scalar(adj_c, wp::tile_transpose(b), adj_a);
+    tile_matmul_scalar(wp::tile_transpose(a), adj_c, adj_b);
+}
+
+#endif // 0
+
+} // namespace wp
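
Note: tile_gemm.h backs the cooperative tile primitives that 1.5.0 exposes to Python kernels. As an illustrative sketch only (not part of this diff; the wp.tile_* builtins and wp.launch_tiled are per the 1.5.0 release notes, and exact keyword signatures may differ), a block-tiled GEMM looks like:

import numpy as np
import warp as wp

TILE_M, TILE_N, TILE_K = 32, 32, 8

@wp.kernel
def gemm_tiled(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
    # each block cooperatively computes one TILE_M x TILE_N tile of C
    i, j = wp.tid()

    acc = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=float)

    count = int(A.shape[1] / TILE_K)
    for k in range(count):
        a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)  # cooperative load into shared memory
        b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)
        wp.tile_matmul(a, b, acc)                      # acc += a*b, all threads participate

    wp.tile_store(C, i, j, acc)

M, K, N = 128, 64, 128
A = wp.array(np.random.rand(M, K), dtype=float)
B = wp.array(np.random.rand(K, N), dtype=float)
C = wp.zeros((M, N), dtype=float)

# one block of 64 threads cooperates on each output tile
wp.launch_tiled(gemm_tiled, dim=[M // TILE_M, N // TILE_N], inputs=[A, B, C], block_dim=64)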
warp/native/tile_reduce.h
ADDED
@@ -0,0 +1,210 @@
+/** Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+#pragma once
+
+#include "tile.h"
+
+#define WP_TILE_WARP_SIZE 32
+
+namespace wp
+{
+
+template <typename T>
+inline CUDA_CALLABLE T warp_shuffle_down(T val, int offset, int mask)
+{
+    typedef unsigned int Word;
+
+    union
+    {
+        T output;
+        Word output_storage;
+    };
+
+    union
+    {
+        T input;
+        Word input_storage;
+    };
+
+    input = val;
+
+    Word* dest = reinterpret_cast<Word*>(&output);
+    Word* src = reinterpret_cast<Word*>(&input);
+
+    unsigned int shuffle_word;
+
+    constexpr int word_count = (sizeof(T) + sizeof(Word) - 1) / sizeof(Word);
+
+    WP_PRAGMA_UNROLL
+    for (int i=0; i < word_count; ++i)
+    {
+        shuffle_word = __shfl_down_sync(mask, src[i], offset, WP_TILE_WARP_SIZE);
+        dest[i] = shuffle_word;
+    }
+
+    return output;
+}
+
+template <typename T, typename Op>
+inline CUDA_CALLABLE T warp_reduce(T val, Op f, unsigned int mask)
+{
+    T sum = val;
+
+    if (mask == 0xFFFFFFFF)
+    {
+        // handle case where entire warp is active
+        for (int offset=WP_TILE_WARP_SIZE/2; offset > 0; offset /= 2)
+        {
+            sum = f(sum, warp_shuffle_down(sum, offset, mask));
+        }
+    }
+    else
+    {
+        // handle partial warp case
+        for (int offset=WP_TILE_WARP_SIZE/2; offset > 0; offset /= 2)
+        {
+            T shfl_val = warp_shuffle_down(sum, offset, mask);
+            if ((mask & (1 << ((threadIdx.x + offset)%WP_TILE_WARP_SIZE))) != 0)
+                sum = f(sum, shfl_val);
+        }
+    }
+
+    return sum;
+}
+
+// non-axis version which computes sum
+// across the entire tile using the whole block
+template <typename Tile, typename Op>
+auto tile_reduce_impl(Op f, Tile& t)
+{
+    using T = typename Tile::Type;
+
+    auto input = t.copy_to_register();
+    auto output = tile_register_t<T, 1, 1>();
+
+    const int warp_count = (WP_TILE_BLOCK_DIM + WP_TILE_WARP_SIZE - 1)/WP_TILE_WARP_SIZE;
+    const int warp_index = threadIdx.x/WP_TILE_WARP_SIZE;
+    const int lane_index = threadIdx.x%WP_TILE_WARP_SIZE;
+
+    T thread_sum = input.data[0];
+
+    // thread reduction
+    WP_PRAGMA_UNROLL
+    for (int i=1; i < input.NumRegs; ++i)
+    {
+        int linear = t.index(i);
+        if (!Tile::Aligned && linear >= Tile::Size)
+            break;
+
+        thread_sum = f(thread_sum, input.data[i]);
+    }
+
+    // ensure that only threads with at least one valid item participate in the reduction
+    unsigned int mask = __ballot_sync(__activemask(), t.index(0) < Tile::Size);
+
+    // warp reduction
+    T warp_sum = warp_reduce(thread_sum, f, mask);
+
+    // fixed size scratch pad for partial results in shared memory
+    WP_TILE_SHARED T partials[warp_count];
+
+    // count of active warps
+    WP_TILE_SHARED int active_warps;
+    if (threadIdx.x == 0)
+        active_warps = 0;
+
+    // ensure active_warps is initialized
+    WP_TILE_SYNC();
+
+    if (lane_index == 0)
+    {
+        partials[warp_index] = warp_sum;
+        atomicAdd(&active_warps, 1);
+    }
+
+    // ensure partials are ready
+    WP_TILE_SYNC();
+
+    // reduce across block, todo: use warp_reduce() here
+    if (threadIdx.x == 0)
+    {
+        T block_sum = partials[0];
+
+        WP_PRAGMA_UNROLL
+        for (int i=1; i < active_warps; ++i)
+            block_sum = f(block_sum, partials[i]);
+
+        output.data[0] = block_sum;
+    }
+
+    return output;
+}
+
+void adj_tile_reduce_impl()
+{
+    // todo: general purpose reduction gradients not implemented
+}
+
+// entry point for Python code-gen, wraps op in a lambda to perform overload resolution
+#define tile_reduce(op, t) tile_reduce_impl([](auto x, auto y) { return op(x, y);}, t)
+#define adj_tile_reduce(op, a, adj_op, adj_a, adj_ret) adj_tile_reduce_impl()
+
+// convenience methods for specific reductions
+
+template <typename Tile>
+auto tile_sum(Tile& t)
+{
+    return tile_reduce(add, t);
+}
+
+// special case adjoint for summation
+template <typename Tile, typename AdjTile>
+void adj_tile_sum(Tile& t, Tile& adj_t, AdjTile& adj_ret)
+{
+    using T = typename Tile::Type;
+
+    // broadcast incoming adjoint to block
+    WP_TILE_SHARED T scratch;
+    if (threadIdx.x == 0)
+        scratch = adj_ret.data[0];
+
+    WP_TILE_SYNC();
+
+    // broadcast scalar across input dimensions (note zero strides)
+    auto adj_ret_reg = tile_shared_t<T, Tile::M, Tile::N, 0, 0>(&scratch, NULL).copy_to_register();
+    adj_t.grad_add(adj_ret_reg);
+}
+
+template <typename Tile>
+auto tile_max(Tile& t)
+{
+    return tile_reduce(max, t);
+}
+
+template <typename Tile, typename AdjTile>
+void adj_tile_max(Tile& t, Tile& adj_t, AdjTile& adj_ret)
+{
+    // todo: not implemented
+}
+
+template <typename Tile>
+auto tile_min(Tile& t)
+{
+    return tile_reduce(min, t);
+}
+
+template <typename Tile, typename AdjTile>
+void adj_tile_min(Tile& t, Tile& adj_t, AdjTile& adj_ret)
+{
+    // todo: not implemented
+}
+
+
+
+} // namespace wp
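
Note: tile_reduce.h implements the shuffle-based warp/block reduction behind the new wp.tile_sum(), wp.tile_min(), and wp.tile_max() builtins. A minimal usage sketch (again illustrative only, assuming the 1.5.0 tile API; exact signatures may differ):

import numpy as np
import warp as wp

TILE_M, TILE_N = 8, 16

@wp.kernel
def batch_sum(a: wp.array3d(dtype=float), out: wp.array(dtype=float)):
    # one block per batch entry; all threads cooperate on the reduction
    i = wp.tid()
    t = wp.tile_load(a[i], 0, 0, m=TILE_M, n=TILE_N)
    s = wp.tile_sum(t)  # 1x1 tile holding the block-wide sum
    wp.tile_store(out, i, s)

batch = 4
a = wp.array(np.ones((batch, TILE_M, TILE_N), dtype=np.float32))
out = wp.zeros(batch, dtype=float)

wp.launch_tiled(batch_sum, dim=[batch], inputs=[a, out], block_dim=64)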
warp/native/volume_builder.cu
CHANGED
@@ -1,3 +1,11 @@
+/** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
 #include "volume_builder.h"
 
 #include <nanovdb/tools/cuda/PointsToGrid.cuh>
warp/native/volume_builder.h
CHANGED
@@ -1,3 +1,11 @@
+/** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
 #pragma once
 
 #include <nanovdb/NanoVDB.h>
warp/native/warp.cpp
CHANGED
@@ -147,6 +147,11 @@ int is_cutlass_enabled()
     return int(WP_ENABLE_CUTLASS);
 }
 
+int is_mathdx_enabled()
+{
+    return int(WP_ENABLE_MATHDX);
+}
+
 int is_debug_enabled()
 {
     return int(WP_ENABLE_DEBUG);
@@ -1033,12 +1038,15 @@ WP_API bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret
 WP_API bool cuda_graph_launch(void* graph, void* stream) { return false; }
 WP_API bool cuda_graph_destroy(void* context, void* graph) { return false; }
 
-WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, bool debug, bool verbose, bool verify_fp, bool fast_math, const char*
+WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes) { return 0; }
 
 WP_API void* cuda_load_module(void* context, const char* ptx) { return NULL; }
 WP_API void cuda_unload_module(void* context, void* module) {}
 WP_API void* cuda_get_kernel(void* context, void* module, const char* name) { return NULL; }
-WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, void** args, void* stream) { return 0; }
+WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, int block_dim, int shared_memory_bytes, void** args, void* stream) { return 0; }
+
+WP_API int cuda_get_max_shared_memory(void* context) { return 0; }
+WP_API bool cuda_configure_kernel_shared_memory(void* kernel, int size) { return false; }
 
 WP_API void cuda_set_context_restore_policy(bool always_restore) {}
 WP_API int cuda_get_context_restore_policy() { return false; }