warp-lang 1.8.0__py3-none-manylinux_2_34_aarch64.whl → 1.9.0__py3-none-manylinux_2_34_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +282 -103
- warp/__init__.pyi +482 -110
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +93 -30
- warp/build_dll.py +48 -63
- warp/builtins.py +955 -137
- warp/codegen.py +327 -209
- warp/config.py +1 -1
- warp/context.py +1363 -800
- warp/examples/core/example_marching_cubes.py +1 -0
- warp/examples/core/example_render_opengl.py +100 -3
- warp/examples/fem/example_apic_fluid.py +98 -52
- warp/examples/fem/example_convection_diffusion_dg.py +25 -4
- warp/examples/fem/example_diffusion_mgpu.py +8 -3
- warp/examples/fem/utils.py +68 -22
- warp/examples/interop/example_jax_callable.py +34 -4
- warp/examples/interop/example_jax_kernel.py +27 -1
- warp/fabric.py +1 -1
- warp/fem/cache.py +27 -19
- warp/fem/domain.py +2 -2
- warp/fem/field/nodal_field.py +2 -2
- warp/fem/field/virtual.py +266 -166
- warp/fem/geometry/geometry.py +5 -5
- warp/fem/integrate.py +200 -91
- warp/fem/space/restriction.py +4 -0
- warp/fem/space/shape/tet_shape_function.py +3 -10
- warp/jax_experimental/custom_call.py +1 -1
- warp/jax_experimental/ffi.py +203 -54
- warp/marching_cubes.py +708 -0
- warp/native/array.h +103 -8
- warp/native/builtin.h +90 -9
- warp/native/bvh.cpp +64 -28
- warp/native/bvh.cu +58 -58
- warp/native/bvh.h +2 -2
- warp/native/clang/clang.cpp +7 -7
- warp/native/coloring.cpp +13 -3
- warp/native/crt.cpp +2 -2
- warp/native/crt.h +3 -5
- warp/native/cuda_util.cpp +42 -11
- warp/native/cuda_util.h +10 -4
- warp/native/exports.h +1842 -1908
- warp/native/fabric.h +2 -1
- warp/native/hashgrid.cpp +37 -37
- warp/native/hashgrid.cu +2 -2
- warp/native/initializer_array.h +1 -1
- warp/native/intersect.h +4 -4
- warp/native/mat.h +1913 -119
- warp/native/mathdx.cpp +43 -43
- warp/native/mesh.cpp +24 -24
- warp/native/mesh.cu +26 -26
- warp/native/mesh.h +5 -3
- warp/native/nanovdb/GridHandle.h +179 -12
- warp/native/nanovdb/HostBuffer.h +8 -7
- warp/native/nanovdb/NanoVDB.h +517 -895
- warp/native/nanovdb/NodeManager.h +323 -0
- warp/native/nanovdb/PNanoVDB.h +2 -2
- warp/native/quat.h +337 -16
- warp/native/rand.h +7 -7
- warp/native/range.h +7 -1
- warp/native/reduce.cpp +10 -10
- warp/native/reduce.cu +13 -14
- warp/native/runlength_encode.cpp +2 -2
- warp/native/runlength_encode.cu +5 -5
- warp/native/scan.cpp +3 -3
- warp/native/scan.cu +4 -4
- warp/native/sort.cpp +10 -10
- warp/native/sort.cu +22 -22
- warp/native/sparse.cpp +8 -8
- warp/native/sparse.cu +14 -14
- warp/native/spatial.h +366 -17
- warp/native/svd.h +23 -8
- warp/native/temp_buffer.h +2 -2
- warp/native/tile.h +303 -70
- warp/native/tile_radix_sort.h +5 -1
- warp/native/tile_reduce.h +16 -25
- warp/native/tuple.h +2 -2
- warp/native/vec.h +385 -18
- warp/native/volume.cpp +54 -54
- warp/native/volume.cu +1 -1
- warp/native/volume.h +2 -1
- warp/native/volume_builder.cu +30 -37
- warp/native/warp.cpp +150 -149
- warp/native/warp.cu +337 -193
- warp/native/warp.h +227 -226
- warp/optim/linear.py +736 -271
- warp/render/imgui_manager.py +289 -0
- warp/render/render_opengl.py +137 -57
- warp/render/render_usd.py +0 -1
- warp/sim/collide.py +1 -2
- warp/sim/graph_coloring.py +2 -2
- warp/sim/integrator_vbd.py +10 -2
- warp/sparse.py +559 -176
- warp/tape.py +2 -0
- warp/tests/aux_test_module_aot.py +7 -0
- warp/tests/cuda/test_async.py +3 -3
- warp/tests/cuda/test_conditional_captures.py +101 -0
- warp/tests/geometry/test_marching_cubes.py +233 -12
- warp/tests/sim/test_cloth.py +89 -6
- warp/tests/sim/test_coloring.py +82 -7
- warp/tests/test_array.py +56 -5
- warp/tests/test_assert.py +53 -0
- warp/tests/test_atomic_cas.py +127 -114
- warp/tests/test_codegen.py +3 -2
- warp/tests/test_context.py +8 -15
- warp/tests/test_enum.py +136 -0
- warp/tests/test_examples.py +2 -2
- warp/tests/test_fem.py +45 -2
- warp/tests/test_fixedarray.py +229 -0
- warp/tests/test_func.py +18 -15
- warp/tests/test_future_annotations.py +7 -5
- warp/tests/test_linear_solvers.py +30 -0
- warp/tests/test_map.py +1 -1
- warp/tests/test_mat.py +1540 -378
- warp/tests/test_mat_assign_copy.py +178 -0
- warp/tests/test_mat_constructors.py +574 -0
- warp/tests/test_module_aot.py +287 -0
- warp/tests/test_print.py +69 -0
- warp/tests/test_quat.py +162 -34
- warp/tests/test_quat_assign_copy.py +145 -0
- warp/tests/test_reload.py +2 -1
- warp/tests/test_sparse.py +103 -0
- warp/tests/test_spatial.py +140 -34
- warp/tests/test_spatial_assign_copy.py +160 -0
- warp/tests/test_static.py +48 -0
- warp/tests/test_struct.py +43 -3
- warp/tests/test_tape.py +38 -0
- warp/tests/test_types.py +0 -20
- warp/tests/test_vec.py +216 -441
- warp/tests/test_vec_assign_copy.py +143 -0
- warp/tests/test_vec_constructors.py +325 -0
- warp/tests/tile/test_tile.py +206 -152
- warp/tests/tile/test_tile_cholesky.py +605 -0
- warp/tests/tile/test_tile_load.py +169 -0
- warp/tests/tile/test_tile_mathdx.py +2 -558
- warp/tests/tile/test_tile_matmul.py +179 -0
- warp/tests/tile/test_tile_mlp.py +1 -1
- warp/tests/tile/test_tile_reduce.py +100 -11
- warp/tests/tile/test_tile_shared_memory.py +16 -16
- warp/tests/tile/test_tile_sort.py +59 -55
- warp/tests/unittest_suites.py +16 -0
- warp/tests/walkthrough_debug.py +1 -1
- warp/thirdparty/unittest_parallel.py +108 -9
- warp/types.py +554 -264
- warp/utils.py +68 -86
- {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
- {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/RECORD +150 -138
- warp/native/marching.cpp +0 -19
- warp/native/marching.cu +0 -514
- warp/native/marching.h +0 -19
- {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
warp/native/reduce.cu
CHANGED
|
@@ -22,7 +22,6 @@
|
|
|
22
22
|
|
|
23
23
|
#define THRUST_IGNORE_CUB_VERSION_CHECK
|
|
24
24
|
#include <cub/device/device_reduce.cuh>
|
|
25
|
-
#include <cub/iterator/counting_input_iterator.cuh>
|
|
26
25
|
|
|
27
26
|
namespace
|
|
28
27
|
{
|
|
@@ -119,14 +118,14 @@ template <typename T> void array_sum_device(const T *ptr_a, T *ptr_out, int coun
|
|
|
119
118
|
assert((byte_stride % sizeof(T)) == 0);
|
|
120
119
|
const int stride = byte_stride / sizeof(T);
|
|
121
120
|
|
|
122
|
-
ContextGuard guard(
|
|
123
|
-
cudaStream_t stream = static_cast<cudaStream_t>(
|
|
121
|
+
ContextGuard guard(wp_cuda_context_get_current());
|
|
122
|
+
cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
|
|
124
123
|
|
|
125
124
|
cub_strided_iterator<const T> ptr_strided{ptr_a, stride};
|
|
126
125
|
|
|
127
126
|
size_t buff_size = 0;
|
|
128
127
|
check_cuda(cub::DeviceReduce::Sum(nullptr, buff_size, ptr_strided, ptr_out, count, stream));
|
|
129
|
-
void* temp_buffer =
|
|
128
|
+
void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, buff_size);
|
|
130
129
|
|
|
131
130
|
for (int k = 0; k < type_length; ++k)
|
|
132
131
|
{
|
|
@@ -134,7 +133,7 @@ template <typename T> void array_sum_device(const T *ptr_a, T *ptr_out, int coun
|
|
|
134
133
|
check_cuda(cub::DeviceReduce::Sum(temp_buffer, buff_size, ptr_strided, ptr_out + k, count, stream));
|
|
135
134
|
}
|
|
136
135
|
|
|
137
|
-
|
|
136
|
+
wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
|
|
138
137
|
}
|
|
139
138
|
|
|
140
139
|
template <typename T>
|
|
@@ -280,18 +279,18 @@ void array_inner_device(const ElemT *ptr_a, const ElemT *ptr_b, ScalarT *ptr_out
|
|
|
280
279
|
const int stride_a = byte_stride_a / sizeof(ElemT);
|
|
281
280
|
const int stride_b = byte_stride_b / sizeof(ElemT);
|
|
282
281
|
|
|
283
|
-
ContextGuard guard(
|
|
284
|
-
cudaStream_t stream = static_cast<cudaStream_t>(
|
|
282
|
+
ContextGuard guard(wp_cuda_context_get_current());
|
|
283
|
+
cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
|
|
285
284
|
|
|
286
285
|
cub_inner_product_iterator<ElemT, ScalarT> inner_iterator{ptr_a, ptr_b, stride_a, stride_b, type_length};
|
|
287
286
|
|
|
288
287
|
size_t buff_size = 0;
|
|
289
288
|
check_cuda(cub::DeviceReduce::Sum(nullptr, buff_size, inner_iterator, ptr_out, count, stream));
|
|
290
|
-
void* temp_buffer =
|
|
289
|
+
void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, buff_size);
|
|
291
290
|
|
|
292
291
|
check_cuda(cub::DeviceReduce::Sum(temp_buffer, buff_size, inner_iterator, ptr_out, count, stream));
|
|
293
292
|
|
|
294
|
-
|
|
293
|
+
wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
|
|
295
294
|
}
|
|
296
295
|
|
|
297
296
|
template <typename T>
|
|
@@ -327,10 +326,10 @@ void array_inner_device_dispatch(const T *ptr_a, const T *ptr_b, T *ptr_out, int
|
|
|
327
326
|
|
|
328
327
|
} // anonymous namespace
|
|
329
328
|
|
|
330
|
-
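void array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,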
|
|
329
|
+
void wp_array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
|
|
331
330
|
int type_len)
|
|
332
331
|
{
|
|
333
|
-
void *context =
|
|
332
|
+
void *context = wp_cuda_context_get_current();
|
|
334
333
|
|
|
335
334
|
const float *ptr_a = (const float *)(a);
|
|
336
335
|
const float *ptr_b = (const float *)(b);
|
|
@@ -339,7 +338,7 @@ void array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, i
|
|
|
339
338
|
array_inner_device_dispatch(ptr_a, ptr_b, ptr_out, count, byte_stride_a, byte_stride_b, type_len);
|
|
340
339
|
}
|
|
341
340
|
|
|
342
|
-
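void array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,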
|
|
341
|
+
void wp_array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
|
|
343
342
|
int type_len)
|
|
344
343
|
{
|
|
345
344
|
const double *ptr_a = (const double *)(a);
|
|
@@ -349,14 +348,14 @@ void array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count,
|
|
|
349
348
|
array_inner_device_dispatch(ptr_a, ptr_b, ptr_out, count, byte_stride_a, byte_stride_b, type_len);
|
|
350
349
|
}
|
|
351
350
|
|
|
352
|
-
void
|
|
351
|
+
void wp_array_sum_float_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
|
|
353
352
|
{
|
|
354
353
|
const float *ptr_a = (const float *)(a);
|
|
355
354
|
float *ptr_out = (float *)(out);
|
|
356
355
|
array_sum_device_dispatch(ptr_a, ptr_out, count, byte_stride, type_length);
|
|
357
356
|
}
|
|
358
357
|
|
|
359
|
-
void
|
|
358
|
+
void wp_array_sum_double_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
|
|
360
359
|
{
|
|
361
360
|
const double *ptr_a = (const double *)(a);
|
|
362
361
|
double *ptr_out = (double *)(out);
|
warp/native/runlength_encode.cpp
CHANGED
|
@@ -53,7 +53,7 @@ void runlength_encode_host(int n,
|
|
|
53
53
|
}
|
|
54
54
|
}
|
|
55
55
|
|
|
56
|
-
void runlength_encode_int_host(
|
|
56
|
+
void wp_runlength_encode_int_host(
|
|
57
57
|
uint64_t values,
|
|
58
58
|
uint64_t run_values,
|
|
59
59
|
uint64_t run_lengths,
|
|
@@ -68,7 +68,7 @@ void runlength_encode_int_host(
|
|
|
68
68
|
}
|
|
69
69
|
|
|
70
70
|
#if !WP_ENABLE_CUDA
|
|
71
|
-
void
|
|
71
|
+
void wp_runlength_encode_int_device(
|
|
72
72
|
uint64_t values,
|
|
73
73
|
uint64_t run_values,
|
|
74
74
|
uint64_t run_lengths,
|
warp/native/runlength_encode.cu
CHANGED
|
@@ -28,24 +28,24 @@ void runlength_encode_device(int n,
|
|
|
28
28
|
int *run_lengths,
|
|
29
29
|
int *run_count)
|
|
30
30
|
{
|
|
31
|
-
ContextGuard guard(
|
|
32
|
-
cudaStream_t stream = static_cast<cudaStream_t>(
|
|
31
|
+
ContextGuard guard(wp_cuda_context_get_current());
|
|
32
|
+
cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
|
|
33
33
|
|
|
34
34
|
size_t buff_size = 0;
|
|
35
35
|
check_cuda(cub::DeviceRunLengthEncode::Encode(
|
|
36
36
|
nullptr, buff_size, values, run_values, run_lengths, run_count,
|
|
37
37
|
n, stream));
|
|
38
38
|
|
|
39
|
-
void* temp_buffer =
|
|
39
|
+
void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, buff_size);
|
|
40
40
|
|
|
41
41
|
check_cuda(cub::DeviceRunLengthEncode::Encode(
|
|
42
42
|
temp_buffer, buff_size, values, run_values, run_lengths, run_count,
|
|
43
43
|
n, stream));
|
|
44
44
|
|
|
45
|
-
|
|
45
|
+
wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
|
|
46
46
|
}
|
|
47
47
|
|
|
48
|
-
void
|
|
48
|
+
void wp_runlength_encode_int_device(
|
|
49
49
|
uint64_t values,
|
|
50
50
|
uint64_t run_values,
|
|
51
51
|
uint64_t run_lengths,
|
warp/native/scan.cpp
CHANGED
|
@@ -28,8 +28,8 @@ void scan_host(const T* values_in, T* values_out, int n, bool inclusive)
|
|
|
28
28
|
// compute temporary memory required
|
|
29
29
|
if (!inclusive && n > scan_temp_max_size)
|
|
30
30
|
{
|
|
31
|
-
|
|
32
|
-
scan_temp_memory =
|
|
31
|
+
wp_free_host(scan_temp_memory);
|
|
32
|
+
scan_temp_memory = wp_alloc_host(sizeof(T) * n);
|
|
33
33
|
scan_temp_max_size = n;
|
|
34
34
|
}
|
|
35
35
|
|
|
@@ -39,7 +39,7 @@ void scan_host(const T* values_in, T* values_out, int n, bool inclusive)
|
|
|
39
39
|
std::partial_sum(values_in, values_in + n, result);
|
|
40
40
|
if (!inclusive) {
|
|
41
41
|
values_out[0] = (T)0;
|
|
42
|
-
|
|
42
|
+
wp_memcpy_h2h(values_out + 1, result, sizeof(T) * (n - 1));
|
|
43
43
|
}
|
|
44
44
|
}
|
|
45
45
|
|
warp/native/scan.cu
CHANGED
|
@@ -25,9 +25,9 @@
|
|
|
25
25
|
template<typename T>
|
|
26
26
|
void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
|
|
27
27
|
{
|
|
28
|
-
ContextGuard guard(
|
|
28
|
+
ContextGuard guard(wp_cuda_context_get_current());
|
|
29
29
|
|
|
30
|
-
cudaStream_t stream = static_cast<cudaStream_t>(
|
|
30
|
+
cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
|
|
31
31
|
|
|
32
32
|
// compute temporary memory required
|
|
33
33
|
size_t scan_temp_size;
|
|
@@ -37,7 +37,7 @@ void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
|
|
|
37
37
|
check_cuda(cub::DeviceScan::ExclusiveSum(NULL, scan_temp_size, values_in, values_out, n));
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
-
void* temp_buffer =
|
|
40
|
+
void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, scan_temp_size);
|
|
41
41
|
|
|
42
42
|
// scan
|
|
43
43
|
if (inclusive) {
|
|
@@ -46,7 +46,7 @@ void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
|
|
|
46
46
|
check_cuda(cub::DeviceScan::ExclusiveSum(temp_buffer, scan_temp_size, values_in, values_out, n, stream));
|
|
47
47
|
}
|
|
48
48
|
|
|
49
|
-
|
|
49
|
+
wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
|
|
50
50
|
}
|
|
51
51
|
|
|
52
52
|
template void scan_device(const int*, int*, int, bool);
|
warp/native/sort.cpp
CHANGED
|
@@ -198,41 +198,41 @@ void segmented_sort_pairs_host(int* keys, int* values, int n, int* segment_start
|
|
|
198
198
|
|
|
199
199
|
void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out) {}
|
|
200
200
|
|
|
201
|
-
void
|
|
201
|
+
void wp_radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n) {}
|
|
202
202
|
|
|
203
|
-
void
|
|
203
|
+
void wp_radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n) {}
|
|
204
204
|
|
|
205
|
-
void
|
|
205
|
+
void wp_radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n) {}
|
|
206
206
|
|
|
207
|
-
void
|
|
207
|
+
void wp_segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
|
|
208
208
|
|
|
209
|
-
void
|
|
209
|
+
void wp_segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
|
|
210
210
|
|
|
211
211
|
#endif // !WP_ENABLE_CUDA
|
|
212
212
|
|
|
213
213
|
|
|
214
|
-
void
|
|
214
|
+
void wp_radix_sort_pairs_int_host(uint64_t keys, uint64_t values, int n)
|
|
215
215
|
{
|
|
216
216
|
radix_sort_pairs_host(
|
|
217
217
|
reinterpret_cast<int *>(keys),
|
|
218
218
|
reinterpret_cast<int *>(values), n);
|
|
219
219
|
}
|
|
220
220
|
|
|
221
|
-
void
|
|
221
|
+
void wp_radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n)
|
|
222
222
|
{
|
|
223
223
|
radix_sort_pairs_host(
|
|
224
224
|
reinterpret_cast<int64_t *>(keys),
|
|
225
225
|
reinterpret_cast<int *>(values), n);
|
|
226
226
|
}
|
|
227
227
|
|
|
228
|
-
void
|
|
228
|
+
void wp_radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n)
|
|
229
229
|
{
|
|
230
230
|
radix_sort_pairs_host(
|
|
231
231
|
reinterpret_cast<float *>(keys),
|
|
232
232
|
reinterpret_cast<int *>(values), n);
|
|
233
233
|
}
|
|
234
234
|
|
|
235
|
-
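void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)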
|
|
235
|
+
void wp_segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
|
|
236
236
|
{
|
|
237
237
|
segmented_sort_pairs_host(
|
|
238
238
|
reinterpret_cast<float *>(keys),
|
|
@@ -241,7 +241,7 @@ void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint
|
|
|
241
241
|
reinterpret_cast<int *>(segment_end_indices), num_segments);
|
|
242
242
|
}
|
|
243
243
|
|
|
244
|
-
void
|
|
244
|
+
void wp_segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
|
|
245
245
|
{
|
|
246
246
|
segmented_sort_pairs_host(
|
|
247
247
|
reinterpret_cast<int *>(keys),
|
warp/native/sort.cu
CHANGED
|
@@ -52,17 +52,17 @@ void radix_sort_reserve_internal(void* context, int n, void** mem_out, size_t* s
|
|
|
52
52
|
d_keys,
|
|
53
53
|
d_values,
|
|
54
54
|
n, 0, sizeof(KeyType)*8,
|
|
55
|
-
(cudaStream_t)
|
|
55
|
+
(cudaStream_t)wp_cuda_stream_get_current()));
|
|
56
56
|
|
|
57
57
|
if (!context)
|
|
58
|
-
context =
|
|
58
|
+
context = wp_cuda_context_get_current();
|
|
59
59
|
|
|
60
60
|
RadixSortTemp& temp = g_radix_sort_temp_map[context];
|
|
61
61
|
|
|
62
62
|
if (sort_temp_size > temp.size)
|
|
63
63
|
{
|
|
64
|
-
|
|
65
|
-
temp.mem =
|
|
64
|
+
wp_free_device(WP_CURRENT_CONTEXT, temp.mem);
|
|
65
|
+
temp.mem = wp_alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
|
|
66
66
|
temp.size = sort_temp_size;
|
|
67
67
|
}
|
|
68
68
|
|
|
@@ -95,13 +95,13 @@ void radix_sort_pairs_device(void* context, KeyType* keys, int* values, int n)
|
|
|
95
95
|
d_keys,
|
|
96
96
|
d_values,
|
|
97
97
|
n, 0, sizeof(KeyType)*8,
|
|
98
|
-
(cudaStream_t)
|
|
98
|
+
(cudaStream_t)wp_cuda_stream_get_current()));
|
|
99
99
|
|
|
100
100
|
if (d_keys.Current() != keys)
|
|
101
|
-
|
|
101
|
+
wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(KeyType)*n);
|
|
102
102
|
|
|
103
103
|
if (d_values.Current() != values)
|
|
104
|
-
|
|
104
|
+
wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
|
|
105
105
|
}
|
|
106
106
|
|
|
107
107
|
void radix_sort_pairs_device(void* context, int* keys, int* values, int n)
|
|
@@ -119,7 +119,7 @@ void radix_sort_pairs_device(void* context, int64_t* keys, int* values, int n)
|
|
|
119
119
|
radix_sort_pairs_device<int64_t>(context, keys, values, n);
|
|
120
120
|
}
|
|
121
121
|
|
|
122
|
-
void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
|
|
122
|
+
void wp_radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
|
|
123
123
|
{
|
|
124
124
|
radix_sort_pairs_device(
|
|
125
125
|
WP_CURRENT_CONTEXT,
|
|
@@ -127,7 +127,7 @@ void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
|
|
|
127
127
|
reinterpret_cast<int *>(values), n);
|
|
128
128
|
}
|
|
129
129
|
|
|
130
|
-
void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
|
|
130
|
+
void wp_radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
|
|
131
131
|
{
|
|
132
132
|
radix_sort_pairs_device(
|
|
133
133
|
WP_CURRENT_CONTEXT,
|
|
@@ -135,7 +135,7 @@ void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
|
|
|
135
135
|
reinterpret_cast<int *>(values), n);
|
|
136
136
|
}
|
|
137
137
|
|
|
138
|
-
void
|
|
138
|
+
void wp_radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n)
|
|
139
139
|
{
|
|
140
140
|
radix_sort_pairs_device(
|
|
141
141
|
WP_CURRENT_CONTEXT,
|
|
@@ -166,17 +166,17 @@ void segmented_sort_reserve(void* context, int n, int num_segments, void** mem_o
|
|
|
166
166
|
end_indices,
|
|
167
167
|
0,
|
|
168
168
|
32,
|
|
169
|
-
(cudaStream_t)
|
|
169
|
+
(cudaStream_t)wp_cuda_stream_get_current()));
|
|
170
170
|
|
|
171
171
|
if (!context)
|
|
172
|
-
context =
|
|
172
|
+
context = wp_cuda_context_get_current();
|
|
173
173
|
|
|
174
174
|
RadixSortTemp& temp = g_radix_sort_temp_map[context];
|
|
175
175
|
|
|
176
176
|
if (sort_temp_size > temp.size)
|
|
177
177
|
{
|
|
178
|
-
|
|
179
|
-
temp.mem =
|
|
178
|
+
wp_free_device(WP_CURRENT_CONTEXT, temp.mem);
|
|
179
|
+
temp.mem = wp_alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
|
|
180
180
|
temp.size = sort_temp_size;
|
|
181
181
|
}
|
|
182
182
|
|
|
@@ -211,16 +211,16 @@ void segmented_sort_pairs_device(void* context, float* keys, int* values, int n,
|
|
|
211
211
|
segment_end_indices,
|
|
212
212
|
0,
|
|
213
213
|
32,
|
|
214
|
-
(cudaStream_t)
|
|
214
|
+
(cudaStream_t)wp_cuda_stream_get_current()));
|
|
215
215
|
|
|
216
216
|
if (d_keys.Current() != keys)
|
|
217
|
-
|
|
217
|
+
wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
|
|
218
218
|
|
|
219
219
|
if (d_values.Current() != values)
|
|
220
|
-
|
|
220
|
+
wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
|
|
221
221
|
}
|
|
222
222
|
|
|
223
|
-
void
|
|
223
|
+
void wp_segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
|
|
224
224
|
{
|
|
225
225
|
segmented_sort_pairs_device(
|
|
226
226
|
WP_CURRENT_CONTEXT,
|
|
@@ -256,16 +256,16 @@ void segmented_sort_pairs_device(void* context, int* keys, int* values, int n, i
|
|
|
256
256
|
segment_end_indices,
|
|
257
257
|
0,
|
|
258
258
|
32,
|
|
259
|
-
(cudaStream_t)
|
|
259
|
+
(cudaStream_t)wp_cuda_stream_get_current()));
|
|
260
260
|
|
|
261
261
|
if (d_keys.Current() != keys)
|
|
262
|
-
|
|
262
|
+
wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
|
|
263
263
|
|
|
264
264
|
if (d_values.Current() != values)
|
|
265
|
-
|
|
265
|
+
wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
|
|
266
266
|
}
|
|
267
267
|
|
|
268
|
-
void
|
|
268
|
+
void wp_segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
|
|
269
269
|
{
|
|
270
270
|
segmented_sort_pairs_device(
|
|
271
271
|
WP_CURRENT_CONTEXT,
|
warp/native/sparse.cpp
CHANGED
|
@@ -36,7 +36,7 @@ template <typename T> bool bsr_block_is_zero(int block_idx, int block_size, cons
|
|
|
36
36
|
} // namespace
|
|
37
37
|
|
|
38
38
|
|
|
39
|
-
WP_API void bsr_matrix_from_triplets_host(
|
|
39
|
+
WP_API void wp_bsr_matrix_from_triplets_host(
|
|
40
40
|
int block_size,
|
|
41
41
|
int scalar_size_in_bytes,
|
|
42
42
|
int row_count,
|
|
@@ -64,8 +64,8 @@ WP_API void bsr_matrix_from_triplets_host(
|
|
|
64
64
|
bool return_summed_blocks = tpl_block_offsets != nullptr && tpl_block_indices != nullptr;
|
|
65
65
|
if (!return_summed_blocks)
|
|
66
66
|
{
|
|
67
|
-
tpl_block_offsets = static_cast<int*>(
|
|
68
|
-
tpl_block_indices = static_cast<int*>(
|
|
67
|
+
tpl_block_offsets = static_cast<int*>(wp_alloc_host(size_t(nnz) * sizeof(int)));
|
|
68
|
+
tpl_block_indices = static_cast<int*>(wp_alloc_host(size_t(nnz) * sizeof(int)));
|
|
69
69
|
}
|
|
70
70
|
|
|
71
71
|
std::iota(tpl_block_indices, tpl_block_indices + nnz, 0);
|
|
@@ -156,8 +156,8 @@ WP_API void bsr_matrix_from_triplets_host(
|
|
|
156
156
|
if(!return_summed_blocks)
|
|
157
157
|
{
|
|
158
158
|
// free our temporary buffers
|
|
159
|
-
|
|
160
|
-
|
|
159
|
+
wp_free_host(tpl_block_offsets);
|
|
160
|
+
wp_free_host(tpl_block_indices);
|
|
161
161
|
}
|
|
162
162
|
|
|
163
163
|
if (bsr_nnz != nullptr)
|
|
@@ -166,7 +166,7 @@ WP_API void bsr_matrix_from_triplets_host(
|
|
|
166
166
|
}
|
|
167
167
|
}
|
|
168
168
|
|
|
169
|
-
WP_API void bsr_transpose_host(
|
|
169
|
+
WP_API void wp_bsr_transpose_host(
|
|
170
170
|
int row_count, int col_count, int nnz,
|
|
171
171
|
const int* bsr_offsets, const int* bsr_columns,
|
|
172
172
|
int* transposed_bsr_offsets,
|
|
@@ -209,7 +209,7 @@ WP_API void bsr_transpose_host(
|
|
|
209
209
|
}
|
|
210
210
|
|
|
211
211
|
#if !WP_ENABLE_CUDA
|
|
212
|
-
WP_API void bsr_matrix_from_triplets_device(
|
|
212
|
+
WP_API void wp_bsr_matrix_from_triplets_device(
|
|
213
213
|
int block_size,
|
|
214
214
|
int scalar_size_in_bytes,
|
|
215
215
|
int row_count,
|
|
@@ -229,7 +229,7 @@ WP_API void bsr_matrix_from_triplets_device(
|
|
|
229
229
|
void* bsr_nnz_event) {}
|
|
230
230
|
|
|
231
231
|
|
|
232
|
-
WP_API void
|
|
232
|
+
WP_API void wp_bsr_transpose_device(
|
|
233
233
|
int row_count, int col_count, int nnz,
|
|
234
234
|
const int* bsr_offsets, const int* bsr_columns,
|
|
235
235
|
int* transposed_bsr_offsets,
|
warp/native/sparse.cu
CHANGED
|
@@ -50,7 +50,7 @@ template <typename T> struct BsrBlockIsNotZero
|
|
|
50
50
|
T zero_mask;
|
|
51
51
|
|
|
52
52
|
BsrBlockIsNotZero(int block_size, const void* values, const uint64_t zero_mask)
|
|
53
|
-
: block_size(block_size), values(static_cast<const T*>(values)), zero_mask(static_cast<
|
|
53
|
+
: block_size(block_size), values(static_cast<const T*>(values)), zero_mask(static_cast<T>(zero_mask))
|
|
54
54
|
{}
|
|
55
55
|
|
|
56
56
|
CUDA_CALLABLE_DEVICE bool operator()(int block) const
|
|
@@ -256,7 +256,7 @@ __global__ void bsr_transpose_fill_row_col(const int nnz_upper_bound, const int
|
|
|
256
256
|
} // namespace
|
|
257
257
|
|
|
258
258
|
|
|
259
|
-
WP_API void bsr_matrix_from_triplets_device(
|
|
259
|
+
WP_API void wp_bsr_matrix_from_triplets_device(
|
|
260
260
|
const int block_size,
|
|
261
261
|
int scalar_size,
|
|
262
262
|
const int row_count,
|
|
@@ -274,13 +274,13 @@ WP_API void bsr_matrix_from_triplets_device(
|
|
|
274
274
|
int* bsr_columns,
|
|
275
275
|
int* bsr_nnz, void* bsr_nnz_event)
|
|
276
276
|
{
|
|
277
|
-
void* context =
|
|
277
|
+
void* context = wp_cuda_context_get_current();
|
|
278
278
|
ContextGuard guard(context);
|
|
279
279
|
|
|
280
280
|
// Per-context cached temporary buffers
|
|
281
281
|
// BsrFromTripletsTemp& bsr_temp = g_bsr_from_triplets_temp_map[context];
|
|
282
282
|
|
|
283
|
-
cudaStream_t stream = static_cast<cudaStream_t>(
|
|
283
|
+
cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
|
|
284
284
|
|
|
285
285
|
ScopedTemporary<BsrRowCol> combined_row_col(context, 2 * size_t(nnz));
|
|
286
286
|
ScopedTemporary<int> unique_triplet_count(context, 1);
|
|
@@ -289,8 +289,8 @@ WP_API void bsr_matrix_from_triplets_device(
|
|
|
289
289
|
if(!return_summed_blocks)
|
|
290
290
|
{
|
|
291
291
|
// if not provided, allocate temporary offset and indices buffers
|
|
292
|
-
tpl_block_offsets = static_cast<int*>(
|
|
293
|
-
tpl_block_indices = static_cast<int*>(
|
|
292
|
+
tpl_block_offsets = static_cast<int*>(wp_alloc_device(context, size_t(nnz) * sizeof(int)));
|
|
293
|
+
tpl_block_indices = static_cast<int*>(wp_alloc_device(context, size_t(nnz) * sizeof(int)));
|
|
294
294
|
}
|
|
295
295
|
|
|
296
296
|
|
|
@@ -334,7 +334,7 @@ WP_API void bsr_matrix_from_triplets_device(
|
|
|
334
334
|
// Ensures the sorted keys are available in summed_block_indices if needed
|
|
335
335
|
if(return_summed_blocks && d_keys.Current() != tpl_block_indices)
|
|
336
336
|
{
|
|
337
|
-
check_cuda(
|
|
337
|
+
check_cuda(cudaMemcpyAsync(tpl_block_indices, d_keys.Current(), nnz * sizeof(int), cudaMemcpyDeviceToDevice, stream));
|
|
338
338
|
}
|
|
339
339
|
}
|
|
340
340
|
|
|
@@ -357,11 +357,11 @@ WP_API void bsr_matrix_from_triplets_device(
|
|
|
357
357
|
{
|
|
358
358
|
// Copy nnz to host, and record an event for the completed transfer if desired
|
|
359
359
|
|
|
360
|
-
|
|
360
|
+
wp_memcpy_d2h(WP_CURRENT_CONTEXT, bsr_nnz, bsr_offsets + row_count, sizeof(int), stream);
|
|
361
361
|
|
|
362
362
|
if (bsr_nnz_event)
|
|
363
363
|
{
|
|
364
|
-
|
|
364
|
+
wp_cuda_event_record(bsr_nnz_event, stream);
|
|
365
365
|
}
|
|
366
366
|
}
|
|
367
367
|
|
|
@@ -381,21 +381,21 @@ WP_API void bsr_matrix_from_triplets_device(
|
|
|
381
381
|
stream));
|
|
382
382
|
} else {
|
|
383
383
|
// free our temporary buffers
|
|
384
|
-
|
|
385
|
-
|
|
384
|
+
wp_free_device(context, tpl_block_offsets);
|
|
385
|
+
wp_free_device(context, tpl_block_indices);
|
|
386
386
|
}
|
|
387
387
|
}
|
|
388
388
|
|
|
389
389
|
|
|
390
|
-
WP_API void
|
|
390
|
+
WP_API void wp_bsr_transpose_device(int row_count, int col_count, int nnz,
|
|
391
391
|
const int* bsr_offsets, const int* bsr_columns,
|
|
392
392
|
int* transposed_bsr_offsets, int* transposed_bsr_columns,
|
|
393
393
|
int* src_block_indices)
|
|
394
394
|
{
|
|
395
|
-
void* context =
|
|
395
|
+
void* context = wp_cuda_context_get_current();
|
|
396
396
|
ContextGuard guard(context);
|
|
397
397
|
|
|
398
|
-
cudaStream_t stream = static_cast<cudaStream_t>(
|
|
398
|
+
cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
|
|
399
399
|
|
|
400
400
|
ScopedTemporary<BsrRowCol> combined_row_col(context, 2 * nnz);
|
|
401
401
|
|