warp-lang 1.8.1__py3-none-manylinux_2_34_aarch64.whl → 1.9.1__py3-none-manylinux_2_34_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +282 -103
- warp/__init__.pyi +1904 -114
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +93 -30
- warp/build_dll.py +331 -101
- warp/builtins.py +1244 -160
- warp/codegen.py +317 -206
- warp/config.py +1 -1
- warp/context.py +1465 -789
- warp/examples/core/example_marching_cubes.py +1 -0
- warp/examples/core/example_render_opengl.py +100 -3
- warp/examples/fem/example_apic_fluid.py +98 -52
- warp/examples/fem/example_convection_diffusion_dg.py +25 -4
- warp/examples/fem/example_diffusion_mgpu.py +8 -3
- warp/examples/fem/utils.py +68 -22
- warp/examples/interop/example_jax_kernel.py +2 -1
- warp/fabric.py +1 -1
- warp/fem/cache.py +27 -19
- warp/fem/domain.py +2 -2
- warp/fem/field/nodal_field.py +2 -2
- warp/fem/field/virtual.py +264 -166
- warp/fem/geometry/geometry.py +5 -5
- warp/fem/integrate.py +129 -51
- warp/fem/space/restriction.py +4 -0
- warp/fem/space/shape/tet_shape_function.py +3 -10
- warp/jax_experimental/custom_call.py +25 -2
- warp/jax_experimental/ffi.py +22 -1
- warp/jax_experimental/xla_ffi.py +16 -7
- warp/marching_cubes.py +708 -0
- warp/native/array.h +99 -4
- warp/native/builtin.h +86 -9
- warp/native/bvh.cpp +64 -28
- warp/native/bvh.cu +58 -58
- warp/native/bvh.h +2 -2
- warp/native/clang/clang.cpp +7 -7
- warp/native/coloring.cpp +8 -2
- warp/native/crt.cpp +2 -2
- warp/native/crt.h +3 -5
- warp/native/cuda_util.cpp +41 -10
- warp/native/cuda_util.h +10 -4
- warp/native/exports.h +1842 -1908
- warp/native/fabric.h +2 -1
- warp/native/hashgrid.cpp +37 -37
- warp/native/hashgrid.cu +2 -2
- warp/native/initializer_array.h +1 -1
- warp/native/intersect.h +2 -2
- warp/native/mat.h +1910 -116
- warp/native/mathdx.cpp +43 -43
- warp/native/mesh.cpp +24 -24
- warp/native/mesh.cu +26 -26
- warp/native/mesh.h +4 -2
- warp/native/nanovdb/GridHandle.h +179 -12
- warp/native/nanovdb/HostBuffer.h +8 -7
- warp/native/nanovdb/NanoVDB.h +517 -895
- warp/native/nanovdb/NodeManager.h +323 -0
- warp/native/nanovdb/PNanoVDB.h +2 -2
- warp/native/quat.h +331 -14
- warp/native/range.h +7 -1
- warp/native/reduce.cpp +10 -10
- warp/native/reduce.cu +13 -14
- warp/native/runlength_encode.cpp +2 -2
- warp/native/runlength_encode.cu +5 -5
- warp/native/scan.cpp +3 -3
- warp/native/scan.cu +4 -4
- warp/native/sort.cpp +10 -10
- warp/native/sort.cu +40 -31
- warp/native/sort.h +2 -0
- warp/native/sparse.cpp +8 -8
- warp/native/sparse.cu +13 -13
- warp/native/spatial.h +366 -17
- warp/native/temp_buffer.h +2 -2
- warp/native/tile.h +471 -82
- warp/native/vec.h +328 -14
- warp/native/volume.cpp +54 -54
- warp/native/volume.cu +1 -1
- warp/native/volume.h +2 -1
- warp/native/volume_builder.cu +30 -37
- warp/native/warp.cpp +150 -149
- warp/native/warp.cu +377 -216
- warp/native/warp.h +227 -226
- warp/optim/linear.py +736 -271
- warp/render/imgui_manager.py +289 -0
- warp/render/render_opengl.py +99 -18
- warp/render/render_usd.py +1 -0
- warp/sim/graph_coloring.py +2 -2
- warp/sparse.py +558 -175
- warp/tests/aux_test_module_aot.py +7 -0
- warp/tests/cuda/test_async.py +3 -3
- warp/tests/cuda/test_conditional_captures.py +101 -0
- warp/tests/geometry/test_hash_grid.py +38 -0
- warp/tests/geometry/test_marching_cubes.py +233 -12
- warp/tests/interop/test_jax.py +608 -28
- warp/tests/sim/test_coloring.py +6 -6
- warp/tests/test_array.py +58 -5
- warp/tests/test_codegen.py +4 -3
- warp/tests/test_context.py +8 -15
- warp/tests/test_enum.py +136 -0
- warp/tests/test_examples.py +2 -2
- warp/tests/test_fem.py +49 -6
- warp/tests/test_fixedarray.py +229 -0
- warp/tests/test_func.py +18 -15
- warp/tests/test_future_annotations.py +7 -5
- warp/tests/test_linear_solvers.py +30 -0
- warp/tests/test_map.py +15 -1
- warp/tests/test_mat.py +1518 -378
- warp/tests/test_mat_assign_copy.py +178 -0
- warp/tests/test_mat_constructors.py +574 -0
- warp/tests/test_module_aot.py +287 -0
- warp/tests/test_print.py +69 -0
- warp/tests/test_quat.py +140 -34
- warp/tests/test_quat_assign_copy.py +145 -0
- warp/tests/test_reload.py +2 -1
- warp/tests/test_sparse.py +71 -0
- warp/tests/test_spatial.py +140 -34
- warp/tests/test_spatial_assign_copy.py +160 -0
- warp/tests/test_struct.py +43 -3
- warp/tests/test_tuple.py +96 -0
- warp/tests/test_types.py +61 -20
- warp/tests/test_vec.py +179 -34
- warp/tests/test_vec_assign_copy.py +143 -0
- warp/tests/tile/test_tile.py +245 -18
- warp/tests/tile/test_tile_cholesky.py +605 -0
- warp/tests/tile/test_tile_load.py +169 -0
- warp/tests/tile/test_tile_mathdx.py +2 -558
- warp/tests/tile/test_tile_matmul.py +1 -1
- warp/tests/tile/test_tile_mlp.py +1 -1
- warp/tests/tile/test_tile_shared_memory.py +5 -5
- warp/tests/unittest_suites.py +6 -0
- warp/tests/walkthrough_debug.py +1 -1
- warp/thirdparty/unittest_parallel.py +108 -9
- warp/types.py +571 -267
- warp/utils.py +68 -86
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/METADATA +29 -69
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/RECORD +138 -128
- warp/native/marching.cpp +0 -19
- warp/native/marching.cu +0 -514
- warp/native/marching.h +0 -19
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/WHEEL +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/top_level.txt +0 -0
warp/native/runlength_encode.cpp
CHANGED
|
@@ -53,7 +53,7 @@ void runlength_encode_host(int n,
|
|
|
53
53
|
}
|
|
54
54
|
}
|
|
55
55
|
|
|
56
|
-
void
|
|
56
|
+
void wp_runlength_encode_int_host(
|
|
57
57
|
uint64_t values,
|
|
58
58
|
uint64_t run_values,
|
|
59
59
|
uint64_t run_lengths,
|
|
@@ -68,7 +68,7 @@ void runlength_encode_int_host(
|
|
|
68
68
|
}
|
|
69
69
|
|
|
70
70
|
#if !WP_ENABLE_CUDA
|
|
71
|
-
void
|
|
71
|
+
void wp_runlength_encode_int_device(
|
|
72
72
|
uint64_t values,
|
|
73
73
|
uint64_t run_values,
|
|
74
74
|
uint64_t run_lengths,
|
warp/native/runlength_encode.cu
CHANGED
|
@@ -28,24 +28,24 @@ void runlength_encode_device(int n,
|
|
|
28
28
|
int *run_lengths,
|
|
29
29
|
int *run_count)
|
|
30
30
|
{
|
|
31
|
-
ContextGuard guard(
|
|
32
|
-
cudaStream_t stream = static_cast<cudaStream_t>(
|
|
31
|
+
ContextGuard guard(wp_cuda_context_get_current());
|
|
32
|
+
cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
|
|
33
33
|
|
|
34
34
|
size_t buff_size = 0;
|
|
35
35
|
check_cuda(cub::DeviceRunLengthEncode::Encode(
|
|
36
36
|
nullptr, buff_size, values, run_values, run_lengths, run_count,
|
|
37
37
|
n, stream));
|
|
38
38
|
|
|
39
|
-
void* temp_buffer =
|
|
39
|
+
void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, buff_size);
|
|
40
40
|
|
|
41
41
|
check_cuda(cub::DeviceRunLengthEncode::Encode(
|
|
42
42
|
temp_buffer, buff_size, values, run_values, run_lengths, run_count,
|
|
43
43
|
n, stream));
|
|
44
44
|
|
|
45
|
-
|
|
45
|
+
wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
|
|
46
46
|
}
|
|
47
47
|
|
|
48
|
-
void
|
|
48
|
+
void wp_runlength_encode_int_device(
|
|
49
49
|
uint64_t values,
|
|
50
50
|
uint64_t run_values,
|
|
51
51
|
uint64_t run_lengths,
|
warp/native/scan.cpp
CHANGED
|
@@ -28,8 +28,8 @@ void scan_host(const T* values_in, T* values_out, int n, bool inclusive)
|
|
|
28
28
|
// compute temporary memory required
|
|
29
29
|
if (!inclusive && n > scan_temp_max_size)
|
|
30
30
|
{
|
|
31
|
-
|
|
32
|
-
scan_temp_memory =
|
|
31
|
+
wp_free_host(scan_temp_memory);
|
|
32
|
+
scan_temp_memory = wp_alloc_host(sizeof(T) * n);
|
|
33
33
|
scan_temp_max_size = n;
|
|
34
34
|
}
|
|
35
35
|
|
|
@@ -39,7 +39,7 @@ void scan_host(const T* values_in, T* values_out, int n, bool inclusive)
|
|
|
39
39
|
std::partial_sum(values_in, values_in + n, result);
|
|
40
40
|
if (!inclusive) {
|
|
41
41
|
values_out[0] = (T)0;
|
|
42
|
-
|
|
42
|
+
wp_memcpy_h2h(values_out + 1, result, sizeof(T) * (n - 1));
|
|
43
43
|
}
|
|
44
44
|
}
|
|
45
45
|
|
warp/native/scan.cu
CHANGED
|
@@ -25,9 +25,9 @@
|
|
|
25
25
|
template<typename T>
|
|
26
26
|
void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
|
|
27
27
|
{
|
|
28
|
-
ContextGuard guard(
|
|
28
|
+
ContextGuard guard(wp_cuda_context_get_current());
|
|
29
29
|
|
|
30
|
-
cudaStream_t stream = static_cast<cudaStream_t>(
|
|
30
|
+
cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
|
|
31
31
|
|
|
32
32
|
// compute temporary memory required
|
|
33
33
|
size_t scan_temp_size;
|
|
@@ -37,7 +37,7 @@ void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
|
|
|
37
37
|
check_cuda(cub::DeviceScan::ExclusiveSum(NULL, scan_temp_size, values_in, values_out, n));
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
-
void* temp_buffer =
|
|
40
|
+
void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, scan_temp_size);
|
|
41
41
|
|
|
42
42
|
// scan
|
|
43
43
|
if (inclusive) {
|
|
@@ -46,7 +46,7 @@ void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
|
|
|
46
46
|
check_cuda(cub::DeviceScan::ExclusiveSum(temp_buffer, scan_temp_size, values_in, values_out, n, stream));
|
|
47
47
|
}
|
|
48
48
|
|
|
49
|
-
|
|
49
|
+
wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
|
|
50
50
|
}
|
|
51
51
|
|
|
52
52
|
template void scan_device(const int*, int*, int, bool);
|
warp/native/sort.cpp
CHANGED
|
@@ -198,41 +198,41 @@ void segmented_sort_pairs_host(int* keys, int* values, int n, int* segment_start
|
|
|
198
198
|
|
|
199
199
|
void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out) {}
|
|
200
200
|
|
|
201
|
-
void
|
|
201
|
+
void wp_radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n) {}
|
|
202
202
|
|
|
203
|
-
void
|
|
203
|
+
void wp_radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n) {}
|
|
204
204
|
|
|
205
|
-
void
|
|
205
|
+
void wp_radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n) {}
|
|
206
206
|
|
|
207
|
-
void
|
|
207
|
+
void wp_segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
|
|
208
208
|
|
|
209
|
-
void
|
|
209
|
+
void wp_segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
|
|
210
210
|
|
|
211
211
|
#endif // !WP_ENABLE_CUDA
|
|
212
212
|
|
|
213
213
|
|
|
214
|
-
void
|
|
214
|
+
void wp_radix_sort_pairs_int_host(uint64_t keys, uint64_t values, int n)
|
|
215
215
|
{
|
|
216
216
|
radix_sort_pairs_host(
|
|
217
217
|
reinterpret_cast<int *>(keys),
|
|
218
218
|
reinterpret_cast<int *>(values), n);
|
|
219
219
|
}
|
|
220
220
|
|
|
221
|
-
void
|
|
221
|
+
void wp_radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n)
|
|
222
222
|
{
|
|
223
223
|
radix_sort_pairs_host(
|
|
224
224
|
reinterpret_cast<int64_t *>(keys),
|
|
225
225
|
reinterpret_cast<int *>(values), n);
|
|
226
226
|
}
|
|
227
227
|
|
|
228
|
-
void
|
|
228
|
+
void wp_radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n)
|
|
229
229
|
{
|
|
230
230
|
radix_sort_pairs_host(
|
|
231
231
|
reinterpret_cast<float *>(keys),
|
|
232
232
|
reinterpret_cast<int *>(values), n);
|
|
233
233
|
}
|
|
234
234
|
|
|
235
|
-
void
|
|
235
|
+
void wp_segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
|
|
236
236
|
{
|
|
237
237
|
segmented_sort_pairs_host(
|
|
238
238
|
reinterpret_cast<float *>(keys),
|
|
@@ -241,7 +241,7 @@ void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint
|
|
|
241
241
|
reinterpret_cast<int *>(segment_end_indices), num_segments);
|
|
242
242
|
}
|
|
243
243
|
|
|
244
|
-
void
|
|
244
|
+
void wp_segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
|
|
245
245
|
{
|
|
246
246
|
segmented_sort_pairs_host(
|
|
247
247
|
reinterpret_cast<int *>(keys),
|
warp/native/sort.cu
CHANGED
|
@@ -23,7 +23,7 @@
|
|
|
23
23
|
|
|
24
24
|
#include <cub/cub.cuh>
|
|
25
25
|
|
|
26
|
-
#include <
|
|
26
|
+
#include <unordered_map>
|
|
27
27
|
|
|
28
28
|
// temporary buffer for radix sort
|
|
29
29
|
struct RadixSortTemp
|
|
@@ -32,8 +32,8 @@ struct RadixSortTemp
|
|
|
32
32
|
size_t size = 0;
|
|
33
33
|
};
|
|
34
34
|
|
|
35
|
-
//
|
|
36
|
-
static std::
|
|
35
|
+
// use unique temp buffers per CUDA stream to avoid race conditions
|
|
36
|
+
static std::unordered_map<void*, RadixSortTemp> g_radix_sort_temp_map;
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
template <typename KeyType>
|
|
@@ -44,6 +44,8 @@ void radix_sort_reserve_internal(void* context, int n, void** mem_out, size_t* s
|
|
|
44
44
|
cub::DoubleBuffer<KeyType> d_keys;
|
|
45
45
|
cub::DoubleBuffer<int> d_values;
|
|
46
46
|
|
|
47
|
+
CUstream stream = static_cast<CUstream>(wp_cuda_stream_get_current());
|
|
48
|
+
|
|
47
49
|
// compute temporary memory required
|
|
48
50
|
size_t sort_temp_size;
|
|
49
51
|
check_cuda(cub::DeviceRadixSort::SortPairs(
|
|
@@ -52,17 +54,14 @@ void radix_sort_reserve_internal(void* context, int n, void** mem_out, size_t* s
|
|
|
52
54
|
d_keys,
|
|
53
55
|
d_values,
|
|
54
56
|
n, 0, sizeof(KeyType)*8,
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
if (!context)
|
|
58
|
-
context = cuda_context_get_current();
|
|
57
|
+
stream));
|
|
59
58
|
|
|
60
|
-
RadixSortTemp& temp = g_radix_sort_temp_map[
|
|
59
|
+
RadixSortTemp& temp = g_radix_sort_temp_map[stream];
|
|
61
60
|
|
|
62
61
|
if (sort_temp_size > temp.size)
|
|
63
62
|
{
|
|
64
|
-
|
|
65
|
-
temp.mem =
|
|
63
|
+
wp_free_device(WP_CURRENT_CONTEXT, temp.mem);
|
|
64
|
+
temp.mem = wp_alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
|
|
66
65
|
temp.size = sort_temp_size;
|
|
67
66
|
}
|
|
68
67
|
|
|
@@ -77,6 +76,17 @@ void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out)
|
|
|
77
76
|
radix_sort_reserve_internal<int>(context, n, mem_out, size_out);
|
|
78
77
|
}
|
|
79
78
|
|
|
79
|
+
void radix_sort_release(void* context, void* stream)
|
|
80
|
+
{
|
|
81
|
+
// release temporary buffer for the given stream, if it exists
|
|
82
|
+
auto it = g_radix_sort_temp_map.find(stream);
|
|
83
|
+
if (it != g_radix_sort_temp_map.end())
|
|
84
|
+
{
|
|
85
|
+
wp_free_device(context, it->second.mem);
|
|
86
|
+
g_radix_sort_temp_map.erase(it);
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
|
|
80
90
|
template <typename KeyType>
|
|
81
91
|
void radix_sort_pairs_device(void* context, KeyType* keys, int* values, int n)
|
|
82
92
|
{
|
|
@@ -95,13 +105,13 @@ void radix_sort_pairs_device(void* context, KeyType* keys, int* values, int n)
|
|
|
95
105
|
d_keys,
|
|
96
106
|
d_values,
|
|
97
107
|
n, 0, sizeof(KeyType)*8,
|
|
98
|
-
(cudaStream_t)
|
|
108
|
+
(cudaStream_t)wp_cuda_stream_get_current()));
|
|
99
109
|
|
|
100
110
|
if (d_keys.Current() != keys)
|
|
101
|
-
|
|
111
|
+
wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(KeyType)*n);
|
|
102
112
|
|
|
103
113
|
if (d_values.Current() != values)
|
|
104
|
-
|
|
114
|
+
wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
|
|
105
115
|
}
|
|
106
116
|
|
|
107
117
|
void radix_sort_pairs_device(void* context, int* keys, int* values, int n)
|
|
@@ -119,7 +129,7 @@ void radix_sort_pairs_device(void* context, int64_t* keys, int* values, int n)
|
|
|
119
129
|
radix_sort_pairs_device<int64_t>(context, keys, values, n);
|
|
120
130
|
}
|
|
121
131
|
|
|
122
|
-
void
|
|
132
|
+
void wp_radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
|
|
123
133
|
{
|
|
124
134
|
radix_sort_pairs_device(
|
|
125
135
|
WP_CURRENT_CONTEXT,
|
|
@@ -127,7 +137,7 @@ void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
|
|
|
127
137
|
reinterpret_cast<int *>(values), n);
|
|
128
138
|
}
|
|
129
139
|
|
|
130
|
-
void
|
|
140
|
+
void wp_radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
|
|
131
141
|
{
|
|
132
142
|
radix_sort_pairs_device(
|
|
133
143
|
WP_CURRENT_CONTEXT,
|
|
@@ -135,7 +145,7 @@ void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
|
|
|
135
145
|
reinterpret_cast<int *>(values), n);
|
|
136
146
|
}
|
|
137
147
|
|
|
138
|
-
void
|
|
148
|
+
void wp_radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n)
|
|
139
149
|
{
|
|
140
150
|
radix_sort_pairs_device(
|
|
141
151
|
WP_CURRENT_CONTEXT,
|
|
@@ -153,6 +163,8 @@ void segmented_sort_reserve(void* context, int n, int num_segments, void** mem_o
|
|
|
153
163
|
int* start_indices = NULL;
|
|
154
164
|
int* end_indices = NULL;
|
|
155
165
|
|
|
166
|
+
CUstream stream = static_cast<CUstream>(wp_cuda_stream_get_current());
|
|
167
|
+
|
|
156
168
|
// compute temporary memory required
|
|
157
169
|
size_t sort_temp_size;
|
|
158
170
|
check_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
|
|
@@ -166,17 +178,14 @@ void segmented_sort_reserve(void* context, int n, int num_segments, void** mem_o
|
|
|
166
178
|
end_indices,
|
|
167
179
|
0,
|
|
168
180
|
32,
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
if (!context)
|
|
172
|
-
context = cuda_context_get_current();
|
|
181
|
+
stream));
|
|
173
182
|
|
|
174
|
-
RadixSortTemp& temp = g_radix_sort_temp_map[
|
|
183
|
+
RadixSortTemp& temp = g_radix_sort_temp_map[stream];
|
|
175
184
|
|
|
176
185
|
if (sort_temp_size > temp.size)
|
|
177
186
|
{
|
|
178
|
-
|
|
179
|
-
temp.mem =
|
|
187
|
+
wp_free_device(WP_CURRENT_CONTEXT, temp.mem);
|
|
188
|
+
temp.mem = wp_alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
|
|
180
189
|
temp.size = sort_temp_size;
|
|
181
190
|
}
|
|
182
191
|
|
|
@@ -211,16 +220,16 @@ void segmented_sort_pairs_device(void* context, float* keys, int* values, int n,
|
|
|
211
220
|
segment_end_indices,
|
|
212
221
|
0,
|
|
213
222
|
32,
|
|
214
|
-
(cudaStream_t)
|
|
223
|
+
(cudaStream_t)wp_cuda_stream_get_current()));
|
|
215
224
|
|
|
216
225
|
if (d_keys.Current() != keys)
|
|
217
|
-
|
|
226
|
+
wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
|
|
218
227
|
|
|
219
228
|
if (d_values.Current() != values)
|
|
220
|
-
|
|
229
|
+
wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
|
|
221
230
|
}
|
|
222
231
|
|
|
223
|
-
void
|
|
232
|
+
void wp_segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
|
|
224
233
|
{
|
|
225
234
|
segmented_sort_pairs_device(
|
|
226
235
|
WP_CURRENT_CONTEXT,
|
|
@@ -256,16 +265,16 @@ void segmented_sort_pairs_device(void* context, int* keys, int* values, int n, i
|
|
|
256
265
|
segment_end_indices,
|
|
257
266
|
0,
|
|
258
267
|
32,
|
|
259
|
-
(cudaStream_t)
|
|
268
|
+
(cudaStream_t)wp_cuda_stream_get_current()));
|
|
260
269
|
|
|
261
270
|
if (d_keys.Current() != keys)
|
|
262
|
-
|
|
271
|
+
wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
|
|
263
272
|
|
|
264
273
|
if (d_values.Current() != values)
|
|
265
|
-
|
|
274
|
+
wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
|
|
266
275
|
}
|
|
267
276
|
|
|
268
|
-
void
|
|
277
|
+
void wp_segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
|
|
269
278
|
{
|
|
270
279
|
segmented_sort_pairs_device(
|
|
271
280
|
WP_CURRENT_CONTEXT,
|
warp/native/sort.h
CHANGED
|
@@ -20,6 +20,8 @@
|
|
|
20
20
|
#include <stddef.h>
|
|
21
21
|
|
|
22
22
|
void radix_sort_reserve(void* context, int n, void** mem_out=NULL, size_t* size_out=NULL);
|
|
23
|
+
void radix_sort_release(void* context, void* stream);
|
|
24
|
+
|
|
23
25
|
void radix_sort_pairs_host(int* keys, int* values, int n);
|
|
24
26
|
void radix_sort_pairs_host(float* keys, int* values, int n);
|
|
25
27
|
void radix_sort_pairs_host(int64_t* keys, int* values, int n);
|
warp/native/sparse.cpp
CHANGED
|
@@ -36,7 +36,7 @@ template <typename T> bool bsr_block_is_zero(int block_idx, int block_size, cons
|
|
|
36
36
|
} // namespace
|
|
37
37
|
|
|
38
38
|
|
|
39
|
-
WP_API void
|
|
39
|
+
WP_API void wp_bsr_matrix_from_triplets_host(
|
|
40
40
|
int block_size,
|
|
41
41
|
int scalar_size_in_bytes,
|
|
42
42
|
int row_count,
|
|
@@ -64,8 +64,8 @@ WP_API void bsr_matrix_from_triplets_host(
|
|
|
64
64
|
bool return_summed_blocks = tpl_block_offsets != nullptr && tpl_block_indices != nullptr;
|
|
65
65
|
if (!return_summed_blocks)
|
|
66
66
|
{
|
|
67
|
-
tpl_block_offsets = static_cast<int*>(
|
|
68
|
-
tpl_block_indices = static_cast<int*>(
|
|
67
|
+
tpl_block_offsets = static_cast<int*>(wp_alloc_host(size_t(nnz) * sizeof(int)));
|
|
68
|
+
tpl_block_indices = static_cast<int*>(wp_alloc_host(size_t(nnz) * sizeof(int)));
|
|
69
69
|
}
|
|
70
70
|
|
|
71
71
|
std::iota(tpl_block_indices, tpl_block_indices + nnz, 0);
|
|
@@ -156,8 +156,8 @@ WP_API void bsr_matrix_from_triplets_host(
|
|
|
156
156
|
if(!return_summed_blocks)
|
|
157
157
|
{
|
|
158
158
|
// free our temporary buffers
|
|
159
|
-
|
|
160
|
-
|
|
159
|
+
wp_free_host(tpl_block_offsets);
|
|
160
|
+
wp_free_host(tpl_block_indices);
|
|
161
161
|
}
|
|
162
162
|
|
|
163
163
|
if (bsr_nnz != nullptr)
|
|
@@ -166,7 +166,7 @@ WP_API void bsr_matrix_from_triplets_host(
|
|
|
166
166
|
}
|
|
167
167
|
}
|
|
168
168
|
|
|
169
|
-
WP_API void
|
|
169
|
+
WP_API void wp_bsr_transpose_host(
|
|
170
170
|
int row_count, int col_count, int nnz,
|
|
171
171
|
const int* bsr_offsets, const int* bsr_columns,
|
|
172
172
|
int* transposed_bsr_offsets,
|
|
@@ -209,7 +209,7 @@ WP_API void bsr_transpose_host(
|
|
|
209
209
|
}
|
|
210
210
|
|
|
211
211
|
#if !WP_ENABLE_CUDA
|
|
212
|
-
WP_API void
|
|
212
|
+
WP_API void wp_bsr_matrix_from_triplets_device(
|
|
213
213
|
int block_size,
|
|
214
214
|
int scalar_size_in_bytes,
|
|
215
215
|
int row_count,
|
|
@@ -229,7 +229,7 @@ WP_API void bsr_matrix_from_triplets_device(
|
|
|
229
229
|
void* bsr_nnz_event) {}
|
|
230
230
|
|
|
231
231
|
|
|
232
|
-
WP_API void
|
|
232
|
+
WP_API void wp_bsr_transpose_device(
|
|
233
233
|
int row_count, int col_count, int nnz,
|
|
234
234
|
const int* bsr_offsets, const int* bsr_columns,
|
|
235
235
|
int* transposed_bsr_offsets,
|
warp/native/sparse.cu
CHANGED
|
@@ -50,7 +50,7 @@ template <typename T> struct BsrBlockIsNotZero
|
|
|
50
50
|
T zero_mask;
|
|
51
51
|
|
|
52
52
|
BsrBlockIsNotZero(int block_size, const void* values, const uint64_t zero_mask)
|
|
53
|
-
: block_size(block_size), values(static_cast<const T*>(values)), zero_mask(static_cast<
|
|
53
|
+
: block_size(block_size), values(static_cast<const T*>(values)), zero_mask(static_cast<T>(zero_mask))
|
|
54
54
|
{}
|
|
55
55
|
|
|
56
56
|
CUDA_CALLABLE_DEVICE bool operator()(int block) const
|
|
@@ -256,7 +256,7 @@ __global__ void bsr_transpose_fill_row_col(const int nnz_upper_bound, const int
|
|
|
256
256
|
} // namespace
|
|
257
257
|
|
|
258
258
|
|
|
259
|
-
WP_API void
|
|
259
|
+
WP_API void wp_bsr_matrix_from_triplets_device(
|
|
260
260
|
const int block_size,
|
|
261
261
|
int scalar_size,
|
|
262
262
|
const int row_count,
|
|
@@ -274,13 +274,13 @@ WP_API void bsr_matrix_from_triplets_device(
|
|
|
274
274
|
int* bsr_columns,
|
|
275
275
|
int* bsr_nnz, void* bsr_nnz_event)
|
|
276
276
|
{
|
|
277
|
-
void* context =
|
|
277
|
+
void* context = wp_cuda_context_get_current();
|
|
278
278
|
ContextGuard guard(context);
|
|
279
279
|
|
|
280
280
|
// Per-context cached temporary buffers
|
|
281
281
|
// BsrFromTripletsTemp& bsr_temp = g_bsr_from_triplets_temp_map[context];
|
|
282
282
|
|
|
283
|
-
cudaStream_t stream = static_cast<cudaStream_t>(
|
|
283
|
+
cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
|
|
284
284
|
|
|
285
285
|
ScopedTemporary<BsrRowCol> combined_row_col(context, 2 * size_t(nnz));
|
|
286
286
|
ScopedTemporary<int> unique_triplet_count(context, 1);
|
|
@@ -289,8 +289,8 @@ WP_API void bsr_matrix_from_triplets_device(
|
|
|
289
289
|
if(!return_summed_blocks)
|
|
290
290
|
{
|
|
291
291
|
// if not provided, allocate temporary offset and indices buffers
|
|
292
|
-
tpl_block_offsets = static_cast<int*>(
|
|
293
|
-
tpl_block_indices = static_cast<int*>(
|
|
292
|
+
tpl_block_offsets = static_cast<int*>(wp_alloc_device(context, size_t(nnz) * sizeof(int)));
|
|
293
|
+
tpl_block_indices = static_cast<int*>(wp_alloc_device(context, size_t(nnz) * sizeof(int)));
|
|
294
294
|
}
|
|
295
295
|
|
|
296
296
|
|
|
@@ -357,11 +357,11 @@ WP_API void bsr_matrix_from_triplets_device(
|
|
|
357
357
|
{
|
|
358
358
|
// Copy nnz to host, and record an event for the completed transfer if desired
|
|
359
359
|
|
|
360
|
-
|
|
360
|
+
wp_memcpy_d2h(WP_CURRENT_CONTEXT, bsr_nnz, bsr_offsets + row_count, sizeof(int), stream);
|
|
361
361
|
|
|
362
362
|
if (bsr_nnz_event)
|
|
363
363
|
{
|
|
364
|
-
|
|
364
|
+
wp_cuda_event_record(bsr_nnz_event, stream);
|
|
365
365
|
}
|
|
366
366
|
}
|
|
367
367
|
|
|
@@ -381,21 +381,21 @@ WP_API void bsr_matrix_from_triplets_device(
|
|
|
381
381
|
stream));
|
|
382
382
|
} else {
|
|
383
383
|
// free our temporary buffers
|
|
384
|
-
|
|
385
|
-
|
|
384
|
+
wp_free_device(context, tpl_block_offsets);
|
|
385
|
+
wp_free_device(context, tpl_block_indices);
|
|
386
386
|
}
|
|
387
387
|
}
|
|
388
388
|
|
|
389
389
|
|
|
390
|
-
WP_API void
|
|
390
|
+
WP_API void wp_bsr_transpose_device(int row_count, int col_count, int nnz,
|
|
391
391
|
const int* bsr_offsets, const int* bsr_columns,
|
|
392
392
|
int* transposed_bsr_offsets, int* transposed_bsr_columns,
|
|
393
393
|
int* src_block_indices)
|
|
394
394
|
{
|
|
395
|
-
void* context =
|
|
395
|
+
void* context = wp_cuda_context_get_current();
|
|
396
396
|
ContextGuard guard(context);
|
|
397
397
|
|
|
398
|
-
cudaStream_t stream = static_cast<cudaStream_t>(
|
|
398
|
+
cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
|
|
399
399
|
|
|
400
400
|
ScopedTemporary<BsrRowCol> combined_row_col(context, 2 * nnz);
|
|
401
401
|
|