warp-lang 1.8.1__py3-none-manylinux_2_34_aarch64.whl → 1.9.1__py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic; consult the package registry's advisory page for more details.

Files changed (141)
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +1904 -114
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +331 -101
  7. warp/builtins.py +1244 -160
  8. warp/codegen.py +317 -206
  9. warp/config.py +1 -1
  10. warp/context.py +1465 -789
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/examples/interop/example_jax_kernel.py +2 -1
  18. warp/fabric.py +1 -1
  19. warp/fem/cache.py +27 -19
  20. warp/fem/domain.py +2 -2
  21. warp/fem/field/nodal_field.py +2 -2
  22. warp/fem/field/virtual.py +264 -166
  23. warp/fem/geometry/geometry.py +5 -5
  24. warp/fem/integrate.py +129 -51
  25. warp/fem/space/restriction.py +4 -0
  26. warp/fem/space/shape/tet_shape_function.py +3 -10
  27. warp/jax_experimental/custom_call.py +25 -2
  28. warp/jax_experimental/ffi.py +22 -1
  29. warp/jax_experimental/xla_ffi.py +16 -7
  30. warp/marching_cubes.py +708 -0
  31. warp/native/array.h +99 -4
  32. warp/native/builtin.h +86 -9
  33. warp/native/bvh.cpp +64 -28
  34. warp/native/bvh.cu +58 -58
  35. warp/native/bvh.h +2 -2
  36. warp/native/clang/clang.cpp +7 -7
  37. warp/native/coloring.cpp +8 -2
  38. warp/native/crt.cpp +2 -2
  39. warp/native/crt.h +3 -5
  40. warp/native/cuda_util.cpp +41 -10
  41. warp/native/cuda_util.h +10 -4
  42. warp/native/exports.h +1842 -1908
  43. warp/native/fabric.h +2 -1
  44. warp/native/hashgrid.cpp +37 -37
  45. warp/native/hashgrid.cu +2 -2
  46. warp/native/initializer_array.h +1 -1
  47. warp/native/intersect.h +2 -2
  48. warp/native/mat.h +1910 -116
  49. warp/native/mathdx.cpp +43 -43
  50. warp/native/mesh.cpp +24 -24
  51. warp/native/mesh.cu +26 -26
  52. warp/native/mesh.h +4 -2
  53. warp/native/nanovdb/GridHandle.h +179 -12
  54. warp/native/nanovdb/HostBuffer.h +8 -7
  55. warp/native/nanovdb/NanoVDB.h +517 -895
  56. warp/native/nanovdb/NodeManager.h +323 -0
  57. warp/native/nanovdb/PNanoVDB.h +2 -2
  58. warp/native/quat.h +331 -14
  59. warp/native/range.h +7 -1
  60. warp/native/reduce.cpp +10 -10
  61. warp/native/reduce.cu +13 -14
  62. warp/native/runlength_encode.cpp +2 -2
  63. warp/native/runlength_encode.cu +5 -5
  64. warp/native/scan.cpp +3 -3
  65. warp/native/scan.cu +4 -4
  66. warp/native/sort.cpp +10 -10
  67. warp/native/sort.cu +40 -31
  68. warp/native/sort.h +2 -0
  69. warp/native/sparse.cpp +8 -8
  70. warp/native/sparse.cu +13 -13
  71. warp/native/spatial.h +366 -17
  72. warp/native/temp_buffer.h +2 -2
  73. warp/native/tile.h +471 -82
  74. warp/native/vec.h +328 -14
  75. warp/native/volume.cpp +54 -54
  76. warp/native/volume.cu +1 -1
  77. warp/native/volume.h +2 -1
  78. warp/native/volume_builder.cu +30 -37
  79. warp/native/warp.cpp +150 -149
  80. warp/native/warp.cu +377 -216
  81. warp/native/warp.h +227 -226
  82. warp/optim/linear.py +736 -271
  83. warp/render/imgui_manager.py +289 -0
  84. warp/render/render_opengl.py +99 -18
  85. warp/render/render_usd.py +1 -0
  86. warp/sim/graph_coloring.py +2 -2
  87. warp/sparse.py +558 -175
  88. warp/tests/aux_test_module_aot.py +7 -0
  89. warp/tests/cuda/test_async.py +3 -3
  90. warp/tests/cuda/test_conditional_captures.py +101 -0
  91. warp/tests/geometry/test_hash_grid.py +38 -0
  92. warp/tests/geometry/test_marching_cubes.py +233 -12
  93. warp/tests/interop/test_jax.py +608 -28
  94. warp/tests/sim/test_coloring.py +6 -6
  95. warp/tests/test_array.py +58 -5
  96. warp/tests/test_codegen.py +4 -3
  97. warp/tests/test_context.py +8 -15
  98. warp/tests/test_enum.py +136 -0
  99. warp/tests/test_examples.py +2 -2
  100. warp/tests/test_fem.py +49 -6
  101. warp/tests/test_fixedarray.py +229 -0
  102. warp/tests/test_func.py +18 -15
  103. warp/tests/test_future_annotations.py +7 -5
  104. warp/tests/test_linear_solvers.py +30 -0
  105. warp/tests/test_map.py +15 -1
  106. warp/tests/test_mat.py +1518 -378
  107. warp/tests/test_mat_assign_copy.py +178 -0
  108. warp/tests/test_mat_constructors.py +574 -0
  109. warp/tests/test_module_aot.py +287 -0
  110. warp/tests/test_print.py +69 -0
  111. warp/tests/test_quat.py +140 -34
  112. warp/tests/test_quat_assign_copy.py +145 -0
  113. warp/tests/test_reload.py +2 -1
  114. warp/tests/test_sparse.py +71 -0
  115. warp/tests/test_spatial.py +140 -34
  116. warp/tests/test_spatial_assign_copy.py +160 -0
  117. warp/tests/test_struct.py +43 -3
  118. warp/tests/test_tuple.py +96 -0
  119. warp/tests/test_types.py +61 -20
  120. warp/tests/test_vec.py +179 -34
  121. warp/tests/test_vec_assign_copy.py +143 -0
  122. warp/tests/tile/test_tile.py +245 -18
  123. warp/tests/tile/test_tile_cholesky.py +605 -0
  124. warp/tests/tile/test_tile_load.py +169 -0
  125. warp/tests/tile/test_tile_mathdx.py +2 -558
  126. warp/tests/tile/test_tile_matmul.py +1 -1
  127. warp/tests/tile/test_tile_mlp.py +1 -1
  128. warp/tests/tile/test_tile_shared_memory.py +5 -5
  129. warp/tests/unittest_suites.py +6 -0
  130. warp/tests/walkthrough_debug.py +1 -1
  131. warp/thirdparty/unittest_parallel.py +108 -9
  132. warp/types.py +571 -267
  133. warp/utils.py +68 -86
  134. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/METADATA +29 -69
  135. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/RECORD +138 -128
  136. warp/native/marching.cpp +0 -19
  137. warp/native/marching.cu +0 -514
  138. warp/native/marching.h +0 -19
  139. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/WHEEL +0 -0
  140. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/licenses/LICENSE.md +0 -0
  141. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/top_level.txt +0 -0
@@ -53,7 +53,7 @@ void runlength_encode_host(int n,
53
53
  }
54
54
  }
55
55
 
56
- void runlength_encode_int_host(
56
+ void wp_runlength_encode_int_host(
57
57
  uint64_t values,
58
58
  uint64_t run_values,
59
59
  uint64_t run_lengths,
@@ -68,7 +68,7 @@ void runlength_encode_int_host(
68
68
  }
69
69
 
70
70
  #if !WP_ENABLE_CUDA
71
- void runlength_encode_int_device(
71
+ void wp_runlength_encode_int_device(
72
72
  uint64_t values,
73
73
  uint64_t run_values,
74
74
  uint64_t run_lengths,
@@ -28,24 +28,24 @@ void runlength_encode_device(int n,
28
28
  int *run_lengths,
29
29
  int *run_count)
30
30
  {
31
- ContextGuard guard(cuda_context_get_current());
32
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
31
+ ContextGuard guard(wp_cuda_context_get_current());
32
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
33
33
 
34
34
  size_t buff_size = 0;
35
35
  check_cuda(cub::DeviceRunLengthEncode::Encode(
36
36
  nullptr, buff_size, values, run_values, run_lengths, run_count,
37
37
  n, stream));
38
38
 
39
- void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, buff_size);
39
+ void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, buff_size);
40
40
 
41
41
  check_cuda(cub::DeviceRunLengthEncode::Encode(
42
42
  temp_buffer, buff_size, values, run_values, run_lengths, run_count,
43
43
  n, stream));
44
44
 
45
- free_device(WP_CURRENT_CONTEXT, temp_buffer);
45
+ wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
46
46
  }
47
47
 
48
- void runlength_encode_int_device(
48
+ void wp_runlength_encode_int_device(
49
49
  uint64_t values,
50
50
  uint64_t run_values,
51
51
  uint64_t run_lengths,
warp/native/scan.cpp CHANGED
@@ -28,8 +28,8 @@ void scan_host(const T* values_in, T* values_out, int n, bool inclusive)
28
28
  // compute temporary memory required
29
29
  if (!inclusive && n > scan_temp_max_size)
30
30
  {
31
- free_host(scan_temp_memory);
32
- scan_temp_memory = alloc_host(sizeof(T) * n);
31
+ wp_free_host(scan_temp_memory);
32
+ scan_temp_memory = wp_alloc_host(sizeof(T) * n);
33
33
  scan_temp_max_size = n;
34
34
  }
35
35
 
@@ -39,7 +39,7 @@ void scan_host(const T* values_in, T* values_out, int n, bool inclusive)
39
39
  std::partial_sum(values_in, values_in + n, result);
40
40
  if (!inclusive) {
41
41
  values_out[0] = (T)0;
42
- memcpy_h2h(values_out + 1, result, sizeof(T) * (n - 1));
42
+ wp_memcpy_h2h(values_out + 1, result, sizeof(T) * (n - 1));
43
43
  }
44
44
  }
45
45
 
warp/native/scan.cu CHANGED
@@ -25,9 +25,9 @@
25
25
  template<typename T>
26
26
  void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
27
27
  {
28
- ContextGuard guard(cuda_context_get_current());
28
+ ContextGuard guard(wp_cuda_context_get_current());
29
29
 
30
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
30
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
31
31
 
32
32
  // compute temporary memory required
33
33
  size_t scan_temp_size;
@@ -37,7 +37,7 @@ void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
37
37
  check_cuda(cub::DeviceScan::ExclusiveSum(NULL, scan_temp_size, values_in, values_out, n));
38
38
  }
39
39
 
40
- void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, scan_temp_size);
40
+ void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, scan_temp_size);
41
41
 
42
42
  // scan
43
43
  if (inclusive) {
@@ -46,7 +46,7 @@ void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
46
46
  check_cuda(cub::DeviceScan::ExclusiveSum(temp_buffer, scan_temp_size, values_in, values_out, n, stream));
47
47
  }
48
48
 
49
- free_device(WP_CURRENT_CONTEXT, temp_buffer);
49
+ wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
50
50
  }
51
51
 
52
52
  template void scan_device(const int*, int*, int, bool);
warp/native/sort.cpp CHANGED
@@ -198,41 +198,41 @@ void segmented_sort_pairs_host(int* keys, int* values, int n, int* segment_start
198
198
 
199
199
  void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out) {}
200
200
 
201
- void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n) {}
201
+ void wp_radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n) {}
202
202
 
203
- void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n) {}
203
+ void wp_radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n) {}
204
204
 
205
- void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n) {}
205
+ void wp_radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n) {}
206
206
 
207
- void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
207
+ void wp_segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
208
208
 
209
- void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
209
+ void wp_segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
210
210
 
211
211
  #endif // !WP_ENABLE_CUDA
212
212
 
213
213
 
214
- void radix_sort_pairs_int_host(uint64_t keys, uint64_t values, int n)
214
+ void wp_radix_sort_pairs_int_host(uint64_t keys, uint64_t values, int n)
215
215
  {
216
216
  radix_sort_pairs_host(
217
217
  reinterpret_cast<int *>(keys),
218
218
  reinterpret_cast<int *>(values), n);
219
219
  }
220
220
 
221
- void radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n)
221
+ void wp_radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n)
222
222
  {
223
223
  radix_sort_pairs_host(
224
224
  reinterpret_cast<int64_t *>(keys),
225
225
  reinterpret_cast<int *>(values), n);
226
226
  }
227
227
 
228
- void radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n)
228
+ void wp_radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n)
229
229
  {
230
230
  radix_sort_pairs_host(
231
231
  reinterpret_cast<float *>(keys),
232
232
  reinterpret_cast<int *>(values), n);
233
233
  }
234
234
 
235
- void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
235
+ void wp_segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
236
236
  {
237
237
  segmented_sort_pairs_host(
238
238
  reinterpret_cast<float *>(keys),
@@ -241,7 +241,7 @@ void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint
241
241
  reinterpret_cast<int *>(segment_end_indices), num_segments);
242
242
  }
243
243
 
244
- void segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
244
+ void wp_segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
245
245
  {
246
246
  segmented_sort_pairs_host(
247
247
  reinterpret_cast<int *>(keys),
warp/native/sort.cu CHANGED
@@ -23,7 +23,7 @@
23
23
 
24
24
  #include <cub/cub.cuh>
25
25
 
26
- #include <map>
26
+ #include <unordered_map>
27
27
 
28
28
  // temporary buffer for radix sort
29
29
  struct RadixSortTemp
@@ -32,8 +32,8 @@ struct RadixSortTemp
32
32
  size_t size = 0;
33
33
  };
34
34
 
35
- // map temp buffers to CUDA contexts
36
- static std::map<void*, RadixSortTemp> g_radix_sort_temp_map;
35
+ // use unique temp buffers per CUDA stream to avoid race conditions
36
+ static std::unordered_map<void*, RadixSortTemp> g_radix_sort_temp_map;
37
37
 
38
38
 
39
39
  template <typename KeyType>
@@ -44,6 +44,8 @@ void radix_sort_reserve_internal(void* context, int n, void** mem_out, size_t* s
44
44
  cub::DoubleBuffer<KeyType> d_keys;
45
45
  cub::DoubleBuffer<int> d_values;
46
46
 
47
+ CUstream stream = static_cast<CUstream>(wp_cuda_stream_get_current());
48
+
47
49
  // compute temporary memory required
48
50
  size_t sort_temp_size;
49
51
  check_cuda(cub::DeviceRadixSort::SortPairs(
@@ -52,17 +54,14 @@ void radix_sort_reserve_internal(void* context, int n, void** mem_out, size_t* s
52
54
  d_keys,
53
55
  d_values,
54
56
  n, 0, sizeof(KeyType)*8,
55
- (cudaStream_t)cuda_stream_get_current()));
56
-
57
- if (!context)
58
- context = cuda_context_get_current();
57
+ stream));
59
58
 
60
- RadixSortTemp& temp = g_radix_sort_temp_map[context];
59
+ RadixSortTemp& temp = g_radix_sort_temp_map[stream];
61
60
 
62
61
  if (sort_temp_size > temp.size)
63
62
  {
64
- free_device(WP_CURRENT_CONTEXT, temp.mem);
65
- temp.mem = alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
63
+ wp_free_device(WP_CURRENT_CONTEXT, temp.mem);
64
+ temp.mem = wp_alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
66
65
  temp.size = sort_temp_size;
67
66
  }
68
67
 
@@ -77,6 +76,17 @@ void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out)
77
76
  radix_sort_reserve_internal<int>(context, n, mem_out, size_out);
78
77
  }
79
78
 
79
+ void radix_sort_release(void* context, void* stream)
80
+ {
81
+ // release temporary buffer for the given stream, if it exists
82
+ auto it = g_radix_sort_temp_map.find(stream);
83
+ if (it != g_radix_sort_temp_map.end())
84
+ {
85
+ wp_free_device(context, it->second.mem);
86
+ g_radix_sort_temp_map.erase(it);
87
+ }
88
+ }
89
+
80
90
  template <typename KeyType>
81
91
  void radix_sort_pairs_device(void* context, KeyType* keys, int* values, int n)
82
92
  {
@@ -95,13 +105,13 @@ void radix_sort_pairs_device(void* context, KeyType* keys, int* values, int n)
95
105
  d_keys,
96
106
  d_values,
97
107
  n, 0, sizeof(KeyType)*8,
98
- (cudaStream_t)cuda_stream_get_current()));
108
+ (cudaStream_t)wp_cuda_stream_get_current()));
99
109
 
100
110
  if (d_keys.Current() != keys)
101
- memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(KeyType)*n);
111
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(KeyType)*n);
102
112
 
103
113
  if (d_values.Current() != values)
104
- memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
114
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
105
115
  }
106
116
 
107
117
  void radix_sort_pairs_device(void* context, int* keys, int* values, int n)
@@ -119,7 +129,7 @@ void radix_sort_pairs_device(void* context, int64_t* keys, int* values, int n)
119
129
  radix_sort_pairs_device<int64_t>(context, keys, values, n);
120
130
  }
121
131
 
122
- void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
132
+ void wp_radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
123
133
  {
124
134
  radix_sort_pairs_device(
125
135
  WP_CURRENT_CONTEXT,
@@ -127,7 +137,7 @@ void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
127
137
  reinterpret_cast<int *>(values), n);
128
138
  }
129
139
 
130
- void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
140
+ void wp_radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
131
141
  {
132
142
  radix_sort_pairs_device(
133
143
  WP_CURRENT_CONTEXT,
@@ -135,7 +145,7 @@ void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
135
145
  reinterpret_cast<int *>(values), n);
136
146
  }
137
147
 
138
- void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n)
148
+ void wp_radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n)
139
149
  {
140
150
  radix_sort_pairs_device(
141
151
  WP_CURRENT_CONTEXT,
@@ -153,6 +163,8 @@ void segmented_sort_reserve(void* context, int n, int num_segments, void** mem_o
153
163
  int* start_indices = NULL;
154
164
  int* end_indices = NULL;
155
165
 
166
+ CUstream stream = static_cast<CUstream>(wp_cuda_stream_get_current());
167
+
156
168
  // compute temporary memory required
157
169
  size_t sort_temp_size;
158
170
  check_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
@@ -166,17 +178,14 @@ void segmented_sort_reserve(void* context, int n, int num_segments, void** mem_o
166
178
  end_indices,
167
179
  0,
168
180
  32,
169
- (cudaStream_t)cuda_stream_get_current()));
170
-
171
- if (!context)
172
- context = cuda_context_get_current();
181
+ stream));
173
182
 
174
- RadixSortTemp& temp = g_radix_sort_temp_map[context];
183
+ RadixSortTemp& temp = g_radix_sort_temp_map[stream];
175
184
 
176
185
  if (sort_temp_size > temp.size)
177
186
  {
178
- free_device(WP_CURRENT_CONTEXT, temp.mem);
179
- temp.mem = alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
187
+ wp_free_device(WP_CURRENT_CONTEXT, temp.mem);
188
+ temp.mem = wp_alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
180
189
  temp.size = sort_temp_size;
181
190
  }
182
191
 
@@ -211,16 +220,16 @@ void segmented_sort_pairs_device(void* context, float* keys, int* values, int n,
211
220
  segment_end_indices,
212
221
  0,
213
222
  32,
214
- (cudaStream_t)cuda_stream_get_current()));
223
+ (cudaStream_t)wp_cuda_stream_get_current()));
215
224
 
216
225
  if (d_keys.Current() != keys)
217
- memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
226
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
218
227
 
219
228
  if (d_values.Current() != values)
220
- memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
229
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
221
230
  }
222
231
 
223
- void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
232
+ void wp_segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
224
233
  {
225
234
  segmented_sort_pairs_device(
226
235
  WP_CURRENT_CONTEXT,
@@ -256,16 +265,16 @@ void segmented_sort_pairs_device(void* context, int* keys, int* values, int n, i
256
265
  segment_end_indices,
257
266
  0,
258
267
  32,
259
- (cudaStream_t)cuda_stream_get_current()));
268
+ (cudaStream_t)wp_cuda_stream_get_current()));
260
269
 
261
270
  if (d_keys.Current() != keys)
262
- memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
271
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
263
272
 
264
273
  if (d_values.Current() != values)
265
- memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
274
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
266
275
  }
267
276
 
268
- void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
277
+ void wp_segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
269
278
  {
270
279
  segmented_sort_pairs_device(
271
280
  WP_CURRENT_CONTEXT,
warp/native/sort.h CHANGED
@@ -20,6 +20,8 @@
20
20
  #include <stddef.h>
21
21
 
22
22
  void radix_sort_reserve(void* context, int n, void** mem_out=NULL, size_t* size_out=NULL);
23
+ void radix_sort_release(void* context, void* stream);
24
+
23
25
  void radix_sort_pairs_host(int* keys, int* values, int n);
24
26
  void radix_sort_pairs_host(float* keys, int* values, int n);
25
27
  void radix_sort_pairs_host(int64_t* keys, int* values, int n);
warp/native/sparse.cpp CHANGED
@@ -36,7 +36,7 @@ template <typename T> bool bsr_block_is_zero(int block_idx, int block_size, cons
36
36
  } // namespace
37
37
 
38
38
 
39
- WP_API void bsr_matrix_from_triplets_host(
39
+ WP_API void wp_bsr_matrix_from_triplets_host(
40
40
  int block_size,
41
41
  int scalar_size_in_bytes,
42
42
  int row_count,
@@ -64,8 +64,8 @@ WP_API void bsr_matrix_from_triplets_host(
64
64
  bool return_summed_blocks = tpl_block_offsets != nullptr && tpl_block_indices != nullptr;
65
65
  if (!return_summed_blocks)
66
66
  {
67
- tpl_block_offsets = static_cast<int*>(alloc_host(size_t(nnz) * sizeof(int)));
68
- tpl_block_indices = static_cast<int*>(alloc_host(size_t(nnz) * sizeof(int)));
67
+ tpl_block_offsets = static_cast<int*>(wp_alloc_host(size_t(nnz) * sizeof(int)));
68
+ tpl_block_indices = static_cast<int*>(wp_alloc_host(size_t(nnz) * sizeof(int)));
69
69
  }
70
70
 
71
71
  std::iota(tpl_block_indices, tpl_block_indices + nnz, 0);
@@ -156,8 +156,8 @@ WP_API void bsr_matrix_from_triplets_host(
156
156
  if(!return_summed_blocks)
157
157
  {
158
158
  // free our temporary buffers
159
- free_host(tpl_block_offsets);
160
- free_host(tpl_block_indices);
159
+ wp_free_host(tpl_block_offsets);
160
+ wp_free_host(tpl_block_indices);
161
161
  }
162
162
 
163
163
  if (bsr_nnz != nullptr)
@@ -166,7 +166,7 @@ WP_API void bsr_matrix_from_triplets_host(
166
166
  }
167
167
  }
168
168
 
169
- WP_API void bsr_transpose_host(
169
+ WP_API void wp_bsr_transpose_host(
170
170
  int row_count, int col_count, int nnz,
171
171
  const int* bsr_offsets, const int* bsr_columns,
172
172
  int* transposed_bsr_offsets,
@@ -209,7 +209,7 @@ WP_API void bsr_transpose_host(
209
209
  }
210
210
 
211
211
  #if !WP_ENABLE_CUDA
212
- WP_API void bsr_matrix_from_triplets_device(
212
+ WP_API void wp_bsr_matrix_from_triplets_device(
213
213
  int block_size,
214
214
  int scalar_size_in_bytes,
215
215
  int row_count,
@@ -229,7 +229,7 @@ WP_API void bsr_matrix_from_triplets_device(
229
229
  void* bsr_nnz_event) {}
230
230
 
231
231
 
232
- WP_API void bsr_transpose_device(
232
+ WP_API void wp_bsr_transpose_device(
233
233
  int row_count, int col_count, int nnz,
234
234
  const int* bsr_offsets, const int* bsr_columns,
235
235
  int* transposed_bsr_offsets,
warp/native/sparse.cu CHANGED
@@ -50,7 +50,7 @@ template <typename T> struct BsrBlockIsNotZero
50
50
  T zero_mask;
51
51
 
52
52
  BsrBlockIsNotZero(int block_size, const void* values, const uint64_t zero_mask)
53
- : block_size(block_size), values(static_cast<const T*>(values)), zero_mask(static_cast<const T>(zero_mask))
53
+ : block_size(block_size), values(static_cast<const T*>(values)), zero_mask(static_cast<T>(zero_mask))
54
54
  {}
55
55
 
56
56
  CUDA_CALLABLE_DEVICE bool operator()(int block) const
@@ -256,7 +256,7 @@ __global__ void bsr_transpose_fill_row_col(const int nnz_upper_bound, const int
256
256
  } // namespace
257
257
 
258
258
 
259
- WP_API void bsr_matrix_from_triplets_device(
259
+ WP_API void wp_bsr_matrix_from_triplets_device(
260
260
  const int block_size,
261
261
  int scalar_size,
262
262
  const int row_count,
@@ -274,13 +274,13 @@ WP_API void bsr_matrix_from_triplets_device(
274
274
  int* bsr_columns,
275
275
  int* bsr_nnz, void* bsr_nnz_event)
276
276
  {
277
- void* context = cuda_context_get_current();
277
+ void* context = wp_cuda_context_get_current();
278
278
  ContextGuard guard(context);
279
279
 
280
280
  // Per-context cached temporary buffers
281
281
  // BsrFromTripletsTemp& bsr_temp = g_bsr_from_triplets_temp_map[context];
282
282
 
283
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
283
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
284
284
 
285
285
  ScopedTemporary<BsrRowCol> combined_row_col(context, 2 * size_t(nnz));
286
286
  ScopedTemporary<int> unique_triplet_count(context, 1);
@@ -289,8 +289,8 @@ WP_API void bsr_matrix_from_triplets_device(
289
289
  if(!return_summed_blocks)
290
290
  {
291
291
  // if not provided, allocate temporary offset and indices buffers
292
- tpl_block_offsets = static_cast<int*>(alloc_device(context, size_t(nnz) * sizeof(int)));
293
- tpl_block_indices = static_cast<int*>(alloc_device(context, size_t(nnz) * sizeof(int)));
292
+ tpl_block_offsets = static_cast<int*>(wp_alloc_device(context, size_t(nnz) * sizeof(int)));
293
+ tpl_block_indices = static_cast<int*>(wp_alloc_device(context, size_t(nnz) * sizeof(int)));
294
294
  }
295
295
 
296
296
 
@@ -357,11 +357,11 @@ WP_API void bsr_matrix_from_triplets_device(
357
357
  {
358
358
  // Copy nnz to host, and record an event for the completed transfer if desired
359
359
 
360
- memcpy_d2h(WP_CURRENT_CONTEXT, bsr_nnz, bsr_offsets + row_count, sizeof(int), stream);
360
+ wp_memcpy_d2h(WP_CURRENT_CONTEXT, bsr_nnz, bsr_offsets + row_count, sizeof(int), stream);
361
361
 
362
362
  if (bsr_nnz_event)
363
363
  {
364
- cuda_event_record(bsr_nnz_event, stream);
364
+ wp_cuda_event_record(bsr_nnz_event, stream);
365
365
  }
366
366
  }
367
367
 
@@ -381,21 +381,21 @@ WP_API void bsr_matrix_from_triplets_device(
381
381
  stream));
382
382
  } else {
383
383
  // free our temporary buffers
384
- free_device(context, tpl_block_offsets);
385
- free_device(context, tpl_block_indices);
384
+ wp_free_device(context, tpl_block_offsets);
385
+ wp_free_device(context, tpl_block_indices);
386
386
  }
387
387
  }
388
388
 
389
389
 
390
- WP_API void bsr_transpose_device(int row_count, int col_count, int nnz,
390
+ WP_API void wp_bsr_transpose_device(int row_count, int col_count, int nnz,
391
391
  const int* bsr_offsets, const int* bsr_columns,
392
392
  int* transposed_bsr_offsets, int* transposed_bsr_columns,
393
393
  int* src_block_indices)
394
394
  {
395
- void* context = cuda_context_get_current();
395
+ void* context = wp_cuda_context_get_current();
396
396
  ContextGuard guard(context);
397
397
 
398
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
398
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
399
399
 
400
400
  ScopedTemporary<BsrRowCol> combined_row_col(context, 2 * nnz);
401
401