warp_lang-1.8.0-py3-none-win_amd64.whl → warp_lang-1.9.0-py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang has been flagged as potentially problematic; consult the registry's advisory page for this release for details.

Files changed (153)
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +482 -110
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +48 -63
  7. warp/builtins.py +955 -137
  8. warp/codegen.py +327 -209
  9. warp/config.py +1 -1
  10. warp/context.py +1363 -800
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/examples/interop/example_jax_callable.py +34 -4
  18. warp/examples/interop/example_jax_kernel.py +27 -1
  19. warp/fabric.py +1 -1
  20. warp/fem/cache.py +27 -19
  21. warp/fem/domain.py +2 -2
  22. warp/fem/field/nodal_field.py +2 -2
  23. warp/fem/field/virtual.py +266 -166
  24. warp/fem/geometry/geometry.py +5 -5
  25. warp/fem/integrate.py +200 -91
  26. warp/fem/space/restriction.py +4 -0
  27. warp/fem/space/shape/tet_shape_function.py +3 -10
  28. warp/jax_experimental/custom_call.py +1 -1
  29. warp/jax_experimental/ffi.py +203 -54
  30. warp/marching_cubes.py +708 -0
  31. warp/native/array.h +103 -8
  32. warp/native/builtin.h +90 -9
  33. warp/native/bvh.cpp +64 -28
  34. warp/native/bvh.cu +58 -58
  35. warp/native/bvh.h +2 -2
  36. warp/native/clang/clang.cpp +7 -7
  37. warp/native/coloring.cpp +13 -3
  38. warp/native/crt.cpp +2 -2
  39. warp/native/crt.h +3 -5
  40. warp/native/cuda_util.cpp +42 -11
  41. warp/native/cuda_util.h +10 -4
  42. warp/native/exports.h +1842 -1908
  43. warp/native/fabric.h +2 -1
  44. warp/native/hashgrid.cpp +37 -37
  45. warp/native/hashgrid.cu +2 -2
  46. warp/native/initializer_array.h +1 -1
  47. warp/native/intersect.h +4 -4
  48. warp/native/mat.h +1913 -119
  49. warp/native/mathdx.cpp +43 -43
  50. warp/native/mesh.cpp +24 -24
  51. warp/native/mesh.cu +26 -26
  52. warp/native/mesh.h +5 -3
  53. warp/native/nanovdb/GridHandle.h +179 -12
  54. warp/native/nanovdb/HostBuffer.h +8 -7
  55. warp/native/nanovdb/NanoVDB.h +517 -895
  56. warp/native/nanovdb/NodeManager.h +323 -0
  57. warp/native/nanovdb/PNanoVDB.h +2 -2
  58. warp/native/quat.h +337 -16
  59. warp/native/rand.h +7 -7
  60. warp/native/range.h +7 -1
  61. warp/native/reduce.cpp +10 -10
  62. warp/native/reduce.cu +13 -14
  63. warp/native/runlength_encode.cpp +2 -2
  64. warp/native/runlength_encode.cu +5 -5
  65. warp/native/scan.cpp +3 -3
  66. warp/native/scan.cu +4 -4
  67. warp/native/sort.cpp +10 -10
  68. warp/native/sort.cu +22 -22
  69. warp/native/sparse.cpp +8 -8
  70. warp/native/sparse.cu +14 -14
  71. warp/native/spatial.h +366 -17
  72. warp/native/svd.h +23 -8
  73. warp/native/temp_buffer.h +2 -2
  74. warp/native/tile.h +303 -70
  75. warp/native/tile_radix_sort.h +5 -1
  76. warp/native/tile_reduce.h +16 -25
  77. warp/native/tuple.h +2 -2
  78. warp/native/vec.h +385 -18
  79. warp/native/volume.cpp +54 -54
  80. warp/native/volume.cu +1 -1
  81. warp/native/volume.h +2 -1
  82. warp/native/volume_builder.cu +30 -37
  83. warp/native/warp.cpp +150 -149
  84. warp/native/warp.cu +337 -193
  85. warp/native/warp.h +227 -226
  86. warp/optim/linear.py +736 -271
  87. warp/render/imgui_manager.py +289 -0
  88. warp/render/render_opengl.py +137 -57
  89. warp/render/render_usd.py +0 -1
  90. warp/sim/collide.py +1 -2
  91. warp/sim/graph_coloring.py +2 -2
  92. warp/sim/integrator_vbd.py +10 -2
  93. warp/sparse.py +559 -176
  94. warp/tape.py +2 -0
  95. warp/tests/aux_test_module_aot.py +7 -0
  96. warp/tests/cuda/test_async.py +3 -3
  97. warp/tests/cuda/test_conditional_captures.py +101 -0
  98. warp/tests/geometry/test_marching_cubes.py +233 -12
  99. warp/tests/sim/test_cloth.py +89 -6
  100. warp/tests/sim/test_coloring.py +82 -7
  101. warp/tests/test_array.py +56 -5
  102. warp/tests/test_assert.py +53 -0
  103. warp/tests/test_atomic_cas.py +127 -114
  104. warp/tests/test_codegen.py +3 -2
  105. warp/tests/test_context.py +8 -15
  106. warp/tests/test_enum.py +136 -0
  107. warp/tests/test_examples.py +2 -2
  108. warp/tests/test_fem.py +45 -2
  109. warp/tests/test_fixedarray.py +229 -0
  110. warp/tests/test_func.py +18 -15
  111. warp/tests/test_future_annotations.py +7 -5
  112. warp/tests/test_linear_solvers.py +30 -0
  113. warp/tests/test_map.py +1 -1
  114. warp/tests/test_mat.py +1540 -378
  115. warp/tests/test_mat_assign_copy.py +178 -0
  116. warp/tests/test_mat_constructors.py +574 -0
  117. warp/tests/test_module_aot.py +287 -0
  118. warp/tests/test_print.py +69 -0
  119. warp/tests/test_quat.py +162 -34
  120. warp/tests/test_quat_assign_copy.py +145 -0
  121. warp/tests/test_reload.py +2 -1
  122. warp/tests/test_sparse.py +103 -0
  123. warp/tests/test_spatial.py +140 -34
  124. warp/tests/test_spatial_assign_copy.py +160 -0
  125. warp/tests/test_static.py +48 -0
  126. warp/tests/test_struct.py +43 -3
  127. warp/tests/test_tape.py +38 -0
  128. warp/tests/test_types.py +0 -20
  129. warp/tests/test_vec.py +216 -441
  130. warp/tests/test_vec_assign_copy.py +143 -0
  131. warp/tests/test_vec_constructors.py +325 -0
  132. warp/tests/tile/test_tile.py +206 -152
  133. warp/tests/tile/test_tile_cholesky.py +605 -0
  134. warp/tests/tile/test_tile_load.py +169 -0
  135. warp/tests/tile/test_tile_mathdx.py +2 -558
  136. warp/tests/tile/test_tile_matmul.py +179 -0
  137. warp/tests/tile/test_tile_mlp.py +1 -1
  138. warp/tests/tile/test_tile_reduce.py +100 -11
  139. warp/tests/tile/test_tile_shared_memory.py +16 -16
  140. warp/tests/tile/test_tile_sort.py +59 -55
  141. warp/tests/unittest_suites.py +16 -0
  142. warp/tests/walkthrough_debug.py +1 -1
  143. warp/thirdparty/unittest_parallel.py +108 -9
  144. warp/types.py +554 -264
  145. warp/utils.py +68 -86
  146. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
  147. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/RECORD +150 -138
  148. warp/native/marching.cpp +0 -19
  149. warp/native/marching.cu +0 -514
  150. warp/native/marching.h +0 -19
  151. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
  152. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
  153. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
warp/native/reduce.cu CHANGED
@@ -22,7 +22,6 @@
22
22
 
23
23
  #define THRUST_IGNORE_CUB_VERSION_CHECK
24
24
  #include <cub/device/device_reduce.cuh>
25
- #include <cub/iterator/counting_input_iterator.cuh>
26
25
 
27
26
  namespace
28
27
  {
@@ -119,14 +118,14 @@ template <typename T> void array_sum_device(const T *ptr_a, T *ptr_out, int coun
119
118
  assert((byte_stride % sizeof(T)) == 0);
120
119
  const int stride = byte_stride / sizeof(T);
121
120
 
122
- ContextGuard guard(cuda_context_get_current());
123
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
121
+ ContextGuard guard(wp_cuda_context_get_current());
122
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
124
123
 
125
124
  cub_strided_iterator<const T> ptr_strided{ptr_a, stride};
126
125
 
127
126
  size_t buff_size = 0;
128
127
  check_cuda(cub::DeviceReduce::Sum(nullptr, buff_size, ptr_strided, ptr_out, count, stream));
129
- void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, buff_size);
128
+ void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, buff_size);
130
129
 
131
130
  for (int k = 0; k < type_length; ++k)
132
131
  {
@@ -134,7 +133,7 @@ template <typename T> void array_sum_device(const T *ptr_a, T *ptr_out, int coun
134
133
  check_cuda(cub::DeviceReduce::Sum(temp_buffer, buff_size, ptr_strided, ptr_out + k, count, stream));
135
134
  }
136
135
 
137
- free_device(WP_CURRENT_CONTEXT, temp_buffer);
136
+ wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
138
137
  }
139
138
 
140
139
  template <typename T>
@@ -280,18 +279,18 @@ void array_inner_device(const ElemT *ptr_a, const ElemT *ptr_b, ScalarT *ptr_out
280
279
  const int stride_a = byte_stride_a / sizeof(ElemT);
281
280
  const int stride_b = byte_stride_b / sizeof(ElemT);
282
281
 
283
- ContextGuard guard(cuda_context_get_current());
284
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
282
+ ContextGuard guard(wp_cuda_context_get_current());
283
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
285
284
 
286
285
  cub_inner_product_iterator<ElemT, ScalarT> inner_iterator{ptr_a, ptr_b, stride_a, stride_b, type_length};
287
286
 
288
287
  size_t buff_size = 0;
289
288
  check_cuda(cub::DeviceReduce::Sum(nullptr, buff_size, inner_iterator, ptr_out, count, stream));
290
- void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, buff_size);
289
+ void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, buff_size);
291
290
 
292
291
  check_cuda(cub::DeviceReduce::Sum(temp_buffer, buff_size, inner_iterator, ptr_out, count, stream));
293
292
 
294
- free_device(WP_CURRENT_CONTEXT, temp_buffer);
293
+ wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
295
294
  }
296
295
 
297
296
  template <typename T>
@@ -327,10 +326,10 @@ void array_inner_device_dispatch(const T *ptr_a, const T *ptr_b, T *ptr_out, int
327
326
 
328
327
  } // anonymous namespace
329
328
 
330
- void array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
329
+ void wp_array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
331
330
  int type_len)
332
331
  {
333
- void *context = cuda_context_get_current();
332
+ void *context = wp_cuda_context_get_current();
334
333
 
335
334
  const float *ptr_a = (const float *)(a);
336
335
  const float *ptr_b = (const float *)(b);
@@ -339,7 +338,7 @@ void array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, i
339
338
  array_inner_device_dispatch(ptr_a, ptr_b, ptr_out, count, byte_stride_a, byte_stride_b, type_len);
340
339
  }
341
340
 
342
- void array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
341
+ void wp_array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
343
342
  int type_len)
344
343
  {
345
344
  const double *ptr_a = (const double *)(a);
@@ -349,14 +348,14 @@ void array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count,
349
348
  array_inner_device_dispatch(ptr_a, ptr_b, ptr_out, count, byte_stride_a, byte_stride_b, type_len);
350
349
  }
351
350
 
352
- void array_sum_float_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
351
+ void wp_array_sum_float_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
353
352
  {
354
353
  const float *ptr_a = (const float *)(a);
355
354
  float *ptr_out = (float *)(out);
356
355
  array_sum_device_dispatch(ptr_a, ptr_out, count, byte_stride, type_length);
357
356
  }
358
357
 
359
- void array_sum_double_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
358
+ void wp_array_sum_double_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
360
359
  {
361
360
  const double *ptr_a = (const double *)(a);
362
361
  double *ptr_out = (double *)(out);
@@ -53,7 +53,7 @@ void runlength_encode_host(int n,
53
53
  }
54
54
  }
55
55
 
56
- void runlength_encode_int_host(
56
+ void wp_runlength_encode_int_host(
57
57
  uint64_t values,
58
58
  uint64_t run_values,
59
59
  uint64_t run_lengths,
@@ -68,7 +68,7 @@ void runlength_encode_int_host(
68
68
  }
69
69
 
70
70
  #if !WP_ENABLE_CUDA
71
- void runlength_encode_int_device(
71
+ void wp_runlength_encode_int_device(
72
72
  uint64_t values,
73
73
  uint64_t run_values,
74
74
  uint64_t run_lengths,
@@ -28,24 +28,24 @@ void runlength_encode_device(int n,
28
28
  int *run_lengths,
29
29
  int *run_count)
30
30
  {
31
- ContextGuard guard(cuda_context_get_current());
32
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
31
+ ContextGuard guard(wp_cuda_context_get_current());
32
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
33
33
 
34
34
  size_t buff_size = 0;
35
35
  check_cuda(cub::DeviceRunLengthEncode::Encode(
36
36
  nullptr, buff_size, values, run_values, run_lengths, run_count,
37
37
  n, stream));
38
38
 
39
- void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, buff_size);
39
+ void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, buff_size);
40
40
 
41
41
  check_cuda(cub::DeviceRunLengthEncode::Encode(
42
42
  temp_buffer, buff_size, values, run_values, run_lengths, run_count,
43
43
  n, stream));
44
44
 
45
- free_device(WP_CURRENT_CONTEXT, temp_buffer);
45
+ wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
46
46
  }
47
47
 
48
- void runlength_encode_int_device(
48
+ void wp_runlength_encode_int_device(
49
49
  uint64_t values,
50
50
  uint64_t run_values,
51
51
  uint64_t run_lengths,
warp/native/scan.cpp CHANGED
@@ -28,8 +28,8 @@ void scan_host(const T* values_in, T* values_out, int n, bool inclusive)
28
28
  // compute temporary memory required
29
29
  if (!inclusive && n > scan_temp_max_size)
30
30
  {
31
- free_host(scan_temp_memory);
32
- scan_temp_memory = alloc_host(sizeof(T) * n);
31
+ wp_free_host(scan_temp_memory);
32
+ scan_temp_memory = wp_alloc_host(sizeof(T) * n);
33
33
  scan_temp_max_size = n;
34
34
  }
35
35
 
@@ -39,7 +39,7 @@ void scan_host(const T* values_in, T* values_out, int n, bool inclusive)
39
39
  std::partial_sum(values_in, values_in + n, result);
40
40
  if (!inclusive) {
41
41
  values_out[0] = (T)0;
42
- memcpy_h2h(values_out + 1, result, sizeof(T) * (n - 1));
42
+ wp_memcpy_h2h(values_out + 1, result, sizeof(T) * (n - 1));
43
43
  }
44
44
  }
45
45
 
warp/native/scan.cu CHANGED
@@ -25,9 +25,9 @@
25
25
  template<typename T>
26
26
  void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
27
27
  {
28
- ContextGuard guard(cuda_context_get_current());
28
+ ContextGuard guard(wp_cuda_context_get_current());
29
29
 
30
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
30
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
31
31
 
32
32
  // compute temporary memory required
33
33
  size_t scan_temp_size;
@@ -37,7 +37,7 @@ void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
37
37
  check_cuda(cub::DeviceScan::ExclusiveSum(NULL, scan_temp_size, values_in, values_out, n));
38
38
  }
39
39
 
40
- void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, scan_temp_size);
40
+ void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, scan_temp_size);
41
41
 
42
42
  // scan
43
43
  if (inclusive) {
@@ -46,7 +46,7 @@ void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
46
46
  check_cuda(cub::DeviceScan::ExclusiveSum(temp_buffer, scan_temp_size, values_in, values_out, n, stream));
47
47
  }
48
48
 
49
- free_device(WP_CURRENT_CONTEXT, temp_buffer);
49
+ wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
50
50
  }
51
51
 
52
52
  template void scan_device(const int*, int*, int, bool);
warp/native/sort.cpp CHANGED
@@ -198,41 +198,41 @@ void segmented_sort_pairs_host(int* keys, int* values, int n, int* segment_start
198
198
 
199
199
  void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out) {}
200
200
 
201
- void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n) {}
201
+ void wp_radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n) {}
202
202
 
203
- void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n) {}
203
+ void wp_radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n) {}
204
204
 
205
- void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n) {}
205
+ void wp_radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n) {}
206
206
 
207
- void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
207
+ void wp_segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
208
208
 
209
- void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
209
+ void wp_segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
210
210
 
211
211
  #endif // !WP_ENABLE_CUDA
212
212
 
213
213
 
214
- void radix_sort_pairs_int_host(uint64_t keys, uint64_t values, int n)
214
+ void wp_radix_sort_pairs_int_host(uint64_t keys, uint64_t values, int n)
215
215
  {
216
216
  radix_sort_pairs_host(
217
217
  reinterpret_cast<int *>(keys),
218
218
  reinterpret_cast<int *>(values), n);
219
219
  }
220
220
 
221
- void radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n)
221
+ void wp_radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n)
222
222
  {
223
223
  radix_sort_pairs_host(
224
224
  reinterpret_cast<int64_t *>(keys),
225
225
  reinterpret_cast<int *>(values), n);
226
226
  }
227
227
 
228
- void radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n)
228
+ void wp_radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n)
229
229
  {
230
230
  radix_sort_pairs_host(
231
231
  reinterpret_cast<float *>(keys),
232
232
  reinterpret_cast<int *>(values), n);
233
233
  }
234
234
 
235
- void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
235
+ void wp_segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
236
236
  {
237
237
  segmented_sort_pairs_host(
238
238
  reinterpret_cast<float *>(keys),
@@ -241,7 +241,7 @@ void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint
241
241
  reinterpret_cast<int *>(segment_end_indices), num_segments);
242
242
  }
243
243
 
244
- void segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
244
+ void wp_segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
245
245
  {
246
246
  segmented_sort_pairs_host(
247
247
  reinterpret_cast<int *>(keys),
warp/native/sort.cu CHANGED
@@ -52,17 +52,17 @@ void radix_sort_reserve_internal(void* context, int n, void** mem_out, size_t* s
52
52
  d_keys,
53
53
  d_values,
54
54
  n, 0, sizeof(KeyType)*8,
55
- (cudaStream_t)cuda_stream_get_current()));
55
+ (cudaStream_t)wp_cuda_stream_get_current()));
56
56
 
57
57
  if (!context)
58
- context = cuda_context_get_current();
58
+ context = wp_cuda_context_get_current();
59
59
 
60
60
  RadixSortTemp& temp = g_radix_sort_temp_map[context];
61
61
 
62
62
  if (sort_temp_size > temp.size)
63
63
  {
64
- free_device(WP_CURRENT_CONTEXT, temp.mem);
65
- temp.mem = alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
64
+ wp_free_device(WP_CURRENT_CONTEXT, temp.mem);
65
+ temp.mem = wp_alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
66
66
  temp.size = sort_temp_size;
67
67
  }
68
68
 
@@ -95,13 +95,13 @@ void radix_sort_pairs_device(void* context, KeyType* keys, int* values, int n)
95
95
  d_keys,
96
96
  d_values,
97
97
  n, 0, sizeof(KeyType)*8,
98
- (cudaStream_t)cuda_stream_get_current()));
98
+ (cudaStream_t)wp_cuda_stream_get_current()));
99
99
 
100
100
  if (d_keys.Current() != keys)
101
- memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(KeyType)*n);
101
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(KeyType)*n);
102
102
 
103
103
  if (d_values.Current() != values)
104
- memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
104
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
105
105
  }
106
106
 
107
107
  void radix_sort_pairs_device(void* context, int* keys, int* values, int n)
@@ -119,7 +119,7 @@ void radix_sort_pairs_device(void* context, int64_t* keys, int* values, int n)
119
119
  radix_sort_pairs_device<int64_t>(context, keys, values, n);
120
120
  }
121
121
 
122
- void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
122
+ void wp_radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
123
123
  {
124
124
  radix_sort_pairs_device(
125
125
  WP_CURRENT_CONTEXT,
@@ -127,7 +127,7 @@ void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
127
127
  reinterpret_cast<int *>(values), n);
128
128
  }
129
129
 
130
- void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
130
+ void wp_radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
131
131
  {
132
132
  radix_sort_pairs_device(
133
133
  WP_CURRENT_CONTEXT,
@@ -135,7 +135,7 @@ void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
135
135
  reinterpret_cast<int *>(values), n);
136
136
  }
137
137
 
138
- void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n)
138
+ void wp_radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n)
139
139
  {
140
140
  radix_sort_pairs_device(
141
141
  WP_CURRENT_CONTEXT,
@@ -166,17 +166,17 @@ void segmented_sort_reserve(void* context, int n, int num_segments, void** mem_o
166
166
  end_indices,
167
167
  0,
168
168
  32,
169
- (cudaStream_t)cuda_stream_get_current()));
169
+ (cudaStream_t)wp_cuda_stream_get_current()));
170
170
 
171
171
  if (!context)
172
- context = cuda_context_get_current();
172
+ context = wp_cuda_context_get_current();
173
173
 
174
174
  RadixSortTemp& temp = g_radix_sort_temp_map[context];
175
175
 
176
176
  if (sort_temp_size > temp.size)
177
177
  {
178
- free_device(WP_CURRENT_CONTEXT, temp.mem);
179
- temp.mem = alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
178
+ wp_free_device(WP_CURRENT_CONTEXT, temp.mem);
179
+ temp.mem = wp_alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
180
180
  temp.size = sort_temp_size;
181
181
  }
182
182
 
@@ -211,16 +211,16 @@ void segmented_sort_pairs_device(void* context, float* keys, int* values, int n,
211
211
  segment_end_indices,
212
212
  0,
213
213
  32,
214
- (cudaStream_t)cuda_stream_get_current()));
214
+ (cudaStream_t)wp_cuda_stream_get_current()));
215
215
 
216
216
  if (d_keys.Current() != keys)
217
- memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
217
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
218
218
 
219
219
  if (d_values.Current() != values)
220
- memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
220
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
221
221
  }
222
222
 
223
- void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
223
+ void wp_segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
224
224
  {
225
225
  segmented_sort_pairs_device(
226
226
  WP_CURRENT_CONTEXT,
@@ -256,16 +256,16 @@ void segmented_sort_pairs_device(void* context, int* keys, int* values, int n, i
256
256
  segment_end_indices,
257
257
  0,
258
258
  32,
259
- (cudaStream_t)cuda_stream_get_current()));
259
+ (cudaStream_t)wp_cuda_stream_get_current()));
260
260
 
261
261
  if (d_keys.Current() != keys)
262
- memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
262
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
263
263
 
264
264
  if (d_values.Current() != values)
265
- memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
265
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
266
266
  }
267
267
 
268
- void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
268
+ void wp_segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
269
269
  {
270
270
  segmented_sort_pairs_device(
271
271
  WP_CURRENT_CONTEXT,
warp/native/sparse.cpp CHANGED
@@ -36,7 +36,7 @@ template <typename T> bool bsr_block_is_zero(int block_idx, int block_size, cons
36
36
  } // namespace
37
37
 
38
38
 
39
- WP_API void bsr_matrix_from_triplets_host(
39
+ WP_API void wp_bsr_matrix_from_triplets_host(
40
40
  int block_size,
41
41
  int scalar_size_in_bytes,
42
42
  int row_count,
@@ -64,8 +64,8 @@ WP_API void bsr_matrix_from_triplets_host(
64
64
  bool return_summed_blocks = tpl_block_offsets != nullptr && tpl_block_indices != nullptr;
65
65
  if (!return_summed_blocks)
66
66
  {
67
- tpl_block_offsets = static_cast<int*>(alloc_host(size_t(nnz) * sizeof(int)));
68
- tpl_block_indices = static_cast<int*>(alloc_host(size_t(nnz) * sizeof(int)));
67
+ tpl_block_offsets = static_cast<int*>(wp_alloc_host(size_t(nnz) * sizeof(int)));
68
+ tpl_block_indices = static_cast<int*>(wp_alloc_host(size_t(nnz) * sizeof(int)));
69
69
  }
70
70
 
71
71
  std::iota(tpl_block_indices, tpl_block_indices + nnz, 0);
@@ -156,8 +156,8 @@ WP_API void bsr_matrix_from_triplets_host(
156
156
  if(!return_summed_blocks)
157
157
  {
158
158
  // free our temporary buffers
159
- free_host(tpl_block_offsets);
160
- free_host(tpl_block_indices);
159
+ wp_free_host(tpl_block_offsets);
160
+ wp_free_host(tpl_block_indices);
161
161
  }
162
162
 
163
163
  if (bsr_nnz != nullptr)
@@ -166,7 +166,7 @@ WP_API void bsr_matrix_from_triplets_host(
166
166
  }
167
167
  }
168
168
 
169
- WP_API void bsr_transpose_host(
169
+ WP_API void wp_bsr_transpose_host(
170
170
  int row_count, int col_count, int nnz,
171
171
  const int* bsr_offsets, const int* bsr_columns,
172
172
  int* transposed_bsr_offsets,
@@ -209,7 +209,7 @@ WP_API void bsr_transpose_host(
209
209
  }
210
210
 
211
211
  #if !WP_ENABLE_CUDA
212
- WP_API void bsr_matrix_from_triplets_device(
212
+ WP_API void wp_bsr_matrix_from_triplets_device(
213
213
  int block_size,
214
214
  int scalar_size_in_bytes,
215
215
  int row_count,
@@ -229,7 +229,7 @@ WP_API void bsr_matrix_from_triplets_device(
229
229
  void* bsr_nnz_event) {}
230
230
 
231
231
 
232
- WP_API void bsr_transpose_device(
232
+ WP_API void wp_bsr_transpose_device(
233
233
  int row_count, int col_count, int nnz,
234
234
  const int* bsr_offsets, const int* bsr_columns,
235
235
  int* transposed_bsr_offsets,
warp/native/sparse.cu CHANGED
@@ -50,7 +50,7 @@ template <typename T> struct BsrBlockIsNotZero
50
50
  T zero_mask;
51
51
 
52
52
  BsrBlockIsNotZero(int block_size, const void* values, const uint64_t zero_mask)
53
- : block_size(block_size), values(static_cast<const T*>(values)), zero_mask(static_cast<const T>(zero_mask))
53
+ : block_size(block_size), values(static_cast<const T*>(values)), zero_mask(static_cast<T>(zero_mask))
54
54
  {}
55
55
 
56
56
  CUDA_CALLABLE_DEVICE bool operator()(int block) const
@@ -256,7 +256,7 @@ __global__ void bsr_transpose_fill_row_col(const int nnz_upper_bound, const int
256
256
  } // namespace
257
257
 
258
258
 
259
- WP_API void bsr_matrix_from_triplets_device(
259
+ WP_API void wp_bsr_matrix_from_triplets_device(
260
260
  const int block_size,
261
261
  int scalar_size,
262
262
  const int row_count,
@@ -274,13 +274,13 @@ WP_API void bsr_matrix_from_triplets_device(
274
274
  int* bsr_columns,
275
275
  int* bsr_nnz, void* bsr_nnz_event)
276
276
  {
277
- void* context = cuda_context_get_current();
277
+ void* context = wp_cuda_context_get_current();
278
278
  ContextGuard guard(context);
279
279
 
280
280
  // Per-context cached temporary buffers
281
281
  // BsrFromTripletsTemp& bsr_temp = g_bsr_from_triplets_temp_map[context];
282
282
 
283
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
283
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
284
284
 
285
285
  ScopedTemporary<BsrRowCol> combined_row_col(context, 2 * size_t(nnz));
286
286
  ScopedTemporary<int> unique_triplet_count(context, 1);
@@ -289,8 +289,8 @@ WP_API void bsr_matrix_from_triplets_device(
289
289
  if(!return_summed_blocks)
290
290
  {
291
291
  // if not provided, allocate temporary offset and indices buffers
292
- tpl_block_offsets = static_cast<int*>(alloc_device(context, size_t(nnz) * sizeof(int)));
293
- tpl_block_indices = static_cast<int*>(alloc_device(context, size_t(nnz) * sizeof(int)));
292
+ tpl_block_offsets = static_cast<int*>(wp_alloc_device(context, size_t(nnz) * sizeof(int)));
293
+ tpl_block_indices = static_cast<int*>(wp_alloc_device(context, size_t(nnz) * sizeof(int)));
294
294
  }
295
295
 
296
296
 
@@ -334,7 +334,7 @@ WP_API void bsr_matrix_from_triplets_device(
334
334
  // Ensures the sorted keys are available in summed_block_indices if needed
335
335
  if(return_summed_blocks && d_keys.Current() != tpl_block_indices)
336
336
  {
337
- check_cuda(cudaMemcpy(tpl_block_indices, d_keys.Current(), nnz * sizeof(int), cudaMemcpyDeviceToDevice));
337
+ check_cuda(cudaMemcpyAsync(tpl_block_indices, d_keys.Current(), nnz * sizeof(int), cudaMemcpyDeviceToDevice, stream));
338
338
  }
339
339
  }
340
340
 
@@ -357,11 +357,11 @@ WP_API void bsr_matrix_from_triplets_device(
357
357
  {
358
358
  // Copy nnz to host, and record an event for the completed transfer if desired
359
359
 
360
- memcpy_d2h(WP_CURRENT_CONTEXT, bsr_nnz, bsr_offsets + row_count, sizeof(int), stream);
360
+ wp_memcpy_d2h(WP_CURRENT_CONTEXT, bsr_nnz, bsr_offsets + row_count, sizeof(int), stream);
361
361
 
362
362
  if (bsr_nnz_event)
363
363
  {
364
- cuda_event_record(bsr_nnz_event, stream);
364
+ wp_cuda_event_record(bsr_nnz_event, stream);
365
365
  }
366
366
  }
367
367
 
@@ -381,21 +381,21 @@ WP_API void bsr_matrix_from_triplets_device(
381
381
  stream));
382
382
  } else {
383
383
  // free our temporary buffers
384
- free_device(context, tpl_block_offsets);
385
- free_device(context, tpl_block_indices);
384
+ wp_free_device(context, tpl_block_offsets);
385
+ wp_free_device(context, tpl_block_indices);
386
386
  }
387
387
  }
388
388
 
389
389
 
390
- WP_API void bsr_transpose_device(int row_count, int col_count, int nnz,
390
+ WP_API void wp_bsr_transpose_device(int row_count, int col_count, int nnz,
391
391
  const int* bsr_offsets, const int* bsr_columns,
392
392
  int* transposed_bsr_offsets, int* transposed_bsr_columns,
393
393
  int* src_block_indices)
394
394
  {
395
- void* context = cuda_context_get_current();
395
+ void* context = wp_cuda_context_get_current();
396
396
  ContextGuard guard(context);
397
397
 
398
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
398
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
399
399
 
400
400
  ScopedTemporary<BsrRowCol> combined_row_col(context, 2 * nnz);
401
401