warp-lang 1.8.1__py3-none-manylinux_2_34_aarch64.whl → 1.9.0__py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (134) hide show
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +482 -110
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +47 -67
  7. warp/builtins.py +955 -137
  8. warp/codegen.py +312 -206
  9. warp/config.py +1 -1
  10. warp/context.py +1249 -784
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/fabric.py +1 -1
  18. warp/fem/cache.py +27 -19
  19. warp/fem/domain.py +2 -2
  20. warp/fem/field/nodal_field.py +2 -2
  21. warp/fem/field/virtual.py +264 -166
  22. warp/fem/geometry/geometry.py +5 -5
  23. warp/fem/integrate.py +129 -51
  24. warp/fem/space/restriction.py +4 -0
  25. warp/fem/space/shape/tet_shape_function.py +3 -10
  26. warp/jax_experimental/custom_call.py +1 -1
  27. warp/jax_experimental/ffi.py +2 -1
  28. warp/marching_cubes.py +708 -0
  29. warp/native/array.h +99 -4
  30. warp/native/builtin.h +82 -5
  31. warp/native/bvh.cpp +64 -28
  32. warp/native/bvh.cu +58 -58
  33. warp/native/bvh.h +2 -2
  34. warp/native/clang/clang.cpp +7 -7
  35. warp/native/coloring.cpp +8 -2
  36. warp/native/crt.cpp +2 -2
  37. warp/native/crt.h +3 -5
  38. warp/native/cuda_util.cpp +41 -10
  39. warp/native/cuda_util.h +10 -4
  40. warp/native/exports.h +1842 -1908
  41. warp/native/fabric.h +2 -1
  42. warp/native/hashgrid.cpp +37 -37
  43. warp/native/hashgrid.cu +2 -2
  44. warp/native/initializer_array.h +1 -1
  45. warp/native/intersect.h +2 -2
  46. warp/native/mat.h +1910 -116
  47. warp/native/mathdx.cpp +43 -43
  48. warp/native/mesh.cpp +24 -24
  49. warp/native/mesh.cu +26 -26
  50. warp/native/mesh.h +4 -2
  51. warp/native/nanovdb/GridHandle.h +179 -12
  52. warp/native/nanovdb/HostBuffer.h +8 -7
  53. warp/native/nanovdb/NanoVDB.h +517 -895
  54. warp/native/nanovdb/NodeManager.h +323 -0
  55. warp/native/nanovdb/PNanoVDB.h +2 -2
  56. warp/native/quat.h +331 -14
  57. warp/native/range.h +7 -1
  58. warp/native/reduce.cpp +10 -10
  59. warp/native/reduce.cu +13 -14
  60. warp/native/runlength_encode.cpp +2 -2
  61. warp/native/runlength_encode.cu +5 -5
  62. warp/native/scan.cpp +3 -3
  63. warp/native/scan.cu +4 -4
  64. warp/native/sort.cpp +10 -10
  65. warp/native/sort.cu +22 -22
  66. warp/native/sparse.cpp +8 -8
  67. warp/native/sparse.cu +13 -13
  68. warp/native/spatial.h +366 -17
  69. warp/native/temp_buffer.h +2 -2
  70. warp/native/tile.h +283 -69
  71. warp/native/vec.h +381 -14
  72. warp/native/volume.cpp +54 -54
  73. warp/native/volume.cu +1 -1
  74. warp/native/volume.h +2 -1
  75. warp/native/volume_builder.cu +30 -37
  76. warp/native/warp.cpp +150 -149
  77. warp/native/warp.cu +323 -192
  78. warp/native/warp.h +227 -226
  79. warp/optim/linear.py +736 -271
  80. warp/render/imgui_manager.py +289 -0
  81. warp/render/render_opengl.py +85 -6
  82. warp/sim/graph_coloring.py +2 -2
  83. warp/sparse.py +558 -175
  84. warp/tests/aux_test_module_aot.py +7 -0
  85. warp/tests/cuda/test_async.py +3 -3
  86. warp/tests/cuda/test_conditional_captures.py +101 -0
  87. warp/tests/geometry/test_marching_cubes.py +233 -12
  88. warp/tests/sim/test_coloring.py +6 -6
  89. warp/tests/test_array.py +56 -5
  90. warp/tests/test_codegen.py +3 -2
  91. warp/tests/test_context.py +8 -15
  92. warp/tests/test_enum.py +136 -0
  93. warp/tests/test_examples.py +2 -2
  94. warp/tests/test_fem.py +45 -2
  95. warp/tests/test_fixedarray.py +229 -0
  96. warp/tests/test_func.py +18 -15
  97. warp/tests/test_future_annotations.py +7 -5
  98. warp/tests/test_linear_solvers.py +30 -0
  99. warp/tests/test_map.py +1 -1
  100. warp/tests/test_mat.py +1518 -378
  101. warp/tests/test_mat_assign_copy.py +178 -0
  102. warp/tests/test_mat_constructors.py +574 -0
  103. warp/tests/test_module_aot.py +287 -0
  104. warp/tests/test_print.py +69 -0
  105. warp/tests/test_quat.py +140 -34
  106. warp/tests/test_quat_assign_copy.py +145 -0
  107. warp/tests/test_reload.py +2 -1
  108. warp/tests/test_sparse.py +71 -0
  109. warp/tests/test_spatial.py +140 -34
  110. warp/tests/test_spatial_assign_copy.py +160 -0
  111. warp/tests/test_struct.py +43 -3
  112. warp/tests/test_types.py +0 -20
  113. warp/tests/test_vec.py +179 -34
  114. warp/tests/test_vec_assign_copy.py +143 -0
  115. warp/tests/tile/test_tile.py +184 -18
  116. warp/tests/tile/test_tile_cholesky.py +605 -0
  117. warp/tests/tile/test_tile_load.py +169 -0
  118. warp/tests/tile/test_tile_mathdx.py +2 -558
  119. warp/tests/tile/test_tile_matmul.py +1 -1
  120. warp/tests/tile/test_tile_mlp.py +1 -1
  121. warp/tests/tile/test_tile_shared_memory.py +5 -5
  122. warp/tests/unittest_suites.py +6 -0
  123. warp/tests/walkthrough_debug.py +1 -1
  124. warp/thirdparty/unittest_parallel.py +108 -9
  125. warp/types.py +554 -264
  126. warp/utils.py +68 -86
  127. {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
  128. {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/RECORD +131 -121
  129. warp/native/marching.cpp +0 -19
  130. warp/native/marching.cu +0 -514
  131. warp/native/marching.h +0 -19
  132. {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
  133. {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
  134. {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
@@ -53,7 +53,7 @@ void runlength_encode_host(int n,
53
53
  }
54
54
  }
55
55
 
56
- void runlength_encode_int_host(
56
+ void wp_runlength_encode_int_host(
57
57
  uint64_t values,
58
58
  uint64_t run_values,
59
59
  uint64_t run_lengths,
@@ -68,7 +68,7 @@ void runlength_encode_int_host(
68
68
  }
69
69
 
70
70
  #if !WP_ENABLE_CUDA
71
- void runlength_encode_int_device(
71
+ void wp_runlength_encode_int_device(
72
72
  uint64_t values,
73
73
  uint64_t run_values,
74
74
  uint64_t run_lengths,
@@ -28,24 +28,24 @@ void runlength_encode_device(int n,
28
28
  int *run_lengths,
29
29
  int *run_count)
30
30
  {
31
- ContextGuard guard(cuda_context_get_current());
32
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
31
+ ContextGuard guard(wp_cuda_context_get_current());
32
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
33
33
 
34
34
  size_t buff_size = 0;
35
35
  check_cuda(cub::DeviceRunLengthEncode::Encode(
36
36
  nullptr, buff_size, values, run_values, run_lengths, run_count,
37
37
  n, stream));
38
38
 
39
- void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, buff_size);
39
+ void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, buff_size);
40
40
 
41
41
  check_cuda(cub::DeviceRunLengthEncode::Encode(
42
42
  temp_buffer, buff_size, values, run_values, run_lengths, run_count,
43
43
  n, stream));
44
44
 
45
- free_device(WP_CURRENT_CONTEXT, temp_buffer);
45
+ wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
46
46
  }
47
47
 
48
- void runlength_encode_int_device(
48
+ void wp_runlength_encode_int_device(
49
49
  uint64_t values,
50
50
  uint64_t run_values,
51
51
  uint64_t run_lengths,
warp/native/scan.cpp CHANGED
@@ -28,8 +28,8 @@ void scan_host(const T* values_in, T* values_out, int n, bool inclusive)
28
28
  // compute temporary memory required
29
29
  if (!inclusive && n > scan_temp_max_size)
30
30
  {
31
- free_host(scan_temp_memory);
32
- scan_temp_memory = alloc_host(sizeof(T) * n);
31
+ wp_free_host(scan_temp_memory);
32
+ scan_temp_memory = wp_alloc_host(sizeof(T) * n);
33
33
  scan_temp_max_size = n;
34
34
  }
35
35
 
@@ -39,7 +39,7 @@ void scan_host(const T* values_in, T* values_out, int n, bool inclusive)
39
39
  std::partial_sum(values_in, values_in + n, result);
40
40
  if (!inclusive) {
41
41
  values_out[0] = (T)0;
42
- memcpy_h2h(values_out + 1, result, sizeof(T) * (n - 1));
42
+ wp_memcpy_h2h(values_out + 1, result, sizeof(T) * (n - 1));
43
43
  }
44
44
  }
45
45
 
warp/native/scan.cu CHANGED
@@ -25,9 +25,9 @@
25
25
  template<typename T>
26
26
  void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
27
27
  {
28
- ContextGuard guard(cuda_context_get_current());
28
+ ContextGuard guard(wp_cuda_context_get_current());
29
29
 
30
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
30
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
31
31
 
32
32
  // compute temporary memory required
33
33
  size_t scan_temp_size;
@@ -37,7 +37,7 @@ void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
37
37
  check_cuda(cub::DeviceScan::ExclusiveSum(NULL, scan_temp_size, values_in, values_out, n));
38
38
  }
39
39
 
40
- void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, scan_temp_size);
40
+ void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, scan_temp_size);
41
41
 
42
42
  // scan
43
43
  if (inclusive) {
@@ -46,7 +46,7 @@ void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
46
46
  check_cuda(cub::DeviceScan::ExclusiveSum(temp_buffer, scan_temp_size, values_in, values_out, n, stream));
47
47
  }
48
48
 
49
- free_device(WP_CURRENT_CONTEXT, temp_buffer);
49
+ wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
50
50
  }
51
51
 
52
52
  template void scan_device(const int*, int*, int, bool);
warp/native/sort.cpp CHANGED
@@ -198,41 +198,41 @@ void segmented_sort_pairs_host(int* keys, int* values, int n, int* segment_start
198
198
 
199
199
  void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out) {}
200
200
 
201
- void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n) {}
201
+ void wp_radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n) {}
202
202
 
203
- void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n) {}
203
+ void wp_radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n) {}
204
204
 
205
- void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n) {}
205
+ void wp_radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n) {}
206
206
 
207
- void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
207
+ void wp_segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
208
208
 
209
- void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
209
+ void wp_segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
210
210
 
211
211
  #endif // !WP_ENABLE_CUDA
212
212
 
213
213
 
214
- void radix_sort_pairs_int_host(uint64_t keys, uint64_t values, int n)
214
+ void wp_radix_sort_pairs_int_host(uint64_t keys, uint64_t values, int n)
215
215
  {
216
216
  radix_sort_pairs_host(
217
217
  reinterpret_cast<int *>(keys),
218
218
  reinterpret_cast<int *>(values), n);
219
219
  }
220
220
 
221
- void radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n)
221
+ void wp_radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n)
222
222
  {
223
223
  radix_sort_pairs_host(
224
224
  reinterpret_cast<int64_t *>(keys),
225
225
  reinterpret_cast<int *>(values), n);
226
226
  }
227
227
 
228
- void radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n)
228
+ void wp_radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n)
229
229
  {
230
230
  radix_sort_pairs_host(
231
231
  reinterpret_cast<float *>(keys),
232
232
  reinterpret_cast<int *>(values), n);
233
233
  }
234
234
 
235
- void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
235
+ void wp_segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
236
236
  {
237
237
  segmented_sort_pairs_host(
238
238
  reinterpret_cast<float *>(keys),
@@ -241,7 +241,7 @@ void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint
241
241
  reinterpret_cast<int *>(segment_end_indices), num_segments);
242
242
  }
243
243
 
244
- void segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
244
+ void wp_segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
245
245
  {
246
246
  segmented_sort_pairs_host(
247
247
  reinterpret_cast<int *>(keys),
warp/native/sort.cu CHANGED
@@ -52,17 +52,17 @@ void radix_sort_reserve_internal(void* context, int n, void** mem_out, size_t* s
52
52
  d_keys,
53
53
  d_values,
54
54
  n, 0, sizeof(KeyType)*8,
55
- (cudaStream_t)cuda_stream_get_current()));
55
+ (cudaStream_t)wp_cuda_stream_get_current()));
56
56
 
57
57
  if (!context)
58
- context = cuda_context_get_current();
58
+ context = wp_cuda_context_get_current();
59
59
 
60
60
  RadixSortTemp& temp = g_radix_sort_temp_map[context];
61
61
 
62
62
  if (sort_temp_size > temp.size)
63
63
  {
64
- free_device(WP_CURRENT_CONTEXT, temp.mem);
65
- temp.mem = alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
64
+ wp_free_device(WP_CURRENT_CONTEXT, temp.mem);
65
+ temp.mem = wp_alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
66
66
  temp.size = sort_temp_size;
67
67
  }
68
68
 
@@ -95,13 +95,13 @@ void radix_sort_pairs_device(void* context, KeyType* keys, int* values, int n)
95
95
  d_keys,
96
96
  d_values,
97
97
  n, 0, sizeof(KeyType)*8,
98
- (cudaStream_t)cuda_stream_get_current()));
98
+ (cudaStream_t)wp_cuda_stream_get_current()));
99
99
 
100
100
  if (d_keys.Current() != keys)
101
- memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(KeyType)*n);
101
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(KeyType)*n);
102
102
 
103
103
  if (d_values.Current() != values)
104
- memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
104
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
105
105
  }
106
106
 
107
107
  void radix_sort_pairs_device(void* context, int* keys, int* values, int n)
@@ -119,7 +119,7 @@ void radix_sort_pairs_device(void* context, int64_t* keys, int* values, int n)
119
119
  radix_sort_pairs_device<int64_t>(context, keys, values, n);
120
120
  }
121
121
 
122
- void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
122
+ void wp_radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
123
123
  {
124
124
  radix_sort_pairs_device(
125
125
  WP_CURRENT_CONTEXT,
@@ -127,7 +127,7 @@ void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
127
127
  reinterpret_cast<int *>(values), n);
128
128
  }
129
129
 
130
- void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
130
+ void wp_radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
131
131
  {
132
132
  radix_sort_pairs_device(
133
133
  WP_CURRENT_CONTEXT,
@@ -135,7 +135,7 @@ void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
135
135
  reinterpret_cast<int *>(values), n);
136
136
  }
137
137
 
138
- void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n)
138
+ void wp_radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n)
139
139
  {
140
140
  radix_sort_pairs_device(
141
141
  WP_CURRENT_CONTEXT,
@@ -166,17 +166,17 @@ void segmented_sort_reserve(void* context, int n, int num_segments, void** mem_o
166
166
  end_indices,
167
167
  0,
168
168
  32,
169
- (cudaStream_t)cuda_stream_get_current()));
169
+ (cudaStream_t)wp_cuda_stream_get_current()));
170
170
 
171
171
  if (!context)
172
- context = cuda_context_get_current();
172
+ context = wp_cuda_context_get_current();
173
173
 
174
174
  RadixSortTemp& temp = g_radix_sort_temp_map[context];
175
175
 
176
176
  if (sort_temp_size > temp.size)
177
177
  {
178
- free_device(WP_CURRENT_CONTEXT, temp.mem);
179
- temp.mem = alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
178
+ wp_free_device(WP_CURRENT_CONTEXT, temp.mem);
179
+ temp.mem = wp_alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
180
180
  temp.size = sort_temp_size;
181
181
  }
182
182
 
@@ -211,16 +211,16 @@ void segmented_sort_pairs_device(void* context, float* keys, int* values, int n,
211
211
  segment_end_indices,
212
212
  0,
213
213
  32,
214
- (cudaStream_t)cuda_stream_get_current()));
214
+ (cudaStream_t)wp_cuda_stream_get_current()));
215
215
 
216
216
  if (d_keys.Current() != keys)
217
- memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
217
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
218
218
 
219
219
  if (d_values.Current() != values)
220
- memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
220
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
221
221
  }
222
222
 
223
- void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
223
+ void wp_segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
224
224
  {
225
225
  segmented_sort_pairs_device(
226
226
  WP_CURRENT_CONTEXT,
@@ -256,16 +256,16 @@ void segmented_sort_pairs_device(void* context, int* keys, int* values, int n, i
256
256
  segment_end_indices,
257
257
  0,
258
258
  32,
259
- (cudaStream_t)cuda_stream_get_current()));
259
+ (cudaStream_t)wp_cuda_stream_get_current()));
260
260
 
261
261
  if (d_keys.Current() != keys)
262
- memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
262
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
263
263
 
264
264
  if (d_values.Current() != values)
265
- memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
265
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
266
266
  }
267
267
 
268
- void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
268
+ void wp_segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
269
269
  {
270
270
  segmented_sort_pairs_device(
271
271
  WP_CURRENT_CONTEXT,
warp/native/sparse.cpp CHANGED
@@ -36,7 +36,7 @@ template <typename T> bool bsr_block_is_zero(int block_idx, int block_size, cons
36
36
  } // namespace
37
37
 
38
38
 
39
- WP_API void bsr_matrix_from_triplets_host(
39
+ WP_API void wp_bsr_matrix_from_triplets_host(
40
40
  int block_size,
41
41
  int scalar_size_in_bytes,
42
42
  int row_count,
@@ -64,8 +64,8 @@ WP_API void bsr_matrix_from_triplets_host(
64
64
  bool return_summed_blocks = tpl_block_offsets != nullptr && tpl_block_indices != nullptr;
65
65
  if (!return_summed_blocks)
66
66
  {
67
- tpl_block_offsets = static_cast<int*>(alloc_host(size_t(nnz) * sizeof(int)));
68
- tpl_block_indices = static_cast<int*>(alloc_host(size_t(nnz) * sizeof(int)));
67
+ tpl_block_offsets = static_cast<int*>(wp_alloc_host(size_t(nnz) * sizeof(int)));
68
+ tpl_block_indices = static_cast<int*>(wp_alloc_host(size_t(nnz) * sizeof(int)));
69
69
  }
70
70
 
71
71
  std::iota(tpl_block_indices, tpl_block_indices + nnz, 0);
@@ -156,8 +156,8 @@ WP_API void bsr_matrix_from_triplets_host(
156
156
  if(!return_summed_blocks)
157
157
  {
158
158
  // free our temporary buffers
159
- free_host(tpl_block_offsets);
160
- free_host(tpl_block_indices);
159
+ wp_free_host(tpl_block_offsets);
160
+ wp_free_host(tpl_block_indices);
161
161
  }
162
162
 
163
163
  if (bsr_nnz != nullptr)
@@ -166,7 +166,7 @@ WP_API void bsr_matrix_from_triplets_host(
166
166
  }
167
167
  }
168
168
 
169
- WP_API void bsr_transpose_host(
169
+ WP_API void wp_bsr_transpose_host(
170
170
  int row_count, int col_count, int nnz,
171
171
  const int* bsr_offsets, const int* bsr_columns,
172
172
  int* transposed_bsr_offsets,
@@ -209,7 +209,7 @@ WP_API void bsr_transpose_host(
209
209
  }
210
210
 
211
211
  #if !WP_ENABLE_CUDA
212
- WP_API void bsr_matrix_from_triplets_device(
212
+ WP_API void wp_bsr_matrix_from_triplets_device(
213
213
  int block_size,
214
214
  int scalar_size_in_bytes,
215
215
  int row_count,
@@ -229,7 +229,7 @@ WP_API void bsr_matrix_from_triplets_device(
229
229
  void* bsr_nnz_event) {}
230
230
 
231
231
 
232
- WP_API void bsr_transpose_device(
232
+ WP_API void wp_bsr_transpose_device(
233
233
  int row_count, int col_count, int nnz,
234
234
  const int* bsr_offsets, const int* bsr_columns,
235
235
  int* transposed_bsr_offsets,
warp/native/sparse.cu CHANGED
@@ -50,7 +50,7 @@ template <typename T> struct BsrBlockIsNotZero
50
50
  T zero_mask;
51
51
 
52
52
  BsrBlockIsNotZero(int block_size, const void* values, const uint64_t zero_mask)
53
- : block_size(block_size), values(static_cast<const T*>(values)), zero_mask(static_cast<const T>(zero_mask))
53
+ : block_size(block_size), values(static_cast<const T*>(values)), zero_mask(static_cast<T>(zero_mask))
54
54
  {}
55
55
 
56
56
  CUDA_CALLABLE_DEVICE bool operator()(int block) const
@@ -256,7 +256,7 @@ __global__ void bsr_transpose_fill_row_col(const int nnz_upper_bound, const int
256
256
  } // namespace
257
257
 
258
258
 
259
- WP_API void bsr_matrix_from_triplets_device(
259
+ WP_API void wp_bsr_matrix_from_triplets_device(
260
260
  const int block_size,
261
261
  int scalar_size,
262
262
  const int row_count,
@@ -274,13 +274,13 @@ WP_API void bsr_matrix_from_triplets_device(
274
274
  int* bsr_columns,
275
275
  int* bsr_nnz, void* bsr_nnz_event)
276
276
  {
277
- void* context = cuda_context_get_current();
277
+ void* context = wp_cuda_context_get_current();
278
278
  ContextGuard guard(context);
279
279
 
280
280
  // Per-context cached temporary buffers
281
281
  // BsrFromTripletsTemp& bsr_temp = g_bsr_from_triplets_temp_map[context];
282
282
 
283
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
283
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
284
284
 
285
285
  ScopedTemporary<BsrRowCol> combined_row_col(context, 2 * size_t(nnz));
286
286
  ScopedTemporary<int> unique_triplet_count(context, 1);
@@ -289,8 +289,8 @@ WP_API void bsr_matrix_from_triplets_device(
289
289
  if(!return_summed_blocks)
290
290
  {
291
291
  // if not provided, allocate temporary offset and indices buffers
292
- tpl_block_offsets = static_cast<int*>(alloc_device(context, size_t(nnz) * sizeof(int)));
293
- tpl_block_indices = static_cast<int*>(alloc_device(context, size_t(nnz) * sizeof(int)));
292
+ tpl_block_offsets = static_cast<int*>(wp_alloc_device(context, size_t(nnz) * sizeof(int)));
293
+ tpl_block_indices = static_cast<int*>(wp_alloc_device(context, size_t(nnz) * sizeof(int)));
294
294
  }
295
295
 
296
296
 
@@ -357,11 +357,11 @@ WP_API void bsr_matrix_from_triplets_device(
357
357
  {
358
358
  // Copy nnz to host, and record an event for the completed transfer if desired
359
359
 
360
- memcpy_d2h(WP_CURRENT_CONTEXT, bsr_nnz, bsr_offsets + row_count, sizeof(int), stream);
360
+ wp_memcpy_d2h(WP_CURRENT_CONTEXT, bsr_nnz, bsr_offsets + row_count, sizeof(int), stream);
361
361
 
362
362
  if (bsr_nnz_event)
363
363
  {
364
- cuda_event_record(bsr_nnz_event, stream);
364
+ wp_cuda_event_record(bsr_nnz_event, stream);
365
365
  }
366
366
  }
367
367
 
@@ -381,21 +381,21 @@ WP_API void bsr_matrix_from_triplets_device(
381
381
  stream));
382
382
  } else {
383
383
  // free our temporary buffers
384
- free_device(context, tpl_block_offsets);
385
- free_device(context, tpl_block_indices);
384
+ wp_free_device(context, tpl_block_offsets);
385
+ wp_free_device(context, tpl_block_indices);
386
386
  }
387
387
  }
388
388
 
389
389
 
390
- WP_API void bsr_transpose_device(int row_count, int col_count, int nnz,
390
+ WP_API void wp_bsr_transpose_device(int row_count, int col_count, int nnz,
391
391
  const int* bsr_offsets, const int* bsr_columns,
392
392
  int* transposed_bsr_offsets, int* transposed_bsr_columns,
393
393
  int* src_block_indices)
394
394
  {
395
- void* context = cuda_context_get_current();
395
+ void* context = wp_cuda_context_get_current();
396
396
  ContextGuard guard(context);
397
397
 
398
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
398
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
399
399
 
400
400
  ScopedTemporary<BsrRowCol> combined_row_col(context, 2 * nnz);
401
401