warp-lang 1.8.1__py3-none-manylinux_2_34_aarch64.whl → 1.9.1__py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic; consult the package registry's advisory page for more details.

Files changed (141)
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +1904 -114
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +331 -101
  7. warp/builtins.py +1244 -160
  8. warp/codegen.py +317 -206
  9. warp/config.py +1 -1
  10. warp/context.py +1465 -789
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/examples/interop/example_jax_kernel.py +2 -1
  18. warp/fabric.py +1 -1
  19. warp/fem/cache.py +27 -19
  20. warp/fem/domain.py +2 -2
  21. warp/fem/field/nodal_field.py +2 -2
  22. warp/fem/field/virtual.py +264 -166
  23. warp/fem/geometry/geometry.py +5 -5
  24. warp/fem/integrate.py +129 -51
  25. warp/fem/space/restriction.py +4 -0
  26. warp/fem/space/shape/tet_shape_function.py +3 -10
  27. warp/jax_experimental/custom_call.py +25 -2
  28. warp/jax_experimental/ffi.py +22 -1
  29. warp/jax_experimental/xla_ffi.py +16 -7
  30. warp/marching_cubes.py +708 -0
  31. warp/native/array.h +99 -4
  32. warp/native/builtin.h +86 -9
  33. warp/native/bvh.cpp +64 -28
  34. warp/native/bvh.cu +58 -58
  35. warp/native/bvh.h +2 -2
  36. warp/native/clang/clang.cpp +7 -7
  37. warp/native/coloring.cpp +8 -2
  38. warp/native/crt.cpp +2 -2
  39. warp/native/crt.h +3 -5
  40. warp/native/cuda_util.cpp +41 -10
  41. warp/native/cuda_util.h +10 -4
  42. warp/native/exports.h +1842 -1908
  43. warp/native/fabric.h +2 -1
  44. warp/native/hashgrid.cpp +37 -37
  45. warp/native/hashgrid.cu +2 -2
  46. warp/native/initializer_array.h +1 -1
  47. warp/native/intersect.h +2 -2
  48. warp/native/mat.h +1910 -116
  49. warp/native/mathdx.cpp +43 -43
  50. warp/native/mesh.cpp +24 -24
  51. warp/native/mesh.cu +26 -26
  52. warp/native/mesh.h +4 -2
  53. warp/native/nanovdb/GridHandle.h +179 -12
  54. warp/native/nanovdb/HostBuffer.h +8 -7
  55. warp/native/nanovdb/NanoVDB.h +517 -895
  56. warp/native/nanovdb/NodeManager.h +323 -0
  57. warp/native/nanovdb/PNanoVDB.h +2 -2
  58. warp/native/quat.h +331 -14
  59. warp/native/range.h +7 -1
  60. warp/native/reduce.cpp +10 -10
  61. warp/native/reduce.cu +13 -14
  62. warp/native/runlength_encode.cpp +2 -2
  63. warp/native/runlength_encode.cu +5 -5
  64. warp/native/scan.cpp +3 -3
  65. warp/native/scan.cu +4 -4
  66. warp/native/sort.cpp +10 -10
  67. warp/native/sort.cu +40 -31
  68. warp/native/sort.h +2 -0
  69. warp/native/sparse.cpp +8 -8
  70. warp/native/sparse.cu +13 -13
  71. warp/native/spatial.h +366 -17
  72. warp/native/temp_buffer.h +2 -2
  73. warp/native/tile.h +471 -82
  74. warp/native/vec.h +328 -14
  75. warp/native/volume.cpp +54 -54
  76. warp/native/volume.cu +1 -1
  77. warp/native/volume.h +2 -1
  78. warp/native/volume_builder.cu +30 -37
  79. warp/native/warp.cpp +150 -149
  80. warp/native/warp.cu +377 -216
  81. warp/native/warp.h +227 -226
  82. warp/optim/linear.py +736 -271
  83. warp/render/imgui_manager.py +289 -0
  84. warp/render/render_opengl.py +99 -18
  85. warp/render/render_usd.py +1 -0
  86. warp/sim/graph_coloring.py +2 -2
  87. warp/sparse.py +558 -175
  88. warp/tests/aux_test_module_aot.py +7 -0
  89. warp/tests/cuda/test_async.py +3 -3
  90. warp/tests/cuda/test_conditional_captures.py +101 -0
  91. warp/tests/geometry/test_hash_grid.py +38 -0
  92. warp/tests/geometry/test_marching_cubes.py +233 -12
  93. warp/tests/interop/test_jax.py +608 -28
  94. warp/tests/sim/test_coloring.py +6 -6
  95. warp/tests/test_array.py +58 -5
  96. warp/tests/test_codegen.py +4 -3
  97. warp/tests/test_context.py +8 -15
  98. warp/tests/test_enum.py +136 -0
  99. warp/tests/test_examples.py +2 -2
  100. warp/tests/test_fem.py +49 -6
  101. warp/tests/test_fixedarray.py +229 -0
  102. warp/tests/test_func.py +18 -15
  103. warp/tests/test_future_annotations.py +7 -5
  104. warp/tests/test_linear_solvers.py +30 -0
  105. warp/tests/test_map.py +15 -1
  106. warp/tests/test_mat.py +1518 -378
  107. warp/tests/test_mat_assign_copy.py +178 -0
  108. warp/tests/test_mat_constructors.py +574 -0
  109. warp/tests/test_module_aot.py +287 -0
  110. warp/tests/test_print.py +69 -0
  111. warp/tests/test_quat.py +140 -34
  112. warp/tests/test_quat_assign_copy.py +145 -0
  113. warp/tests/test_reload.py +2 -1
  114. warp/tests/test_sparse.py +71 -0
  115. warp/tests/test_spatial.py +140 -34
  116. warp/tests/test_spatial_assign_copy.py +160 -0
  117. warp/tests/test_struct.py +43 -3
  118. warp/tests/test_tuple.py +96 -0
  119. warp/tests/test_types.py +61 -20
  120. warp/tests/test_vec.py +179 -34
  121. warp/tests/test_vec_assign_copy.py +143 -0
  122. warp/tests/tile/test_tile.py +245 -18
  123. warp/tests/tile/test_tile_cholesky.py +605 -0
  124. warp/tests/tile/test_tile_load.py +169 -0
  125. warp/tests/tile/test_tile_mathdx.py +2 -558
  126. warp/tests/tile/test_tile_matmul.py +1 -1
  127. warp/tests/tile/test_tile_mlp.py +1 -1
  128. warp/tests/tile/test_tile_shared_memory.py +5 -5
  129. warp/tests/unittest_suites.py +6 -0
  130. warp/tests/walkthrough_debug.py +1 -1
  131. warp/thirdparty/unittest_parallel.py +108 -9
  132. warp/types.py +571 -267
  133. warp/utils.py +68 -86
  134. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/METADATA +29 -69
  135. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/RECORD +138 -128
  136. warp/native/marching.cpp +0 -19
  137. warp/native/marching.cu +0 -514
  138. warp/native/marching.h +0 -19
  139. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/WHEEL +0 -0
  140. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/licenses/LICENSE.md +0 -0
  141. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/top_level.txt +0 -0
@@ -53,7 +53,7 @@ void runlength_encode_host(int n,
53
53
  }
54
54
  }
55
55
 
56
- void runlength_encode_int_host(
56
+ void wp_runlength_encode_int_host(
57
57
  uint64_t values,
58
58
  uint64_t run_values,
59
59
  uint64_t run_lengths,
@@ -68,7 +68,7 @@ void runlength_encode_int_host(
68
68
  }
69
69
 
70
70
  #if !WP_ENABLE_CUDA
71
- void runlength_encode_int_device(
71
+ void wp_runlength_encode_int_device(
72
72
  uint64_t values,
73
73
  uint64_t run_values,
74
74
  uint64_t run_lengths,
@@ -28,24 +28,24 @@ void runlength_encode_device(int n,
28
28
  int *run_lengths,
29
29
  int *run_count)
30
30
  {
31
- ContextGuard guard(cuda_context_get_current());
32
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
31
+ ContextGuard guard(wp_cuda_context_get_current());
32
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
33
33
 
34
34
  size_t buff_size = 0;
35
35
  check_cuda(cub::DeviceRunLengthEncode::Encode(
36
36
  nullptr, buff_size, values, run_values, run_lengths, run_count,
37
37
  n, stream));
38
38
 
39
- void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, buff_size);
39
+ void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, buff_size);
40
40
 
41
41
  check_cuda(cub::DeviceRunLengthEncode::Encode(
42
42
  temp_buffer, buff_size, values, run_values, run_lengths, run_count,
43
43
  n, stream));
44
44
 
45
- free_device(WP_CURRENT_CONTEXT, temp_buffer);
45
+ wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
46
46
  }
47
47
 
48
- void runlength_encode_int_device(
48
+ void wp_runlength_encode_int_device(
49
49
  uint64_t values,
50
50
  uint64_t run_values,
51
51
  uint64_t run_lengths,
warp/native/scan.cpp CHANGED
@@ -28,8 +28,8 @@ void scan_host(const T* values_in, T* values_out, int n, bool inclusive)
28
28
  // compute temporary memory required
29
29
  if (!inclusive && n > scan_temp_max_size)
30
30
  {
31
- free_host(scan_temp_memory);
32
- scan_temp_memory = alloc_host(sizeof(T) * n);
31
+ wp_free_host(scan_temp_memory);
32
+ scan_temp_memory = wp_alloc_host(sizeof(T) * n);
33
33
  scan_temp_max_size = n;
34
34
  }
35
35
 
@@ -39,7 +39,7 @@ void scan_host(const T* values_in, T* values_out, int n, bool inclusive)
39
39
  std::partial_sum(values_in, values_in + n, result);
40
40
  if (!inclusive) {
41
41
  values_out[0] = (T)0;
42
- memcpy_h2h(values_out + 1, result, sizeof(T) * (n - 1));
42
+ wp_memcpy_h2h(values_out + 1, result, sizeof(T) * (n - 1));
43
43
  }
44
44
  }
45
45
 
warp/native/scan.cu CHANGED
@@ -25,9 +25,9 @@
25
25
  template<typename T>
26
26
  void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
27
27
  {
28
- ContextGuard guard(cuda_context_get_current());
28
+ ContextGuard guard(wp_cuda_context_get_current());
29
29
 
30
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
30
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
31
31
 
32
32
  // compute temporary memory required
33
33
  size_t scan_temp_size;
@@ -37,7 +37,7 @@ void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
37
37
  check_cuda(cub::DeviceScan::ExclusiveSum(NULL, scan_temp_size, values_in, values_out, n));
38
38
  }
39
39
 
40
- void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, scan_temp_size);
40
+ void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, scan_temp_size);
41
41
 
42
42
  // scan
43
43
  if (inclusive) {
@@ -46,7 +46,7 @@ void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
46
46
  check_cuda(cub::DeviceScan::ExclusiveSum(temp_buffer, scan_temp_size, values_in, values_out, n, stream));
47
47
  }
48
48
 
49
- free_device(WP_CURRENT_CONTEXT, temp_buffer);
49
+ wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
50
50
  }
51
51
 
52
52
  template void scan_device(const int*, int*, int, bool);
warp/native/sort.cpp CHANGED
@@ -198,41 +198,41 @@ void segmented_sort_pairs_host(int* keys, int* values, int n, int* segment_start
198
198
 
199
199
  void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out) {}
200
200
 
201
- void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n) {}
201
+ void wp_radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n) {}
202
202
 
203
- void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n) {}
203
+ void wp_radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n) {}
204
204
 
205
- void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n) {}
205
+ void wp_radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n) {}
206
206
 
207
- void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
207
+ void wp_segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
208
208
 
209
- void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
209
+ void wp_segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
210
210
 
211
211
  #endif // !WP_ENABLE_CUDA
212
212
 
213
213
 
214
- void radix_sort_pairs_int_host(uint64_t keys, uint64_t values, int n)
214
+ void wp_radix_sort_pairs_int_host(uint64_t keys, uint64_t values, int n)
215
215
  {
216
216
  radix_sort_pairs_host(
217
217
  reinterpret_cast<int *>(keys),
218
218
  reinterpret_cast<int *>(values), n);
219
219
  }
220
220
 
221
- void radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n)
221
+ void wp_radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n)
222
222
  {
223
223
  radix_sort_pairs_host(
224
224
  reinterpret_cast<int64_t *>(keys),
225
225
  reinterpret_cast<int *>(values), n);
226
226
  }
227
227
 
228
- void radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n)
228
+ void wp_radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n)
229
229
  {
230
230
  radix_sort_pairs_host(
231
231
  reinterpret_cast<float *>(keys),
232
232
  reinterpret_cast<int *>(values), n);
233
233
  }
234
234
 
235
- void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
235
+ void wp_segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
236
236
  {
237
237
  segmented_sort_pairs_host(
238
238
  reinterpret_cast<float *>(keys),
@@ -241,7 +241,7 @@ void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint
241
241
  reinterpret_cast<int *>(segment_end_indices), num_segments);
242
242
  }
243
243
 
244
- void segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
244
+ void wp_segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
245
245
  {
246
246
  segmented_sort_pairs_host(
247
247
  reinterpret_cast<int *>(keys),
warp/native/sort.cu CHANGED
@@ -23,7 +23,7 @@
23
23
 
24
24
  #include <cub/cub.cuh>
25
25
 
26
- #include <map>
26
+ #include <unordered_map>
27
27
 
28
28
  // temporary buffer for radix sort
29
29
  struct RadixSortTemp
@@ -32,8 +32,8 @@ struct RadixSortTemp
32
32
  size_t size = 0;
33
33
  };
34
34
 
35
- // map temp buffers to CUDA contexts
36
- static std::map<void*, RadixSortTemp> g_radix_sort_temp_map;
35
+ // use unique temp buffers per CUDA stream to avoid race conditions
36
+ static std::unordered_map<void*, RadixSortTemp> g_radix_sort_temp_map;
37
37
 
38
38
 
39
39
  template <typename KeyType>
@@ -44,6 +44,8 @@ void radix_sort_reserve_internal(void* context, int n, void** mem_out, size_t* s
44
44
  cub::DoubleBuffer<KeyType> d_keys;
45
45
  cub::DoubleBuffer<int> d_values;
46
46
 
47
+ CUstream stream = static_cast<CUstream>(wp_cuda_stream_get_current());
48
+
47
49
  // compute temporary memory required
48
50
  size_t sort_temp_size;
49
51
  check_cuda(cub::DeviceRadixSort::SortPairs(
@@ -52,17 +54,14 @@ void radix_sort_reserve_internal(void* context, int n, void** mem_out, size_t* s
52
54
  d_keys,
53
55
  d_values,
54
56
  n, 0, sizeof(KeyType)*8,
55
- (cudaStream_t)cuda_stream_get_current()));
56
-
57
- if (!context)
58
- context = cuda_context_get_current();
57
+ stream));
59
58
 
60
- RadixSortTemp& temp = g_radix_sort_temp_map[context];
59
+ RadixSortTemp& temp = g_radix_sort_temp_map[stream];
61
60
 
62
61
  if (sort_temp_size > temp.size)
63
62
  {
64
- free_device(WP_CURRENT_CONTEXT, temp.mem);
65
- temp.mem = alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
63
+ wp_free_device(WP_CURRENT_CONTEXT, temp.mem);
64
+ temp.mem = wp_alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
66
65
  temp.size = sort_temp_size;
67
66
  }
68
67
 
@@ -77,6 +76,17 @@ void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out)
77
76
  radix_sort_reserve_internal<int>(context, n, mem_out, size_out);
78
77
  }
79
78
 
79
+ void radix_sort_release(void* context, void* stream)
80
+ {
81
+ // release temporary buffer for the given stream, if it exists
82
+ auto it = g_radix_sort_temp_map.find(stream);
83
+ if (it != g_radix_sort_temp_map.end())
84
+ {
85
+ wp_free_device(context, it->second.mem);
86
+ g_radix_sort_temp_map.erase(it);
87
+ }
88
+ }
89
+
80
90
  template <typename KeyType>
81
91
  void radix_sort_pairs_device(void* context, KeyType* keys, int* values, int n)
82
92
  {
@@ -95,13 +105,13 @@ void radix_sort_pairs_device(void* context, KeyType* keys, int* values, int n)
95
105
  d_keys,
96
106
  d_values,
97
107
  n, 0, sizeof(KeyType)*8,
98
- (cudaStream_t)cuda_stream_get_current()));
108
+ (cudaStream_t)wp_cuda_stream_get_current()));
99
109
 
100
110
  if (d_keys.Current() != keys)
101
- memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(KeyType)*n);
111
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(KeyType)*n);
102
112
 
103
113
  if (d_values.Current() != values)
104
- memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
114
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
105
115
  }
106
116
 
107
117
  void radix_sort_pairs_device(void* context, int* keys, int* values, int n)
@@ -119,7 +129,7 @@ void radix_sort_pairs_device(void* context, int64_t* keys, int* values, int n)
119
129
  radix_sort_pairs_device<int64_t>(context, keys, values, n);
120
130
  }
121
131
 
122
- void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
132
+ void wp_radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
123
133
  {
124
134
  radix_sort_pairs_device(
125
135
  WP_CURRENT_CONTEXT,
@@ -127,7 +137,7 @@ void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
127
137
  reinterpret_cast<int *>(values), n);
128
138
  }
129
139
 
130
- void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
140
+ void wp_radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
131
141
  {
132
142
  radix_sort_pairs_device(
133
143
  WP_CURRENT_CONTEXT,
@@ -135,7 +145,7 @@ void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
135
145
  reinterpret_cast<int *>(values), n);
136
146
  }
137
147
 
138
- void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n)
148
+ void wp_radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n)
139
149
  {
140
150
  radix_sort_pairs_device(
141
151
  WP_CURRENT_CONTEXT,
@@ -153,6 +163,8 @@ void segmented_sort_reserve(void* context, int n, int num_segments, void** mem_o
153
163
  int* start_indices = NULL;
154
164
  int* end_indices = NULL;
155
165
 
166
+ CUstream stream = static_cast<CUstream>(wp_cuda_stream_get_current());
167
+
156
168
  // compute temporary memory required
157
169
  size_t sort_temp_size;
158
170
  check_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
@@ -166,17 +178,14 @@ void segmented_sort_reserve(void* context, int n, int num_segments, void** mem_o
166
178
  end_indices,
167
179
  0,
168
180
  32,
169
- (cudaStream_t)cuda_stream_get_current()));
170
-
171
- if (!context)
172
- context = cuda_context_get_current();
181
+ stream));
173
182
 
174
- RadixSortTemp& temp = g_radix_sort_temp_map[context];
183
+ RadixSortTemp& temp = g_radix_sort_temp_map[stream];
175
184
 
176
185
  if (sort_temp_size > temp.size)
177
186
  {
178
- free_device(WP_CURRENT_CONTEXT, temp.mem);
179
- temp.mem = alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
187
+ wp_free_device(WP_CURRENT_CONTEXT, temp.mem);
188
+ temp.mem = wp_alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
180
189
  temp.size = sort_temp_size;
181
190
  }
182
191
 
@@ -211,16 +220,16 @@ void segmented_sort_pairs_device(void* context, float* keys, int* values, int n,
211
220
  segment_end_indices,
212
221
  0,
213
222
  32,
214
- (cudaStream_t)cuda_stream_get_current()));
223
+ (cudaStream_t)wp_cuda_stream_get_current()));
215
224
 
216
225
  if (d_keys.Current() != keys)
217
- memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
226
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
218
227
 
219
228
  if (d_values.Current() != values)
220
- memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
229
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
221
230
  }
222
231
 
223
- void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
232
+ void wp_segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
224
233
  {
225
234
  segmented_sort_pairs_device(
226
235
  WP_CURRENT_CONTEXT,
@@ -256,16 +265,16 @@ void segmented_sort_pairs_device(void* context, int* keys, int* values, int n, i
256
265
  segment_end_indices,
257
266
  0,
258
267
  32,
259
- (cudaStream_t)cuda_stream_get_current()));
268
+ (cudaStream_t)wp_cuda_stream_get_current()));
260
269
 
261
270
  if (d_keys.Current() != keys)
262
- memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
271
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
263
272
 
264
273
  if (d_values.Current() != values)
265
- memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
274
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
266
275
  }
267
276
 
268
- void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
277
+ void wp_segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
269
278
  {
270
279
  segmented_sort_pairs_device(
271
280
  WP_CURRENT_CONTEXT,
warp/native/sort.h CHANGED
@@ -20,6 +20,8 @@
20
20
  #include <stddef.h>
21
21
 
22
22
  void radix_sort_reserve(void* context, int n, void** mem_out=NULL, size_t* size_out=NULL);
23
+ void radix_sort_release(void* context, void* stream);
24
+
23
25
  void radix_sort_pairs_host(int* keys, int* values, int n);
24
26
  void radix_sort_pairs_host(float* keys, int* values, int n);
25
27
  void radix_sort_pairs_host(int64_t* keys, int* values, int n);
warp/native/sparse.cpp CHANGED
@@ -36,7 +36,7 @@ template <typename T> bool bsr_block_is_zero(int block_idx, int block_size, cons
36
36
  } // namespace
37
37
 
38
38
 
39
- WP_API void bsr_matrix_from_triplets_host(
39
+ WP_API void wp_bsr_matrix_from_triplets_host(
40
40
  int block_size,
41
41
  int scalar_size_in_bytes,
42
42
  int row_count,
@@ -64,8 +64,8 @@ WP_API void bsr_matrix_from_triplets_host(
64
64
  bool return_summed_blocks = tpl_block_offsets != nullptr && tpl_block_indices != nullptr;
65
65
  if (!return_summed_blocks)
66
66
  {
67
- tpl_block_offsets = static_cast<int*>(alloc_host(size_t(nnz) * sizeof(int)));
68
- tpl_block_indices = static_cast<int*>(alloc_host(size_t(nnz) * sizeof(int)));
67
+ tpl_block_offsets = static_cast<int*>(wp_alloc_host(size_t(nnz) * sizeof(int)));
68
+ tpl_block_indices = static_cast<int*>(wp_alloc_host(size_t(nnz) * sizeof(int)));
69
69
  }
70
70
 
71
71
  std::iota(tpl_block_indices, tpl_block_indices + nnz, 0);
@@ -156,8 +156,8 @@ WP_API void bsr_matrix_from_triplets_host(
156
156
  if(!return_summed_blocks)
157
157
  {
158
158
  // free our temporary buffers
159
- free_host(tpl_block_offsets);
160
- free_host(tpl_block_indices);
159
+ wp_free_host(tpl_block_offsets);
160
+ wp_free_host(tpl_block_indices);
161
161
  }
162
162
 
163
163
  if (bsr_nnz != nullptr)
@@ -166,7 +166,7 @@ WP_API void bsr_matrix_from_triplets_host(
166
166
  }
167
167
  }
168
168
 
169
- WP_API void bsr_transpose_host(
169
+ WP_API void wp_bsr_transpose_host(
170
170
  int row_count, int col_count, int nnz,
171
171
  const int* bsr_offsets, const int* bsr_columns,
172
172
  int* transposed_bsr_offsets,
@@ -209,7 +209,7 @@ WP_API void bsr_transpose_host(
209
209
  }
210
210
 
211
211
  #if !WP_ENABLE_CUDA
212
- WP_API void bsr_matrix_from_triplets_device(
212
+ WP_API void wp_bsr_matrix_from_triplets_device(
213
213
  int block_size,
214
214
  int scalar_size_in_bytes,
215
215
  int row_count,
@@ -229,7 +229,7 @@ WP_API void bsr_matrix_from_triplets_device(
229
229
  void* bsr_nnz_event) {}
230
230
 
231
231
 
232
- WP_API void bsr_transpose_device(
232
+ WP_API void wp_bsr_transpose_device(
233
233
  int row_count, int col_count, int nnz,
234
234
  const int* bsr_offsets, const int* bsr_columns,
235
235
  int* transposed_bsr_offsets,
warp/native/sparse.cu CHANGED
@@ -50,7 +50,7 @@ template <typename T> struct BsrBlockIsNotZero
50
50
  T zero_mask;
51
51
 
52
52
  BsrBlockIsNotZero(int block_size, const void* values, const uint64_t zero_mask)
53
- : block_size(block_size), values(static_cast<const T*>(values)), zero_mask(static_cast<const T>(zero_mask))
53
+ : block_size(block_size), values(static_cast<const T*>(values)), zero_mask(static_cast<T>(zero_mask))
54
54
  {}
55
55
 
56
56
  CUDA_CALLABLE_DEVICE bool operator()(int block) const
@@ -256,7 +256,7 @@ __global__ void bsr_transpose_fill_row_col(const int nnz_upper_bound, const int
256
256
  } // namespace
257
257
 
258
258
 
259
- WP_API void bsr_matrix_from_triplets_device(
259
+ WP_API void wp_bsr_matrix_from_triplets_device(
260
260
  const int block_size,
261
261
  int scalar_size,
262
262
  const int row_count,
@@ -274,13 +274,13 @@ WP_API void bsr_matrix_from_triplets_device(
274
274
  int* bsr_columns,
275
275
  int* bsr_nnz, void* bsr_nnz_event)
276
276
  {
277
- void* context = cuda_context_get_current();
277
+ void* context = wp_cuda_context_get_current();
278
278
  ContextGuard guard(context);
279
279
 
280
280
  // Per-context cached temporary buffers
281
281
  // BsrFromTripletsTemp& bsr_temp = g_bsr_from_triplets_temp_map[context];
282
282
 
283
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
283
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
284
284
 
285
285
  ScopedTemporary<BsrRowCol> combined_row_col(context, 2 * size_t(nnz));
286
286
  ScopedTemporary<int> unique_triplet_count(context, 1);
@@ -289,8 +289,8 @@ WP_API void bsr_matrix_from_triplets_device(
289
289
  if(!return_summed_blocks)
290
290
  {
291
291
  // if not provided, allocate temporary offset and indices buffers
292
- tpl_block_offsets = static_cast<int*>(alloc_device(context, size_t(nnz) * sizeof(int)));
293
- tpl_block_indices = static_cast<int*>(alloc_device(context, size_t(nnz) * sizeof(int)));
292
+ tpl_block_offsets = static_cast<int*>(wp_alloc_device(context, size_t(nnz) * sizeof(int)));
293
+ tpl_block_indices = static_cast<int*>(wp_alloc_device(context, size_t(nnz) * sizeof(int)));
294
294
  }
295
295
 
296
296
 
@@ -357,11 +357,11 @@ WP_API void bsr_matrix_from_triplets_device(
357
357
  {
358
358
  // Copy nnz to host, and record an event for the completed transfer if desired
359
359
 
360
- memcpy_d2h(WP_CURRENT_CONTEXT, bsr_nnz, bsr_offsets + row_count, sizeof(int), stream);
360
+ wp_memcpy_d2h(WP_CURRENT_CONTEXT, bsr_nnz, bsr_offsets + row_count, sizeof(int), stream);
361
361
 
362
362
  if (bsr_nnz_event)
363
363
  {
364
- cuda_event_record(bsr_nnz_event, stream);
364
+ wp_cuda_event_record(bsr_nnz_event, stream);
365
365
  }
366
366
  }
367
367
 
@@ -381,21 +381,21 @@ WP_API void bsr_matrix_from_triplets_device(
381
381
  stream));
382
382
  } else {
383
383
  // free our temporary buffers
384
- free_device(context, tpl_block_offsets);
385
- free_device(context, tpl_block_indices);
384
+ wp_free_device(context, tpl_block_offsets);
385
+ wp_free_device(context, tpl_block_indices);
386
386
  }
387
387
  }
388
388
 
389
389
 
390
- WP_API void bsr_transpose_device(int row_count, int col_count, int nnz,
390
+ WP_API void wp_bsr_transpose_device(int row_count, int col_count, int nnz,
391
391
  const int* bsr_offsets, const int* bsr_columns,
392
392
  int* transposed_bsr_offsets, int* transposed_bsr_columns,
393
393
  int* src_block_indices)
394
394
  {
395
- void* context = cuda_context_get_current();
395
+ void* context = wp_cuda_context_get_current();
396
396
  ContextGuard guard(context);
397
397
 
398
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
398
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
399
399
 
400
400
  ScopedTemporary<BsrRowCol> combined_row_col(context, 2 * nnz);
401
401