warp_lang-1.8.0-py3-none-win_amd64.whl → warp_lang-1.9.0-py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang has been flagged as potentially problematic; consult the registry's advisory page for this release for details.

Files changed (153)
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +482 -110
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +48 -63
  7. warp/builtins.py +955 -137
  8. warp/codegen.py +327 -209
  9. warp/config.py +1 -1
  10. warp/context.py +1363 -800
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/examples/interop/example_jax_callable.py +34 -4
  18. warp/examples/interop/example_jax_kernel.py +27 -1
  19. warp/fabric.py +1 -1
  20. warp/fem/cache.py +27 -19
  21. warp/fem/domain.py +2 -2
  22. warp/fem/field/nodal_field.py +2 -2
  23. warp/fem/field/virtual.py +266 -166
  24. warp/fem/geometry/geometry.py +5 -5
  25. warp/fem/integrate.py +200 -91
  26. warp/fem/space/restriction.py +4 -0
  27. warp/fem/space/shape/tet_shape_function.py +3 -10
  28. warp/jax_experimental/custom_call.py +1 -1
  29. warp/jax_experimental/ffi.py +203 -54
  30. warp/marching_cubes.py +708 -0
  31. warp/native/array.h +103 -8
  32. warp/native/builtin.h +90 -9
  33. warp/native/bvh.cpp +64 -28
  34. warp/native/bvh.cu +58 -58
  35. warp/native/bvh.h +2 -2
  36. warp/native/clang/clang.cpp +7 -7
  37. warp/native/coloring.cpp +13 -3
  38. warp/native/crt.cpp +2 -2
  39. warp/native/crt.h +3 -5
  40. warp/native/cuda_util.cpp +42 -11
  41. warp/native/cuda_util.h +10 -4
  42. warp/native/exports.h +1842 -1908
  43. warp/native/fabric.h +2 -1
  44. warp/native/hashgrid.cpp +37 -37
  45. warp/native/hashgrid.cu +2 -2
  46. warp/native/initializer_array.h +1 -1
  47. warp/native/intersect.h +4 -4
  48. warp/native/mat.h +1913 -119
  49. warp/native/mathdx.cpp +43 -43
  50. warp/native/mesh.cpp +24 -24
  51. warp/native/mesh.cu +26 -26
  52. warp/native/mesh.h +5 -3
  53. warp/native/nanovdb/GridHandle.h +179 -12
  54. warp/native/nanovdb/HostBuffer.h +8 -7
  55. warp/native/nanovdb/NanoVDB.h +517 -895
  56. warp/native/nanovdb/NodeManager.h +323 -0
  57. warp/native/nanovdb/PNanoVDB.h +2 -2
  58. warp/native/quat.h +337 -16
  59. warp/native/rand.h +7 -7
  60. warp/native/range.h +7 -1
  61. warp/native/reduce.cpp +10 -10
  62. warp/native/reduce.cu +13 -14
  63. warp/native/runlength_encode.cpp +2 -2
  64. warp/native/runlength_encode.cu +5 -5
  65. warp/native/scan.cpp +3 -3
  66. warp/native/scan.cu +4 -4
  67. warp/native/sort.cpp +10 -10
  68. warp/native/sort.cu +22 -22
  69. warp/native/sparse.cpp +8 -8
  70. warp/native/sparse.cu +14 -14
  71. warp/native/spatial.h +366 -17
  72. warp/native/svd.h +23 -8
  73. warp/native/temp_buffer.h +2 -2
  74. warp/native/tile.h +303 -70
  75. warp/native/tile_radix_sort.h +5 -1
  76. warp/native/tile_reduce.h +16 -25
  77. warp/native/tuple.h +2 -2
  78. warp/native/vec.h +385 -18
  79. warp/native/volume.cpp +54 -54
  80. warp/native/volume.cu +1 -1
  81. warp/native/volume.h +2 -1
  82. warp/native/volume_builder.cu +30 -37
  83. warp/native/warp.cpp +150 -149
  84. warp/native/warp.cu +337 -193
  85. warp/native/warp.h +227 -226
  86. warp/optim/linear.py +736 -271
  87. warp/render/imgui_manager.py +289 -0
  88. warp/render/render_opengl.py +137 -57
  89. warp/render/render_usd.py +0 -1
  90. warp/sim/collide.py +1 -2
  91. warp/sim/graph_coloring.py +2 -2
  92. warp/sim/integrator_vbd.py +10 -2
  93. warp/sparse.py +559 -176
  94. warp/tape.py +2 -0
  95. warp/tests/aux_test_module_aot.py +7 -0
  96. warp/tests/cuda/test_async.py +3 -3
  97. warp/tests/cuda/test_conditional_captures.py +101 -0
  98. warp/tests/geometry/test_marching_cubes.py +233 -12
  99. warp/tests/sim/test_cloth.py +89 -6
  100. warp/tests/sim/test_coloring.py +82 -7
  101. warp/tests/test_array.py +56 -5
  102. warp/tests/test_assert.py +53 -0
  103. warp/tests/test_atomic_cas.py +127 -114
  104. warp/tests/test_codegen.py +3 -2
  105. warp/tests/test_context.py +8 -15
  106. warp/tests/test_enum.py +136 -0
  107. warp/tests/test_examples.py +2 -2
  108. warp/tests/test_fem.py +45 -2
  109. warp/tests/test_fixedarray.py +229 -0
  110. warp/tests/test_func.py +18 -15
  111. warp/tests/test_future_annotations.py +7 -5
  112. warp/tests/test_linear_solvers.py +30 -0
  113. warp/tests/test_map.py +1 -1
  114. warp/tests/test_mat.py +1540 -378
  115. warp/tests/test_mat_assign_copy.py +178 -0
  116. warp/tests/test_mat_constructors.py +574 -0
  117. warp/tests/test_module_aot.py +287 -0
  118. warp/tests/test_print.py +69 -0
  119. warp/tests/test_quat.py +162 -34
  120. warp/tests/test_quat_assign_copy.py +145 -0
  121. warp/tests/test_reload.py +2 -1
  122. warp/tests/test_sparse.py +103 -0
  123. warp/tests/test_spatial.py +140 -34
  124. warp/tests/test_spatial_assign_copy.py +160 -0
  125. warp/tests/test_static.py +48 -0
  126. warp/tests/test_struct.py +43 -3
  127. warp/tests/test_tape.py +38 -0
  128. warp/tests/test_types.py +0 -20
  129. warp/tests/test_vec.py +216 -441
  130. warp/tests/test_vec_assign_copy.py +143 -0
  131. warp/tests/test_vec_constructors.py +325 -0
  132. warp/tests/tile/test_tile.py +206 -152
  133. warp/tests/tile/test_tile_cholesky.py +605 -0
  134. warp/tests/tile/test_tile_load.py +169 -0
  135. warp/tests/tile/test_tile_mathdx.py +2 -558
  136. warp/tests/tile/test_tile_matmul.py +179 -0
  137. warp/tests/tile/test_tile_mlp.py +1 -1
  138. warp/tests/tile/test_tile_reduce.py +100 -11
  139. warp/tests/tile/test_tile_shared_memory.py +16 -16
  140. warp/tests/tile/test_tile_sort.py +59 -55
  141. warp/tests/unittest_suites.py +16 -0
  142. warp/tests/walkthrough_debug.py +1 -1
  143. warp/thirdparty/unittest_parallel.py +108 -9
  144. warp/types.py +554 -264
  145. warp/utils.py +68 -86
  146. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
  147. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/RECORD +150 -138
  148. warp/native/marching.cpp +0 -19
  149. warp/native/marching.cu +0 -514
  150. warp/native/marching.h +0 -19
  151. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
  152. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
  153. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
warp/native/reduce.cu CHANGED
@@ -22,7 +22,6 @@
22
22
 
23
23
  #define THRUST_IGNORE_CUB_VERSION_CHECK
24
24
  #include <cub/device/device_reduce.cuh>
25
- #include <cub/iterator/counting_input_iterator.cuh>
26
25
 
27
26
  namespace
28
27
  {
@@ -119,14 +118,14 @@ template <typename T> void array_sum_device(const T *ptr_a, T *ptr_out, int coun
119
118
  assert((byte_stride % sizeof(T)) == 0);
120
119
  const int stride = byte_stride / sizeof(T);
121
120
 
122
- ContextGuard guard(cuda_context_get_current());
123
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
121
+ ContextGuard guard(wp_cuda_context_get_current());
122
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
124
123
 
125
124
  cub_strided_iterator<const T> ptr_strided{ptr_a, stride};
126
125
 
127
126
  size_t buff_size = 0;
128
127
  check_cuda(cub::DeviceReduce::Sum(nullptr, buff_size, ptr_strided, ptr_out, count, stream));
129
- void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, buff_size);
128
+ void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, buff_size);
130
129
 
131
130
  for (int k = 0; k < type_length; ++k)
132
131
  {
@@ -134,7 +133,7 @@ template <typename T> void array_sum_device(const T *ptr_a, T *ptr_out, int coun
134
133
  check_cuda(cub::DeviceReduce::Sum(temp_buffer, buff_size, ptr_strided, ptr_out + k, count, stream));
135
134
  }
136
135
 
137
- free_device(WP_CURRENT_CONTEXT, temp_buffer);
136
+ wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
138
137
  }
139
138
 
140
139
  template <typename T>
@@ -280,18 +279,18 @@ void array_inner_device(const ElemT *ptr_a, const ElemT *ptr_b, ScalarT *ptr_out
280
279
  const int stride_a = byte_stride_a / sizeof(ElemT);
281
280
  const int stride_b = byte_stride_b / sizeof(ElemT);
282
281
 
283
- ContextGuard guard(cuda_context_get_current());
284
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
282
+ ContextGuard guard(wp_cuda_context_get_current());
283
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
285
284
 
286
285
  cub_inner_product_iterator<ElemT, ScalarT> inner_iterator{ptr_a, ptr_b, stride_a, stride_b, type_length};
287
286
 
288
287
  size_t buff_size = 0;
289
288
  check_cuda(cub::DeviceReduce::Sum(nullptr, buff_size, inner_iterator, ptr_out, count, stream));
290
- void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, buff_size);
289
+ void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, buff_size);
291
290
 
292
291
  check_cuda(cub::DeviceReduce::Sum(temp_buffer, buff_size, inner_iterator, ptr_out, count, stream));
293
292
 
294
- free_device(WP_CURRENT_CONTEXT, temp_buffer);
293
+ wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
295
294
  }
296
295
 
297
296
  template <typename T>
@@ -327,10 +326,10 @@ void array_inner_device_dispatch(const T *ptr_a, const T *ptr_b, T *ptr_out, int
327
326
 
328
327
  } // anonymous namespace
329
328
 
330
- void array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
329
+ void wp_array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
331
330
  int type_len)
332
331
  {
333
- void *context = cuda_context_get_current();
332
+ void *context = wp_cuda_context_get_current();
334
333
 
335
334
  const float *ptr_a = (const float *)(a);
336
335
  const float *ptr_b = (const float *)(b);
@@ -339,7 +338,7 @@ void array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, i
339
338
  array_inner_device_dispatch(ptr_a, ptr_b, ptr_out, count, byte_stride_a, byte_stride_b, type_len);
340
339
  }
341
340
 
342
- void array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
341
+ void wp_array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
343
342
  int type_len)
344
343
  {
345
344
  const double *ptr_a = (const double *)(a);
@@ -349,14 +348,14 @@ void array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count,
349
348
  array_inner_device_dispatch(ptr_a, ptr_b, ptr_out, count, byte_stride_a, byte_stride_b, type_len);
350
349
  }
351
350
 
352
- void array_sum_float_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
351
+ void wp_array_sum_float_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
353
352
  {
354
353
  const float *ptr_a = (const float *)(a);
355
354
  float *ptr_out = (float *)(out);
356
355
  array_sum_device_dispatch(ptr_a, ptr_out, count, byte_stride, type_length);
357
356
  }
358
357
 
359
- void array_sum_double_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
358
+ void wp_array_sum_double_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
360
359
  {
361
360
  const double *ptr_a = (const double *)(a);
362
361
  double *ptr_out = (double *)(out);
@@ -53,7 +53,7 @@ void runlength_encode_host(int n,
53
53
  }
54
54
  }
55
55
 
56
- void runlength_encode_int_host(
56
+ void wp_runlength_encode_int_host(
57
57
  uint64_t values,
58
58
  uint64_t run_values,
59
59
  uint64_t run_lengths,
@@ -68,7 +68,7 @@ void runlength_encode_int_host(
68
68
  }
69
69
 
70
70
  #if !WP_ENABLE_CUDA
71
- void runlength_encode_int_device(
71
+ void wp_runlength_encode_int_device(
72
72
  uint64_t values,
73
73
  uint64_t run_values,
74
74
  uint64_t run_lengths,
@@ -28,24 +28,24 @@ void runlength_encode_device(int n,
28
28
  int *run_lengths,
29
29
  int *run_count)
30
30
  {
31
- ContextGuard guard(cuda_context_get_current());
32
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
31
+ ContextGuard guard(wp_cuda_context_get_current());
32
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
33
33
 
34
34
  size_t buff_size = 0;
35
35
  check_cuda(cub::DeviceRunLengthEncode::Encode(
36
36
  nullptr, buff_size, values, run_values, run_lengths, run_count,
37
37
  n, stream));
38
38
 
39
- void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, buff_size);
39
+ void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, buff_size);
40
40
 
41
41
  check_cuda(cub::DeviceRunLengthEncode::Encode(
42
42
  temp_buffer, buff_size, values, run_values, run_lengths, run_count,
43
43
  n, stream));
44
44
 
45
- free_device(WP_CURRENT_CONTEXT, temp_buffer);
45
+ wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
46
46
  }
47
47
 
48
- void runlength_encode_int_device(
48
+ void wp_runlength_encode_int_device(
49
49
  uint64_t values,
50
50
  uint64_t run_values,
51
51
  uint64_t run_lengths,
warp/native/scan.cpp CHANGED
@@ -28,8 +28,8 @@ void scan_host(const T* values_in, T* values_out, int n, bool inclusive)
28
28
  // compute temporary memory required
29
29
  if (!inclusive && n > scan_temp_max_size)
30
30
  {
31
- free_host(scan_temp_memory);
32
- scan_temp_memory = alloc_host(sizeof(T) * n);
31
+ wp_free_host(scan_temp_memory);
32
+ scan_temp_memory = wp_alloc_host(sizeof(T) * n);
33
33
  scan_temp_max_size = n;
34
34
  }
35
35
 
@@ -39,7 +39,7 @@ void scan_host(const T* values_in, T* values_out, int n, bool inclusive)
39
39
  std::partial_sum(values_in, values_in + n, result);
40
40
  if (!inclusive) {
41
41
  values_out[0] = (T)0;
42
- memcpy_h2h(values_out + 1, result, sizeof(T) * (n - 1));
42
+ wp_memcpy_h2h(values_out + 1, result, sizeof(T) * (n - 1));
43
43
  }
44
44
  }
45
45
 
warp/native/scan.cu CHANGED
@@ -25,9 +25,9 @@
25
25
  template<typename T>
26
26
  void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
27
27
  {
28
- ContextGuard guard(cuda_context_get_current());
28
+ ContextGuard guard(wp_cuda_context_get_current());
29
29
 
30
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
30
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
31
31
 
32
32
  // compute temporary memory required
33
33
  size_t scan_temp_size;
@@ -37,7 +37,7 @@ void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
37
37
  check_cuda(cub::DeviceScan::ExclusiveSum(NULL, scan_temp_size, values_in, values_out, n));
38
38
  }
39
39
 
40
- void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, scan_temp_size);
40
+ void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, scan_temp_size);
41
41
 
42
42
  // scan
43
43
  if (inclusive) {
@@ -46,7 +46,7 @@ void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
46
46
  check_cuda(cub::DeviceScan::ExclusiveSum(temp_buffer, scan_temp_size, values_in, values_out, n, stream));
47
47
  }
48
48
 
49
- free_device(WP_CURRENT_CONTEXT, temp_buffer);
49
+ wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
50
50
  }
51
51
 
52
52
  template void scan_device(const int*, int*, int, bool);
warp/native/sort.cpp CHANGED
@@ -198,41 +198,41 @@ void segmented_sort_pairs_host(int* keys, int* values, int n, int* segment_start
198
198
 
199
199
  void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out) {}
200
200
 
201
- void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n) {}
201
+ void wp_radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n) {}
202
202
 
203
- void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n) {}
203
+ void wp_radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n) {}
204
204
 
205
- void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n) {}
205
+ void wp_radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n) {}
206
206
 
207
- void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
207
+ void wp_segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
208
208
 
209
- void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
209
+ void wp_segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
210
210
 
211
211
  #endif // !WP_ENABLE_CUDA
212
212
 
213
213
 
214
- void radix_sort_pairs_int_host(uint64_t keys, uint64_t values, int n)
214
+ void wp_radix_sort_pairs_int_host(uint64_t keys, uint64_t values, int n)
215
215
  {
216
216
  radix_sort_pairs_host(
217
217
  reinterpret_cast<int *>(keys),
218
218
  reinterpret_cast<int *>(values), n);
219
219
  }
220
220
 
221
- void radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n)
221
+ void wp_radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n)
222
222
  {
223
223
  radix_sort_pairs_host(
224
224
  reinterpret_cast<int64_t *>(keys),
225
225
  reinterpret_cast<int *>(values), n);
226
226
  }
227
227
 
228
- void radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n)
228
+ void wp_radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n)
229
229
  {
230
230
  radix_sort_pairs_host(
231
231
  reinterpret_cast<float *>(keys),
232
232
  reinterpret_cast<int *>(values), n);
233
233
  }
234
234
 
235
- void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
235
+ void wp_segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
236
236
  {
237
237
  segmented_sort_pairs_host(
238
238
  reinterpret_cast<float *>(keys),
@@ -241,7 +241,7 @@ void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint
241
241
  reinterpret_cast<int *>(segment_end_indices), num_segments);
242
242
  }
243
243
 
244
- void segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
244
+ void wp_segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
245
245
  {
246
246
  segmented_sort_pairs_host(
247
247
  reinterpret_cast<int *>(keys),
warp/native/sort.cu CHANGED
@@ -52,17 +52,17 @@ void radix_sort_reserve_internal(void* context, int n, void** mem_out, size_t* s
52
52
  d_keys,
53
53
  d_values,
54
54
  n, 0, sizeof(KeyType)*8,
55
- (cudaStream_t)cuda_stream_get_current()));
55
+ (cudaStream_t)wp_cuda_stream_get_current()));
56
56
 
57
57
  if (!context)
58
- context = cuda_context_get_current();
58
+ context = wp_cuda_context_get_current();
59
59
 
60
60
  RadixSortTemp& temp = g_radix_sort_temp_map[context];
61
61
 
62
62
  if (sort_temp_size > temp.size)
63
63
  {
64
- free_device(WP_CURRENT_CONTEXT, temp.mem);
65
- temp.mem = alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
64
+ wp_free_device(WP_CURRENT_CONTEXT, temp.mem);
65
+ temp.mem = wp_alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
66
66
  temp.size = sort_temp_size;
67
67
  }
68
68
 
@@ -95,13 +95,13 @@ void radix_sort_pairs_device(void* context, KeyType* keys, int* values, int n)
95
95
  d_keys,
96
96
  d_values,
97
97
  n, 0, sizeof(KeyType)*8,
98
- (cudaStream_t)cuda_stream_get_current()));
98
+ (cudaStream_t)wp_cuda_stream_get_current()));
99
99
 
100
100
  if (d_keys.Current() != keys)
101
- memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(KeyType)*n);
101
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(KeyType)*n);
102
102
 
103
103
  if (d_values.Current() != values)
104
- memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
104
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
105
105
  }
106
106
 
107
107
  void radix_sort_pairs_device(void* context, int* keys, int* values, int n)
@@ -119,7 +119,7 @@ void radix_sort_pairs_device(void* context, int64_t* keys, int* values, int n)
119
119
  radix_sort_pairs_device<int64_t>(context, keys, values, n);
120
120
  }
121
121
 
122
- void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
122
+ void wp_radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
123
123
  {
124
124
  radix_sort_pairs_device(
125
125
  WP_CURRENT_CONTEXT,
@@ -127,7 +127,7 @@ void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
127
127
  reinterpret_cast<int *>(values), n);
128
128
  }
129
129
 
130
- void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
130
+ void wp_radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
131
131
  {
132
132
  radix_sort_pairs_device(
133
133
  WP_CURRENT_CONTEXT,
@@ -135,7 +135,7 @@ void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
135
135
  reinterpret_cast<int *>(values), n);
136
136
  }
137
137
 
138
- void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n)
138
+ void wp_radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n)
139
139
  {
140
140
  radix_sort_pairs_device(
141
141
  WP_CURRENT_CONTEXT,
@@ -166,17 +166,17 @@ void segmented_sort_reserve(void* context, int n, int num_segments, void** mem_o
166
166
  end_indices,
167
167
  0,
168
168
  32,
169
- (cudaStream_t)cuda_stream_get_current()));
169
+ (cudaStream_t)wp_cuda_stream_get_current()));
170
170
 
171
171
  if (!context)
172
- context = cuda_context_get_current();
172
+ context = wp_cuda_context_get_current();
173
173
 
174
174
  RadixSortTemp& temp = g_radix_sort_temp_map[context];
175
175
 
176
176
  if (sort_temp_size > temp.size)
177
177
  {
178
- free_device(WP_CURRENT_CONTEXT, temp.mem);
179
- temp.mem = alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
178
+ wp_free_device(WP_CURRENT_CONTEXT, temp.mem);
179
+ temp.mem = wp_alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
180
180
  temp.size = sort_temp_size;
181
181
  }
182
182
 
@@ -211,16 +211,16 @@ void segmented_sort_pairs_device(void* context, float* keys, int* values, int n,
211
211
  segment_end_indices,
212
212
  0,
213
213
  32,
214
- (cudaStream_t)cuda_stream_get_current()));
214
+ (cudaStream_t)wp_cuda_stream_get_current()));
215
215
 
216
216
  if (d_keys.Current() != keys)
217
- memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
217
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
218
218
 
219
219
  if (d_values.Current() != values)
220
- memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
220
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
221
221
  }
222
222
 
223
- void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
223
+ void wp_segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
224
224
  {
225
225
  segmented_sort_pairs_device(
226
226
  WP_CURRENT_CONTEXT,
@@ -256,16 +256,16 @@ void segmented_sort_pairs_device(void* context, int* keys, int* values, int n, i
256
256
  segment_end_indices,
257
257
  0,
258
258
  32,
259
- (cudaStream_t)cuda_stream_get_current()));
259
+ (cudaStream_t)wp_cuda_stream_get_current()));
260
260
 
261
261
  if (d_keys.Current() != keys)
262
- memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
262
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
263
263
 
264
264
  if (d_values.Current() != values)
265
- memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
265
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
266
266
  }
267
267
 
268
- void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
268
+ void wp_segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
269
269
  {
270
270
  segmented_sort_pairs_device(
271
271
  WP_CURRENT_CONTEXT,
warp/native/sparse.cpp CHANGED
@@ -36,7 +36,7 @@ template <typename T> bool bsr_block_is_zero(int block_idx, int block_size, cons
36
36
  } // namespace
37
37
 
38
38
 
39
- WP_API void bsr_matrix_from_triplets_host(
39
+ WP_API void wp_bsr_matrix_from_triplets_host(
40
40
  int block_size,
41
41
  int scalar_size_in_bytes,
42
42
  int row_count,
@@ -64,8 +64,8 @@ WP_API void bsr_matrix_from_triplets_host(
64
64
  bool return_summed_blocks = tpl_block_offsets != nullptr && tpl_block_indices != nullptr;
65
65
  if (!return_summed_blocks)
66
66
  {
67
- tpl_block_offsets = static_cast<int*>(alloc_host(size_t(nnz) * sizeof(int)));
68
- tpl_block_indices = static_cast<int*>(alloc_host(size_t(nnz) * sizeof(int)));
67
+ tpl_block_offsets = static_cast<int*>(wp_alloc_host(size_t(nnz) * sizeof(int)));
68
+ tpl_block_indices = static_cast<int*>(wp_alloc_host(size_t(nnz) * sizeof(int)));
69
69
  }
70
70
 
71
71
  std::iota(tpl_block_indices, tpl_block_indices + nnz, 0);
@@ -156,8 +156,8 @@ WP_API void bsr_matrix_from_triplets_host(
156
156
  if(!return_summed_blocks)
157
157
  {
158
158
  // free our temporary buffers
159
- free_host(tpl_block_offsets);
160
- free_host(tpl_block_indices);
159
+ wp_free_host(tpl_block_offsets);
160
+ wp_free_host(tpl_block_indices);
161
161
  }
162
162
 
163
163
  if (bsr_nnz != nullptr)
@@ -166,7 +166,7 @@ WP_API void bsr_matrix_from_triplets_host(
166
166
  }
167
167
  }
168
168
 
169
- WP_API void bsr_transpose_host(
169
+ WP_API void wp_bsr_transpose_host(
170
170
  int row_count, int col_count, int nnz,
171
171
  const int* bsr_offsets, const int* bsr_columns,
172
172
  int* transposed_bsr_offsets,
@@ -209,7 +209,7 @@ WP_API void bsr_transpose_host(
209
209
  }
210
210
 
211
211
  #if !WP_ENABLE_CUDA
212
- WP_API void bsr_matrix_from_triplets_device(
212
+ WP_API void wp_bsr_matrix_from_triplets_device(
213
213
  int block_size,
214
214
  int scalar_size_in_bytes,
215
215
  int row_count,
@@ -229,7 +229,7 @@ WP_API void bsr_matrix_from_triplets_device(
229
229
  void* bsr_nnz_event) {}
230
230
 
231
231
 
232
- WP_API void bsr_transpose_device(
232
+ WP_API void wp_bsr_transpose_device(
233
233
  int row_count, int col_count, int nnz,
234
234
  const int* bsr_offsets, const int* bsr_columns,
235
235
  int* transposed_bsr_offsets,
warp/native/sparse.cu CHANGED
@@ -50,7 +50,7 @@ template <typename T> struct BsrBlockIsNotZero
50
50
  T zero_mask;
51
51
 
52
52
  BsrBlockIsNotZero(int block_size, const void* values, const uint64_t zero_mask)
53
- : block_size(block_size), values(static_cast<const T*>(values)), zero_mask(static_cast<const T>(zero_mask))
53
+ : block_size(block_size), values(static_cast<const T*>(values)), zero_mask(static_cast<T>(zero_mask))
54
54
  {}
55
55
 
56
56
  CUDA_CALLABLE_DEVICE bool operator()(int block) const
@@ -256,7 +256,7 @@ __global__ void bsr_transpose_fill_row_col(const int nnz_upper_bound, const int
256
256
  } // namespace
257
257
 
258
258
 
259
- WP_API void bsr_matrix_from_triplets_device(
259
+ WP_API void wp_bsr_matrix_from_triplets_device(
260
260
  const int block_size,
261
261
  int scalar_size,
262
262
  const int row_count,
@@ -274,13 +274,13 @@ WP_API void bsr_matrix_from_triplets_device(
274
274
  int* bsr_columns,
275
275
  int* bsr_nnz, void* bsr_nnz_event)
276
276
  {
277
- void* context = cuda_context_get_current();
277
+ void* context = wp_cuda_context_get_current();
278
278
  ContextGuard guard(context);
279
279
 
280
280
  // Per-context cached temporary buffers
281
281
  // BsrFromTripletsTemp& bsr_temp = g_bsr_from_triplets_temp_map[context];
282
282
 
283
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
283
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
284
284
 
285
285
  ScopedTemporary<BsrRowCol> combined_row_col(context, 2 * size_t(nnz));
286
286
  ScopedTemporary<int> unique_triplet_count(context, 1);
@@ -289,8 +289,8 @@ WP_API void bsr_matrix_from_triplets_device(
289
289
  if(!return_summed_blocks)
290
290
  {
291
291
  // if not provided, allocate temporary offset and indices buffers
292
- tpl_block_offsets = static_cast<int*>(alloc_device(context, size_t(nnz) * sizeof(int)));
293
- tpl_block_indices = static_cast<int*>(alloc_device(context, size_t(nnz) * sizeof(int)));
292
+ tpl_block_offsets = static_cast<int*>(wp_alloc_device(context, size_t(nnz) * sizeof(int)));
293
+ tpl_block_indices = static_cast<int*>(wp_alloc_device(context, size_t(nnz) * sizeof(int)));
294
294
  }
295
295
 
296
296
 
@@ -334,7 +334,7 @@ WP_API void bsr_matrix_from_triplets_device(
334
334
  // Ensures the sorted keys are available in summed_block_indices if needed
335
335
  if(return_summed_blocks && d_keys.Current() != tpl_block_indices)
336
336
  {
337
- check_cuda(cudaMemcpy(tpl_block_indices, d_keys.Current(), nnz * sizeof(int), cudaMemcpyDeviceToDevice));
337
+ check_cuda(cudaMemcpyAsync(tpl_block_indices, d_keys.Current(), nnz * sizeof(int), cudaMemcpyDeviceToDevice, stream));
338
338
  }
339
339
  }
340
340
 
@@ -357,11 +357,11 @@ WP_API void bsr_matrix_from_triplets_device(
357
357
  {
358
358
  // Copy nnz to host, and record an event for the completed transfer if desired
359
359
 
360
- memcpy_d2h(WP_CURRENT_CONTEXT, bsr_nnz, bsr_offsets + row_count, sizeof(int), stream);
360
+ wp_memcpy_d2h(WP_CURRENT_CONTEXT, bsr_nnz, bsr_offsets + row_count, sizeof(int), stream);
361
361
 
362
362
  if (bsr_nnz_event)
363
363
  {
364
- cuda_event_record(bsr_nnz_event, stream);
364
+ wp_cuda_event_record(bsr_nnz_event, stream);
365
365
  }
366
366
  }
367
367
 
@@ -381,21 +381,21 @@ WP_API void bsr_matrix_from_triplets_device(
381
381
  stream));
382
382
  } else {
383
383
  // free our temporary buffers
384
- free_device(context, tpl_block_offsets);
385
- free_device(context, tpl_block_indices);
384
+ wp_free_device(context, tpl_block_offsets);
385
+ wp_free_device(context, tpl_block_indices);
386
386
  }
387
387
  }
388
388
 
389
389
 
390
- WP_API void bsr_transpose_device(int row_count, int col_count, int nnz,
390
+ WP_API void wp_bsr_transpose_device(int row_count, int col_count, int nnz,
391
391
  const int* bsr_offsets, const int* bsr_columns,
392
392
  int* transposed_bsr_offsets, int* transposed_bsr_columns,
393
393
  int* src_block_indices)
394
394
  {
395
- void* context = cuda_context_get_current();
395
+ void* context = wp_cuda_context_get_current();
396
396
  ContextGuard guard(context);
397
397
 
398
- cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
398
+ cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
399
399
 
400
400
  ScopedTemporary<BsrRowCol> combined_row_col(context, 2 * nnz);
401
401