warp-lang: warp_lang-1.8.1-py3-none-manylinux_2_34_aarch64.whl → warp_lang-1.9.1-py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (141)
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +1904 -114
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +331 -101
  7. warp/builtins.py +1244 -160
  8. warp/codegen.py +317 -206
  9. warp/config.py +1 -1
  10. warp/context.py +1465 -789
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/examples/interop/example_jax_kernel.py +2 -1
  18. warp/fabric.py +1 -1
  19. warp/fem/cache.py +27 -19
  20. warp/fem/domain.py +2 -2
  21. warp/fem/field/nodal_field.py +2 -2
  22. warp/fem/field/virtual.py +264 -166
  23. warp/fem/geometry/geometry.py +5 -5
  24. warp/fem/integrate.py +129 -51
  25. warp/fem/space/restriction.py +4 -0
  26. warp/fem/space/shape/tet_shape_function.py +3 -10
  27. warp/jax_experimental/custom_call.py +25 -2
  28. warp/jax_experimental/ffi.py +22 -1
  29. warp/jax_experimental/xla_ffi.py +16 -7
  30. warp/marching_cubes.py +708 -0
  31. warp/native/array.h +99 -4
  32. warp/native/builtin.h +86 -9
  33. warp/native/bvh.cpp +64 -28
  34. warp/native/bvh.cu +58 -58
  35. warp/native/bvh.h +2 -2
  36. warp/native/clang/clang.cpp +7 -7
  37. warp/native/coloring.cpp +8 -2
  38. warp/native/crt.cpp +2 -2
  39. warp/native/crt.h +3 -5
  40. warp/native/cuda_util.cpp +41 -10
  41. warp/native/cuda_util.h +10 -4
  42. warp/native/exports.h +1842 -1908
  43. warp/native/fabric.h +2 -1
  44. warp/native/hashgrid.cpp +37 -37
  45. warp/native/hashgrid.cu +2 -2
  46. warp/native/initializer_array.h +1 -1
  47. warp/native/intersect.h +2 -2
  48. warp/native/mat.h +1910 -116
  49. warp/native/mathdx.cpp +43 -43
  50. warp/native/mesh.cpp +24 -24
  51. warp/native/mesh.cu +26 -26
  52. warp/native/mesh.h +4 -2
  53. warp/native/nanovdb/GridHandle.h +179 -12
  54. warp/native/nanovdb/HostBuffer.h +8 -7
  55. warp/native/nanovdb/NanoVDB.h +517 -895
  56. warp/native/nanovdb/NodeManager.h +323 -0
  57. warp/native/nanovdb/PNanoVDB.h +2 -2
  58. warp/native/quat.h +331 -14
  59. warp/native/range.h +7 -1
  60. warp/native/reduce.cpp +10 -10
  61. warp/native/reduce.cu +13 -14
  62. warp/native/runlength_encode.cpp +2 -2
  63. warp/native/runlength_encode.cu +5 -5
  64. warp/native/scan.cpp +3 -3
  65. warp/native/scan.cu +4 -4
  66. warp/native/sort.cpp +10 -10
  67. warp/native/sort.cu +40 -31
  68. warp/native/sort.h +2 -0
  69. warp/native/sparse.cpp +8 -8
  70. warp/native/sparse.cu +13 -13
  71. warp/native/spatial.h +366 -17
  72. warp/native/temp_buffer.h +2 -2
  73. warp/native/tile.h +471 -82
  74. warp/native/vec.h +328 -14
  75. warp/native/volume.cpp +54 -54
  76. warp/native/volume.cu +1 -1
  77. warp/native/volume.h +2 -1
  78. warp/native/volume_builder.cu +30 -37
  79. warp/native/warp.cpp +150 -149
  80. warp/native/warp.cu +377 -216
  81. warp/native/warp.h +227 -226
  82. warp/optim/linear.py +736 -271
  83. warp/render/imgui_manager.py +289 -0
  84. warp/render/render_opengl.py +99 -18
  85. warp/render/render_usd.py +1 -0
  86. warp/sim/graph_coloring.py +2 -2
  87. warp/sparse.py +558 -175
  88. warp/tests/aux_test_module_aot.py +7 -0
  89. warp/tests/cuda/test_async.py +3 -3
  90. warp/tests/cuda/test_conditional_captures.py +101 -0
  91. warp/tests/geometry/test_hash_grid.py +38 -0
  92. warp/tests/geometry/test_marching_cubes.py +233 -12
  93. warp/tests/interop/test_jax.py +608 -28
  94. warp/tests/sim/test_coloring.py +6 -6
  95. warp/tests/test_array.py +58 -5
  96. warp/tests/test_codegen.py +4 -3
  97. warp/tests/test_context.py +8 -15
  98. warp/tests/test_enum.py +136 -0
  99. warp/tests/test_examples.py +2 -2
  100. warp/tests/test_fem.py +49 -6
  101. warp/tests/test_fixedarray.py +229 -0
  102. warp/tests/test_func.py +18 -15
  103. warp/tests/test_future_annotations.py +7 -5
  104. warp/tests/test_linear_solvers.py +30 -0
  105. warp/tests/test_map.py +15 -1
  106. warp/tests/test_mat.py +1518 -378
  107. warp/tests/test_mat_assign_copy.py +178 -0
  108. warp/tests/test_mat_constructors.py +574 -0
  109. warp/tests/test_module_aot.py +287 -0
  110. warp/tests/test_print.py +69 -0
  111. warp/tests/test_quat.py +140 -34
  112. warp/tests/test_quat_assign_copy.py +145 -0
  113. warp/tests/test_reload.py +2 -1
  114. warp/tests/test_sparse.py +71 -0
  115. warp/tests/test_spatial.py +140 -34
  116. warp/tests/test_spatial_assign_copy.py +160 -0
  117. warp/tests/test_struct.py +43 -3
  118. warp/tests/test_tuple.py +96 -0
  119. warp/tests/test_types.py +61 -20
  120. warp/tests/test_vec.py +179 -34
  121. warp/tests/test_vec_assign_copy.py +143 -0
  122. warp/tests/tile/test_tile.py +245 -18
  123. warp/tests/tile/test_tile_cholesky.py +605 -0
  124. warp/tests/tile/test_tile_load.py +169 -0
  125. warp/tests/tile/test_tile_mathdx.py +2 -558
  126. warp/tests/tile/test_tile_matmul.py +1 -1
  127. warp/tests/tile/test_tile_mlp.py +1 -1
  128. warp/tests/tile/test_tile_shared_memory.py +5 -5
  129. warp/tests/unittest_suites.py +6 -0
  130. warp/tests/walkthrough_debug.py +1 -1
  131. warp/thirdparty/unittest_parallel.py +108 -9
  132. warp/types.py +571 -267
  133. warp/utils.py +68 -86
  134. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/METADATA +29 -69
  135. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/RECORD +138 -128
  136. warp/native/marching.cpp +0 -19
  137. warp/native/marching.cu +0 -514
  138. warp/native/marching.h +0 -19
  139. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/WHEEL +0 -0
  140. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/licenses/LICENSE.md +0 -0
  141. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/top_level.txt +0 -0
warp/native/bvh.cu CHANGED
@@ -155,7 +155,7 @@ void bvh_refit_device(BVH& bvh)
155
155
  ContextGuard guard(bvh.context);
156
156
 
157
157
  // clear child counters
158
- memset_device(WP_CURRENT_CONTEXT, bvh.node_counts, 0, sizeof(int) * bvh.max_nodes);
158
+ wp_memset_device(WP_CURRENT_CONTEXT, bvh.node_counts, 0, sizeof(int) * bvh.max_nodes);
159
159
  wp_launch_device(WP_CURRENT_CONTEXT, bvh_refit_kernel, bvh.num_leaf_nodes, (bvh.num_leaf_nodes, bvh.node_parents, bvh.node_counts, bvh.primitive_indices, bvh.node_lowers, bvh.node_uppers, bvh.item_lowers, bvh.item_uppers));
160
160
  }
161
161
 
@@ -474,16 +474,16 @@ LinearBVHBuilderGPU::LinearBVHBuilderGPU()
474
474
  , total_upper(NULL)
475
475
  , total_inv_edges(NULL)
476
476
  {
477
- total_lower = (vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
478
- total_upper = (vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
479
- total_inv_edges = (vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
477
+ total_lower = (vec3*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
478
+ total_upper = (vec3*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
479
+ total_inv_edges = (vec3*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
480
480
  }
481
481
 
482
482
  LinearBVHBuilderGPU::~LinearBVHBuilderGPU()
483
483
  {
484
- free_device(WP_CURRENT_CONTEXT, total_lower);
485
- free_device(WP_CURRENT_CONTEXT, total_upper);
486
- free_device(WP_CURRENT_CONTEXT, total_inv_edges);
484
+ wp_free_device(WP_CURRENT_CONTEXT, total_lower);
485
+ wp_free_device(WP_CURRENT_CONTEXT, total_upper);
486
+ wp_free_device(WP_CURRENT_CONTEXT, total_inv_edges);
487
487
  }
488
488
 
489
489
 
@@ -491,12 +491,12 @@ LinearBVHBuilderGPU::~LinearBVHBuilderGPU()
491
491
  void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* item_uppers, int num_items, bounds3* total_bounds)
492
492
  {
493
493
  // allocate temporary memory used during building
494
- indices = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2); // *2 for radix sort
495
- keys = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2); // *2 for radix sort
496
- deltas = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items); // highest differentiating bit between keys for item i and i+1
497
- range_lefts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
498
- range_rights = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
499
- num_children = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
494
+ indices = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2); // *2 for radix sort
495
+ keys = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2); // *2 for radix sort
496
+ deltas = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items); // highest differentiating bit between keys for item i and i+1
497
+ range_lefts = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
498
+ range_rights = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
499
+ num_children = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
500
500
 
501
501
  // if total bounds supplied by the host then we just
502
502
  // compute our edge length and upload it to the GPU directly
@@ -508,17 +508,17 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
508
508
 
509
509
  vec3 inv_edges = vec3(1.0f/edges[0], 1.0f/edges[1], 1.0f/edges[2]);
510
510
 
511
- memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &total_bounds->lower[0], sizeof(vec3));
512
- memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &total_bounds->upper[0], sizeof(vec3));
513
- memcpy_h2d(WP_CURRENT_CONTEXT, total_inv_edges, &inv_edges[0], sizeof(vec3));
511
+ wp_memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &total_bounds->lower[0], sizeof(vec3));
512
+ wp_memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &total_bounds->upper[0], sizeof(vec3));
513
+ wp_memcpy_h2d(WP_CURRENT_CONTEXT, total_inv_edges, &inv_edges[0], sizeof(vec3));
514
514
  }
515
515
  else
516
516
  {
517
517
  static vec3 upper(-FLT_MAX);
518
518
  static vec3 lower(FLT_MAX);
519
519
 
520
- memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &lower, sizeof(lower));
521
- memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &upper, sizeof(upper));
520
+ wp_memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &lower, sizeof(lower));
521
+ wp_memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &upper, sizeof(upper));
522
522
 
523
523
  // compute the total bounds on the GPU
524
524
  wp_launch_device(WP_CURRENT_CONTEXT, compute_total_bounds, num_items, (item_lowers, item_uppers, total_lower, total_upper, num_items));
@@ -532,7 +532,7 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
532
532
 
533
533
  // sort items based on Morton key (note the 32-bit sort key corresponds to the template parameter to morton3, i.e. 3x9 bit keys combined)
534
534
  radix_sort_pairs_device(WP_CURRENT_CONTEXT, keys, indices, num_items);
535
- memcpy_d2d(WP_CURRENT_CONTEXT, bvh.primitive_indices, indices, sizeof(int) * num_items);
535
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, bvh.primitive_indices, indices, sizeof(int) * num_items);
536
536
 
537
537
  // calculate deltas between adjacent keys
538
538
  wp_launch_device(WP_CURRENT_CONTEXT, compute_key_deltas, num_items, (keys, deltas, num_items-1));
@@ -541,20 +541,20 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
541
541
  wp_launch_device(WP_CURRENT_CONTEXT, build_leaves, num_items, (item_lowers, item_uppers, num_items, indices, range_lefts, range_rights, bvh.node_lowers, bvh.node_uppers));
542
542
 
543
543
  // reset children count, this is our atomic counter so we know when an internal node is complete, only used during building
544
- memset_device(WP_CURRENT_CONTEXT, num_children, 0, sizeof(int)*bvh.max_nodes);
544
+ wp_memset_device(WP_CURRENT_CONTEXT, num_children, 0, sizeof(int)*bvh.max_nodes);
545
545
 
546
546
  // build the tree and internal node bounds
547
547
  wp_launch_device(WP_CURRENT_CONTEXT, build_hierarchy, num_items, (num_items, bvh.root, deltas, num_children, bvh.primitive_indices, range_lefts, range_rights, bvh.node_parents, bvh.node_lowers, bvh.node_uppers));
548
548
  wp_launch_device(WP_CURRENT_CONTEXT, mark_packed_leaf_nodes, bvh.max_nodes, (bvh.max_nodes, range_lefts, range_rights, bvh.node_parents, bvh.node_lowers, bvh.node_uppers));
549
549
 
550
550
  // free temporary memory
551
- free_device(WP_CURRENT_CONTEXT, indices);
552
- free_device(WP_CURRENT_CONTEXT, keys);
553
- free_device(WP_CURRENT_CONTEXT, deltas);
551
+ wp_free_device(WP_CURRENT_CONTEXT, indices);
552
+ wp_free_device(WP_CURRENT_CONTEXT, keys);
553
+ wp_free_device(WP_CURRENT_CONTEXT, deltas);
554
554
 
555
- free_device(WP_CURRENT_CONTEXT, range_lefts);
556
- free_device(WP_CURRENT_CONTEXT, range_rights);
557
- free_device(WP_CURRENT_CONTEXT, num_children);
555
+ wp_free_device(WP_CURRENT_CONTEXT, range_lefts);
556
+ wp_free_device(WP_CURRENT_CONTEXT, range_rights);
557
+ wp_free_device(WP_CURRENT_CONTEXT, num_children);
558
558
 
559
559
  }
560
560
 
@@ -562,8 +562,8 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
562
562
  template<typename T>
563
563
  T* make_device_buffer_of(void* context, T* host_buffer, size_t buffer_size)
564
564
  {
565
- T* device_buffer = (T*)alloc_device(context, sizeof(T) * buffer_size);;
566
- memcpy_h2d(context, device_buffer, host_buffer, sizeof(T) * buffer_size);
565
+ T* device_buffer = (T*)wp_alloc_device(context, sizeof(T) * buffer_size);;
566
+ wp_memcpy_h2d(context, device_buffer, host_buffer, sizeof(T) * buffer_size);
567
567
 
568
568
  return device_buffer;
569
569
  }
@@ -662,8 +662,8 @@ void copy_host_tree_to_device(void* context, BVH& bvh_host, BVH& bvh_device_on_h
662
662
  bvh_device_on_host.num_items = bvh_host.num_items;
663
663
  bvh_device_on_host.max_depth = bvh_host.max_depth;
664
664
 
665
- bvh_device_on_host.root = (int*)alloc_device(context, sizeof(int));
666
- memcpy_h2d(context, bvh_device_on_host.root, bvh_host.root, sizeof(int));
665
+ bvh_device_on_host.root = (int*)wp_alloc_device(context, sizeof(int));
666
+ wp_memcpy_h2d(context, bvh_device_on_host.root, bvh_host.root, sizeof(int));
667
667
  bvh_device_on_host.context = context;
668
668
 
669
669
  bvh_device_on_host.node_lowers = make_device_buffer_of(context, bvh_host.node_lowers, bvh_host.max_nodes);
@@ -682,12 +682,12 @@ void bvh_create_device(void* context, vec3* lowers, vec3* uppers, int num_items,
682
682
  // copy bounds back to CPU
683
683
  std::vector<vec3> lowers_host(num_items);
684
684
  std::vector<vec3> uppers_host(num_items);
685
- memcpy_d2h(WP_CURRENT_CONTEXT, lowers_host.data(), lowers, sizeof(vec3) * num_items);
686
- memcpy_d2h(WP_CURRENT_CONTEXT, uppers_host.data(), uppers, sizeof(vec3) * num_items);
685
+ wp_memcpy_d2h(WP_CURRENT_CONTEXT, lowers_host.data(), lowers, sizeof(vec3) * num_items);
686
+ wp_memcpy_d2h(WP_CURRENT_CONTEXT, uppers_host.data(), uppers, sizeof(vec3) * num_items);
687
687
 
688
688
  // run CPU based constructor
689
689
  wp::BVH bvh_host;
690
- bvh_create_host(lowers_host.data(), uppers_host.data(), num_items, constructor_type, bvh_host);
690
+ wp::bvh_create_host(lowers_host.data(), uppers_host.data(), num_items, constructor_type, bvh_host);
691
691
 
692
692
  // copy host tree to device
693
693
  wp::copy_host_tree_to_device(WP_CURRENT_CONTEXT, bvh_host, bvh_device_on_host);
@@ -695,26 +695,26 @@ void bvh_create_device(void* context, vec3* lowers, vec3* uppers, int num_items,
695
695
  bvh_device_on_host.item_lowers = lowers;
696
696
  bvh_device_on_host.item_uppers = uppers;
697
697
  // node_counts is not allocated for host tree
698
- bvh_device_on_host.node_counts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
699
- bvh_destroy_host(bvh_host);
698
+ bvh_device_on_host.node_counts = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
699
+ wp::bvh_destroy_host(bvh_host);
700
700
  }
701
701
  else if (constructor_type == BVH_CONSTRUCTOR_LBVH)
702
702
  {
703
703
  bvh_device_on_host.num_items = num_items;
704
704
  bvh_device_on_host.max_nodes = 2 * num_items - 1;
705
705
  bvh_device_on_host.num_leaf_nodes = num_items;
706
- bvh_device_on_host.node_lowers = (BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
707
- memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_lowers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
708
- bvh_device_on_host.node_uppers = (BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
709
- memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_uppers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
710
- bvh_device_on_host.node_parents = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
711
- bvh_device_on_host.node_counts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
712
- bvh_device_on_host.root = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int));
713
- bvh_device_on_host.primitive_indices = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * num_items);
706
+ bvh_device_on_host.node_lowers = (BVHPackedNodeHalf*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
707
+ wp_memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_lowers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
708
+ bvh_device_on_host.node_uppers = (BVHPackedNodeHalf*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
709
+ wp_memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_uppers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
710
+ bvh_device_on_host.node_parents = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
711
+ bvh_device_on_host.node_counts = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
712
+ bvh_device_on_host.root = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int));
713
+ bvh_device_on_host.primitive_indices = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * num_items);
714
714
  bvh_device_on_host.item_lowers = lowers;
715
715
  bvh_device_on_host.item_uppers = uppers;
716
716
 
717
- bvh_device_on_host.context = context ? context : cuda_context_get_current();
717
+ bvh_device_on_host.context = context ? context : wp_cuda_context_get_current();
718
718
 
719
719
  LinearBVHBuilderGPU builder;
720
720
  builder.build(bvh_device_on_host, lowers, uppers, num_items, NULL);
@@ -729,26 +729,26 @@ void bvh_destroy_device(BVH& bvh)
729
729
  {
730
730
  ContextGuard guard(bvh.context);
731
731
 
732
- free_device(WP_CURRENT_CONTEXT, bvh.node_lowers); bvh.node_lowers = NULL;
733
- free_device(WP_CURRENT_CONTEXT, bvh.node_uppers); bvh.node_uppers = NULL;
734
- free_device(WP_CURRENT_CONTEXT, bvh.node_parents); bvh.node_parents = NULL;
735
- free_device(WP_CURRENT_CONTEXT, bvh.node_counts); bvh.node_counts = NULL;
736
- free_device(WP_CURRENT_CONTEXT, bvh.primitive_indices); bvh.primitive_indices = NULL;
737
- free_device(WP_CURRENT_CONTEXT, bvh.root); bvh.root = NULL;
732
+ wp_free_device(WP_CURRENT_CONTEXT, bvh.node_lowers); bvh.node_lowers = NULL;
733
+ wp_free_device(WP_CURRENT_CONTEXT, bvh.node_uppers); bvh.node_uppers = NULL;
734
+ wp_free_device(WP_CURRENT_CONTEXT, bvh.node_parents); bvh.node_parents = NULL;
735
+ wp_free_device(WP_CURRENT_CONTEXT, bvh.node_counts); bvh.node_counts = NULL;
736
+ wp_free_device(WP_CURRENT_CONTEXT, bvh.primitive_indices); bvh.primitive_indices = NULL;
737
+ wp_free_device(WP_CURRENT_CONTEXT, bvh.root); bvh.root = NULL;
738
738
  }
739
739
 
740
740
 
741
741
  } // namespace wp
742
742
 
743
743
 
744
- void bvh_refit_device(uint64_t id)
744
+ void wp_bvh_refit_device(uint64_t id)
745
745
  {
746
746
  wp::BVH bvh;
747
747
  if (bvh_get_descriptor(id, bvh))
748
748
  {
749
749
  ContextGuard guard(bvh.context);
750
750
 
751
- bvh_refit_device(bvh);
751
+ wp::bvh_refit_device(bvh);
752
752
  }
753
753
  }
754
754
 
@@ -759,17 +759,17 @@ void bvh_refit_device(uint64_t id)
759
759
  * muted. However, the muted leaf nodes will still have the pointer to their parents, thus the up-tracing
760
760
  * can still work. We will only compute the bounding box of a leaf node if its parent is not a leaf node.
761
761
  */
762
- uint64_t bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, int num_items, int constructor_type)
762
+ uint64_t wp_bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, int num_items, int constructor_type)
763
763
  {
764
764
  ContextGuard guard(context);
765
765
  wp::BVH bvh_device_on_host;
766
766
  wp::BVH* bvh_device_ptr = nullptr;
767
767
 
768
- bvh_create_device(WP_CURRENT_CONTEXT, lowers, uppers, num_items, constructor_type, bvh_device_on_host);
768
+ wp::bvh_create_device(WP_CURRENT_CONTEXT, lowers, uppers, num_items, constructor_type, bvh_device_on_host);
769
769
 
770
770
  // create device-side BVH descriptor
771
- bvh_device_ptr = (wp::BVH*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::BVH));
772
- memcpy_h2d(WP_CURRENT_CONTEXT, bvh_device_ptr, &bvh_device_on_host, sizeof(wp::BVH));
771
+ bvh_device_ptr = (wp::BVH*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::BVH));
772
+ wp_memcpy_h2d(WP_CURRENT_CONTEXT, bvh_device_ptr, &bvh_device_on_host, sizeof(wp::BVH));
773
773
 
774
774
  uint64_t bvh_id = (uint64_t)bvh_device_ptr;
775
775
  wp::bvh_add_descriptor(bvh_id, bvh_device_on_host);
@@ -777,7 +777,7 @@ uint64_t bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, in
777
777
  }
778
778
 
779
779
 
780
- void bvh_destroy_device(uint64_t id)
780
+ void wp_bvh_destroy_device(uint64_t id)
781
781
  {
782
782
  wp::BVH bvh;
783
783
  if (wp::bvh_get_descriptor(id, bvh))
@@ -786,6 +786,6 @@ void bvh_destroy_device(uint64_t id)
786
786
  wp::bvh_rem_descriptor(id);
787
787
 
788
788
  // free descriptor
789
- free_device(WP_CURRENT_CONTEXT, (void*)id);
789
+ wp_free_device(WP_CURRENT_CONTEXT, (void*)id);
790
790
  }
791
791
  }
warp/native/bvh.h CHANGED
@@ -357,7 +357,7 @@ CUDA_CALLABLE inline bvh_query_t bvh_query(
357
357
  BVHPackedNodeHalf node_lower = bvh_load_node(bvh.node_lowers, node_index);
358
358
  BVHPackedNodeHalf node_upper = bvh_load_node(bvh.node_uppers, node_index);
359
359
 
360
- if (!bvh_query_intersection_test(query, (vec3&)node_lower, (vec3&)node_upper))
360
+ if (!bvh_query_intersection_test(query, reinterpret_cast<vec3&>(node_lower), reinterpret_cast<vec3&>(node_upper)))
361
361
  {
362
362
  continue;
363
363
  }
@@ -464,7 +464,7 @@ CUDA_CALLABLE inline bool bvh_query_next(bvh_query_t& query, int& index)
464
464
  wp::vec3 upper_pos(node_upper.x, node_upper.y, node_upper.z);
465
465
  wp::bounds3 current_bounds(lower_pos, upper_pos);
466
466
 
467
- if (!bvh_query_intersection_test(query, (vec3&)node_lower, (vec3&)node_upper))
467
+ if (!bvh_query_intersection_test(query, reinterpret_cast<vec3&>(node_lower), reinterpret_cast<vec3&>(node_upper)))
468
468
  {
469
469
  continue;
470
470
  }
@@ -175,7 +175,7 @@ static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file,
175
175
 
176
176
  clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
177
177
  bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
178
- buffer.release();
178
+ (void)buffer.release();
179
179
 
180
180
  return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
181
181
  }
@@ -240,14 +240,14 @@ static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file,
240
240
 
241
241
  clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
242
242
  bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
243
- buffer.release();
243
+ (void)buffer.release();
244
244
 
245
245
  return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
246
246
  }
247
247
 
248
248
  extern "C" {
249
249
 
250
- WP_API int compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug, bool verify_fp, bool fuse_fp)
250
+ WP_API int wp_compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug, bool verify_fp, bool fuse_fp)
251
251
  {
252
252
  initialize_llvm();
253
253
 
@@ -294,7 +294,7 @@ WP_API int compile_cpp(const char* cpp_src, const char *input_file, const char*
294
294
  return 0;
295
295
  }
296
296
 
297
- WP_API int compile_cuda(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug)
297
+ WP_API int wp_compile_cuda(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug)
298
298
  {
299
299
  initialize_llvm();
300
300
 
@@ -355,7 +355,7 @@ WP_API int compile_cuda(const char* cpp_src, const char *input_file, const char*
355
355
  static llvm::orc::LLJIT* jit = nullptr;
356
356
 
357
357
  // Load an object file into an in-memory DLL named `module_name`
358
- WP_API int load_obj(const char* object_file, const char* module_name)
358
+ WP_API int wp_load_obj(const char* object_file, const char* module_name)
359
359
  {
360
360
  if(!jit)
361
361
  {
@@ -497,7 +497,7 @@ WP_API int load_obj(const char* object_file, const char* module_name)
497
497
  return 0;
498
498
  }
499
499
 
500
- WP_API int unload_obj(const char* module_name)
500
+ WP_API int wp_unload_obj(const char* module_name)
501
501
  {
502
502
  if(!jit) // If there's no JIT instance there are no object files loaded
503
503
  {
@@ -516,7 +516,7 @@ WP_API int unload_obj(const char* module_name)
516
516
  return 0;
517
517
  }
518
518
 
519
- WP_API uint64_t lookup(const char* dll_name, const char* function_name)
519
+ WP_API uint64_t wp_lookup(const char* dll_name, const char* function_name)
520
520
  {
521
521
  auto* dll = jit->getJITDylibByName(dll_name);
522
522
 
warp/native/coloring.cpp CHANGED
@@ -35,6 +35,7 @@
35
35
 
36
36
  #include "warp.h"
37
37
 
38
+ #include <climits>
38
39
  #include <iostream>
39
40
  #include <vector>
40
41
  #include <array>
@@ -338,9 +339,14 @@ public:
338
339
 
339
340
  int get_node_weight(int node_idx)
340
341
  {
342
+ if (node_idx < 0 || node_idx >= (int)node_weights.size()) {
343
+ fprintf(stderr, "The node_idx %d is out of range!\n", node_idx);
344
+ return INT_MIN;
345
+ }
341
346
  return node_weights[node_idx];
342
347
  }
343
348
 
349
+
344
350
  void add_node(int weight, int node_idx)
345
351
  {
346
352
  if (weight >= weight_buckets.size())
@@ -539,7 +545,7 @@ using namespace wp;
539
545
 
540
546
  extern "C"
541
547
  {
542
- int graph_coloring(int num_nodes, wp::array_t<int> edges, int algorithm, wp::array_t<int> node_colors)
548
+ int wp_graph_coloring(int num_nodes, wp::array_t<int> edges, int algorithm, wp::array_t<int> node_colors)
543
549
  {
544
550
  if (node_colors.ndim != 1 || node_colors.shape[0] != num_nodes)
545
551
  {
@@ -594,7 +600,7 @@ extern "C"
594
600
  return num_colors;
595
601
  }
596
602
 
597
- float balance_coloring(int num_nodes, wp::array_t<int> edges, int num_colors,
603
+ float wp_balance_coloring(int num_nodes, wp::array_t<int> edges, int num_colors,
598
604
  float target_max_min_ratio, wp::array_t<int> node_colors)
599
605
  {
600
606
  Graph graph(num_nodes, edges);
warp/native/crt.cpp CHANGED
@@ -41,11 +41,11 @@ extern "C" WP_API void _wp_assert(const char* expression, const char* file, unsi
41
41
  fflush(stdout);
42
42
  fprintf(stderr,
43
43
  "Assertion failed: '%s'\n"
44
- "At '%s:%d'\n",
44
+ "At '%s:%u'\n",
45
45
  expression, file, line);
46
46
  fflush(stderr);
47
47
 
48
48
  // Now invoke the standard assert(), which may abort the program or break
49
49
  // into the debugger as decided by the runtime environment.
50
- assert(false && "assert() failed");
50
+ assert(false && "assert() failed"); // cppcheck-suppress incorrectStringBooleanError
51
51
  }
warp/native/crt.h CHANGED
@@ -110,11 +110,9 @@ extern "C" WP_API int _wp_isinf(double);
110
110
  #define SCHAR_MIN (-128)
111
111
  #define SCHAR_MAX 127
112
112
  #define UCHAR_MAX 255
113
- enum {
114
- _JITIFY_CHAR_IS_UNSIGNED = (char)-1 >= 0,
115
- CHAR_MIN = _JITIFY_CHAR_IS_UNSIGNED ? 0 : SCHAR_MIN,
116
- CHAR_MAX = _JITIFY_CHAR_IS_UNSIGNED ? UCHAR_MAX : SCHAR_MAX,
117
- };
113
+ #define _JITIFY_CHAR_IS_UNSIGNED ((char)-1 >= 0)
114
+ #define CHAR_MIN (_JITIFY_CHAR_IS_UNSIGNED ? 0 : SCHAR_MIN)
115
+ #define CHAR_MAX (_JITIFY_CHAR_IS_UNSIGNED ? UCHAR_MAX : SCHAR_MAX)
118
116
  #define SHRT_MIN (-32768)
119
117
  #define SHRT_MAX 32767
120
118
  #define USHRT_MAX 65535
warp/native/cuda_util.cpp CHANGED
@@ -33,14 +33,14 @@
33
33
  #include <stack>
34
34
 
35
35
  // the minimum CUDA version required from the driver
36
- #define WP_CUDA_DRIVER_VERSION 11040
36
+ #define WP_CUDA_DRIVER_VERSION 12000
37
37
 
38
38
  // the minimum CUDA Toolkit version required to build Warp
39
- #define WP_CUDA_TOOLKIT_VERSION 11050
39
+ #define WP_CUDA_TOOLKIT_VERSION 12000
40
40
 
41
41
  // check if the CUDA Toolkit is too old
42
42
  #if CUDA_VERSION < WP_CUDA_TOOLKIT_VERSION
43
- #error Building Warp requires CUDA Toolkit version 11.5 or higher
43
+ #error Building Warp requires CUDA Toolkit version 12.0 or higher
44
44
  #endif
45
45
 
46
46
  // Avoid including <cudaGLTypedefs.h>, which requires OpenGL headers to be installed.
@@ -56,11 +56,12 @@ typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResou
56
56
 
57
57
  // function pointers to driver API entry points
58
58
  // these are explicitly versioned according to cudaTypedefs.h from CUDA Toolkit WP_CUDA_TOOLKIT_VERSION
59
- #if CUDA_VERSION < 12000
60
- static PFN_cuGetProcAddress_v11030 pfn_cuGetProcAddress;
61
- #else
62
- static PFN_cuGetProcAddress_v12000 pfn_cuGetProcAddress;
59
+
60
+ #if CUDA_VERSION >= 13000
61
+ #define PFN_cuGetProcAddress PFN_cuGetProcAddress_v12000
63
62
  #endif
63
+
64
+ static PFN_cuGetProcAddress_v12000 pfn_cuGetProcAddress;
64
65
  static PFN_cuDriverGetVersion_v2020 pfn_cuDriverGetVersion;
65
66
  static PFN_cuGetErrorName_v6000 pfn_cuGetErrorName;
66
67
  static PFN_cuGetErrorString_v6000 pfn_cuGetErrorString;
@@ -100,6 +101,12 @@ static PFN_cuEventQuery_v2000 pfn_cuEventQuery;
100
101
  static PFN_cuEventRecord_v2000 pfn_cuEventRecord;
101
102
  static PFN_cuEventRecordWithFlags_v11010 pfn_cuEventRecordWithFlags;
102
103
  static PFN_cuEventSynchronize_v2000 pfn_cuEventSynchronize;
104
+ #if CUDA_VERSION >= 12030
105
+ // function used to add conditional graph nodes, not available in older CUDA versions
106
+ static PFN_cuGraphAddNode_v12030 pfn_cuGraphAddNode;
107
+ #endif
108
+ static PFN_cuGraphNodeGetDependentNodes_v10000 pfn_cuGraphNodeGetDependentNodes;
109
+ static PFN_cuGraphNodeGetType_v10000 pfn_cuGraphNodeGetType;
103
110
  static PFN_cuModuleLoadDataEx_v2010 pfn_cuModuleLoadDataEx;
104
111
  static PFN_cuModuleUnload_v2000 pfn_cuModuleUnload;
105
112
  static PFN_cuModuleGetFunction_v2000 pfn_cuModuleGetFunction;
@@ -163,7 +170,7 @@ bool init_cuda_driver()
163
170
  #if defined(_WIN32)
164
171
  static HMODULE hCudaDriver = LoadLibraryA("nvcuda.dll");
165
172
  if (hCudaDriver == NULL) {
166
- fprintf(stderr, "Warp CUDA error: Could not open nvcuda.dll.\n");
173
+ fprintf(stderr, "Warp CUDA warning: Could not find or load the NVIDIA CUDA driver. Proceeding in CPU-only mode.\n");
167
174
  return false;
168
175
  }
169
176
  pfn_cuGetProcAddress = (PFN_cuGetProcAddress)GetProcAddress(hCudaDriver, "cuGetProcAddress");
@@ -173,7 +180,7 @@ bool init_cuda_driver()
173
180
  // WSL and possibly other systems might require the .1 suffix
174
181
  hCudaDriver = dlopen("libcuda.so.1", RTLD_NOW);
175
182
  if (hCudaDriver == NULL) {
176
- fprintf(stderr, "Warp CUDA error: Could not open libcuda.so.\n");
183
+ fprintf(stderr, "Warp CUDA warning: Could not find or load the NVIDIA CUDA driver. Proceeding in CPU-only mode.\n");
177
184
  return false;
178
185
  }
179
186
  }
@@ -243,6 +250,12 @@ bool init_cuda_driver()
243
250
  get_driver_entry_point("cuEventRecord", 2000, &(void*&)pfn_cuEventRecord);
244
251
  get_driver_entry_point("cuEventRecordWithFlags", 11010, &(void*&)pfn_cuEventRecordWithFlags);
245
252
  get_driver_entry_point("cuEventSynchronize", 2000, &(void*&)pfn_cuEventSynchronize);
253
+ #if CUDA_VERSION >= 12030
254
+ if (driver_version >= 12030)
255
+ get_driver_entry_point("cuGraphAddNode", 12030, &(void*&)pfn_cuGraphAddNode);
256
+ #endif
257
+ get_driver_entry_point("cuGraphNodeGetDependentNodes", 10000, &(void*&)pfn_cuGraphNodeGetDependentNodes);
258
+ get_driver_entry_point("cuGraphNodeGetType", 10000, &(void*&)pfn_cuGraphNodeGetType);
246
259
  get_driver_entry_point("cuModuleLoadDataEx", 2010, &(void*&)pfn_cuModuleLoadDataEx);
247
260
  get_driver_entry_point("cuModuleUnload", 2000, &(void*&)pfn_cuModuleUnload);
248
261
  get_driver_entry_point("cuModuleGetFunction", 2000, &(void*&)pfn_cuModuleGetFunction);
@@ -332,7 +345,8 @@ bool get_graph_leaf_nodes(cudaGraph_t graph, std::vector<cudaGraphNode_t>& leaf_
332
345
  for (cudaGraphNode_t node : nodes)
333
346
  {
334
347
  size_t dependent_count;
335
- if (!check_cuda(cudaGraphNodeGetDependentNodes(node, NULL, &dependent_count)))
348
+
349
+ if (!check_cu(cuGraphNodeGetDependentNodes_f(node, NULL, &dependent_count)))
336
350
  return false;
337
351
 
338
352
  if (dependent_count == 0)
@@ -553,6 +567,23 @@ CUresult cuEventSynchronize_f(CUevent event)
553
567
  return pfn_cuEventSynchronize ? pfn_cuEventSynchronize(event) : DRIVER_ENTRY_POINT_ERROR;
554
568
  }
555
569
 
570
+ #if CUDA_VERSION >= 12030
571
+ CUresult cuGraphAddNode_f(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUgraphNodeParams *nodeParams)
572
+ {
573
+ return pfn_cuGraphAddNode ? pfn_cuGraphAddNode(phGraphNode, hGraph, dependencies, dependencyData, numDependencies, nodeParams) : DRIVER_ENTRY_POINT_ERROR;
574
+ }
575
+ #endif
576
+
577
+ CUresult cuGraphNodeGetDependentNodes_f(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes)
578
+ {
579
+ return pfn_cuGraphNodeGetDependentNodes ? pfn_cuGraphNodeGetDependentNodes(hNode, dependentNodes, numDependentNodes) : DRIVER_ENTRY_POINT_ERROR;
580
+ }
581
+
582
+ CUresult cuGraphNodeGetType_f(CUgraphNode hNode, CUgraphNodeType* type)
583
+ {
584
+ return pfn_cuGraphNodeGetType ? pfn_cuGraphNodeGetType(hNode, type) : DRIVER_ENTRY_POINT_ERROR;
585
+ }
586
+
556
587
  CUresult cuModuleLoadDataEx_f(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues)
557
588
  {
558
589
  return pfn_cuModuleLoadDataEx ? pfn_cuModuleLoadDataEx(module, image, numOptions, options, optionValues) : DRIVER_ENTRY_POINT_ERROR;
warp/native/cuda_util.h CHANGED
@@ -38,19 +38,19 @@
38
38
  #define wp_launch_device(context, kernel, dim, args) { \
39
39
  if (dim) { \
40
40
  ContextGuard guard(context); \
41
- cudaStream_t stream = (cudaStream_t)cuda_stream_get_current(); \
41
+ cudaStream_t stream = (cudaStream_t)wp_cuda_stream_get_current(); \
42
42
  const int num_threads = 256; \
43
43
  const int num_blocks = (dim+num_threads-1)/num_threads; \
44
44
  begin_cuda_range(WP_TIMING_KERNEL_BUILTIN, stream, context, #kernel); \
45
45
  kernel<<<num_blocks, 256, 0, stream>>>args; \
46
- check_cuda(cuda_context_check(WP_CURRENT_CONTEXT)); \
46
+ check_cuda(wp_cuda_context_check(WP_CURRENT_CONTEXT)); \
47
47
  end_cuda_range(WP_TIMING_KERNEL_BUILTIN, stream); }}
48
48
  #else
49
49
  // helper for launching kernels (no error checking)
50
50
  #define wp_launch_device(context, kernel, dim, args) { \
51
51
  if (dim) { \
52
52
  ContextGuard guard(context); \
53
- cudaStream_t stream = (cudaStream_t)cuda_stream_get_current(); \
53
+ cudaStream_t stream = (cudaStream_t)wp_cuda_stream_get_current(); \
54
54
  const int num_threads = 256; \
55
55
  const int num_blocks = (dim+num_threads-1)/num_threads; \
56
56
  begin_cuda_range(WP_TIMING_KERNEL_BUILTIN, stream, context, #kernel); \
@@ -99,6 +99,12 @@ CUresult cuEventQuery_f(CUevent event);
99
99
  CUresult cuEventRecord_f(CUevent event, CUstream stream);
100
100
  CUresult cuEventRecordWithFlags_f(CUevent event, CUstream stream, unsigned int flags);
101
101
  CUresult cuEventSynchronize_f(CUevent event);
102
+ #if CUDA_VERSION >= 12030
103
+ // function used to add conditional graph nodes, not available in older CUDA versions
104
+ CUresult cuGraphAddNode_f(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUgraphNodeParams *nodeParams);
105
+ #endif
106
+ CUresult cuGraphNodeGetDependentNodes_f(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
107
+ CUresult cuGraphNodeGetType_f(CUgraphNode hNode, CUgraphNodeType* type);
102
108
  CUresult cuModuleUnload_f(CUmodule hmod);
103
109
  CUresult cuModuleLoadDataEx_f(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
104
110
  CUresult cuModuleGetFunction_f(CUfunction *hfunc, CUmodule hmod, const char *name);
@@ -255,7 +261,7 @@ constexpr int WP_TIMING_GRAPH = 16; // graph launch
255
261
  #define begin_cuda_range(_flag, _stream, _context, _name) \
256
262
  CudaTimingRange _timing_range; \
257
263
  bool _timing_enabled; \
258
- if ((g_cuda_timing_state->flags & _flag) && !cuda_stream_is_capturing(_stream)) { \
264
+ if ((g_cuda_timing_state->flags & _flag) && !wp_cuda_stream_is_capturing(_stream)) { \
259
265
  ContextGuard guard(_context, true); \
260
266
  _timing_enabled = true; \
261
267
  _timing_range.context = _context ? _context : get_current_context(); \