warp-lang 1.8.1 (py3-none-win_amd64.whl) → 1.9.0 (py3-none-win_amd64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- warp/__init__.py +282 -103
- warp/__init__.pyi +482 -110
- warp/bin/warp-clang.dll +0 -0
- warp/bin/warp.dll +0 -0
- warp/build.py +93 -30
- warp/build_dll.py +47 -67
- warp/builtins.py +955 -137
- warp/codegen.py +312 -206
- warp/config.py +1 -1
- warp/context.py +1249 -784
- warp/examples/core/example_marching_cubes.py +1 -0
- warp/examples/core/example_render_opengl.py +100 -3
- warp/examples/fem/example_apic_fluid.py +98 -52
- warp/examples/fem/example_convection_diffusion_dg.py +25 -4
- warp/examples/fem/example_diffusion_mgpu.py +8 -3
- warp/examples/fem/utils.py +68 -22
- warp/fabric.py +1 -1
- warp/fem/cache.py +27 -19
- warp/fem/domain.py +2 -2
- warp/fem/field/nodal_field.py +2 -2
- warp/fem/field/virtual.py +264 -166
- warp/fem/geometry/geometry.py +5 -5
- warp/fem/integrate.py +129 -51
- warp/fem/space/restriction.py +4 -0
- warp/fem/space/shape/tet_shape_function.py +3 -10
- warp/jax_experimental/custom_call.py +1 -1
- warp/jax_experimental/ffi.py +2 -1
- warp/marching_cubes.py +708 -0
- warp/native/array.h +99 -4
- warp/native/builtin.h +82 -5
- warp/native/bvh.cpp +64 -28
- warp/native/bvh.cu +58 -58
- warp/native/bvh.h +2 -2
- warp/native/clang/clang.cpp +7 -7
- warp/native/coloring.cpp +8 -2
- warp/native/crt.cpp +2 -2
- warp/native/crt.h +3 -5
- warp/native/cuda_util.cpp +41 -10
- warp/native/cuda_util.h +10 -4
- warp/native/exports.h +1842 -1908
- warp/native/fabric.h +2 -1
- warp/native/hashgrid.cpp +37 -37
- warp/native/hashgrid.cu +2 -2
- warp/native/initializer_array.h +1 -1
- warp/native/intersect.h +2 -2
- warp/native/mat.h +1910 -116
- warp/native/mathdx.cpp +43 -43
- warp/native/mesh.cpp +24 -24
- warp/native/mesh.cu +26 -26
- warp/native/mesh.h +4 -2
- warp/native/nanovdb/GridHandle.h +179 -12
- warp/native/nanovdb/HostBuffer.h +8 -7
- warp/native/nanovdb/NanoVDB.h +517 -895
- warp/native/nanovdb/NodeManager.h +323 -0
- warp/native/nanovdb/PNanoVDB.h +2 -2
- warp/native/quat.h +331 -14
- warp/native/range.h +7 -1
- warp/native/reduce.cpp +10 -10
- warp/native/reduce.cu +13 -14
- warp/native/runlength_encode.cpp +2 -2
- warp/native/runlength_encode.cu +5 -5
- warp/native/scan.cpp +3 -3
- warp/native/scan.cu +4 -4
- warp/native/sort.cpp +10 -10
- warp/native/sort.cu +22 -22
- warp/native/sparse.cpp +8 -8
- warp/native/sparse.cu +13 -13
- warp/native/spatial.h +366 -17
- warp/native/temp_buffer.h +2 -2
- warp/native/tile.h +283 -69
- warp/native/vec.h +381 -14
- warp/native/volume.cpp +54 -54
- warp/native/volume.cu +1 -1
- warp/native/volume.h +2 -1
- warp/native/volume_builder.cu +30 -37
- warp/native/warp.cpp +150 -149
- warp/native/warp.cu +323 -192
- warp/native/warp.h +227 -226
- warp/optim/linear.py +736 -271
- warp/render/imgui_manager.py +289 -0
- warp/render/render_opengl.py +85 -6
- warp/sim/graph_coloring.py +2 -2
- warp/sparse.py +558 -175
- warp/tests/aux_test_module_aot.py +7 -0
- warp/tests/cuda/test_async.py +3 -3
- warp/tests/cuda/test_conditional_captures.py +101 -0
- warp/tests/geometry/test_marching_cubes.py +233 -12
- warp/tests/sim/test_coloring.py +6 -6
- warp/tests/test_array.py +56 -5
- warp/tests/test_codegen.py +3 -2
- warp/tests/test_context.py +8 -15
- warp/tests/test_enum.py +136 -0
- warp/tests/test_examples.py +2 -2
- warp/tests/test_fem.py +45 -2
- warp/tests/test_fixedarray.py +229 -0
- warp/tests/test_func.py +18 -15
- warp/tests/test_future_annotations.py +7 -5
- warp/tests/test_linear_solvers.py +30 -0
- warp/tests/test_map.py +1 -1
- warp/tests/test_mat.py +1518 -378
- warp/tests/test_mat_assign_copy.py +178 -0
- warp/tests/test_mat_constructors.py +574 -0
- warp/tests/test_module_aot.py +287 -0
- warp/tests/test_print.py +69 -0
- warp/tests/test_quat.py +140 -34
- warp/tests/test_quat_assign_copy.py +145 -0
- warp/tests/test_reload.py +2 -1
- warp/tests/test_sparse.py +71 -0
- warp/tests/test_spatial.py +140 -34
- warp/tests/test_spatial_assign_copy.py +160 -0
- warp/tests/test_struct.py +43 -3
- warp/tests/test_types.py +0 -20
- warp/tests/test_vec.py +179 -34
- warp/tests/test_vec_assign_copy.py +143 -0
- warp/tests/tile/test_tile.py +184 -18
- warp/tests/tile/test_tile_cholesky.py +605 -0
- warp/tests/tile/test_tile_load.py +169 -0
- warp/tests/tile/test_tile_mathdx.py +2 -558
- warp/tests/tile/test_tile_matmul.py +1 -1
- warp/tests/tile/test_tile_mlp.py +1 -1
- warp/tests/tile/test_tile_shared_memory.py +5 -5
- warp/tests/unittest_suites.py +6 -0
- warp/tests/walkthrough_debug.py +1 -1
- warp/thirdparty/unittest_parallel.py +108 -9
- warp/types.py +554 -264
- warp/utils.py +68 -86
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/RECORD +131 -121
- warp/native/marching.cpp +0 -19
- warp/native/marching.cu +0 -514
- warp/native/marching.h +0 -19
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
warp/native/bvh.cu
CHANGED

@@ -155,7 +155,7 @@ void bvh_refit_device(BVH& bvh)
     ContextGuard guard(bvh.context);

     // clear child counters
-
+    wp_memset_device(WP_CURRENT_CONTEXT, bvh.node_counts, 0, sizeof(int) * bvh.max_nodes);
     wp_launch_device(WP_CURRENT_CONTEXT, bvh_refit_kernel, bvh.num_leaf_nodes, (bvh.num_leaf_nodes, bvh.node_parents, bvh.node_counts, bvh.primitive_indices, bvh.node_lowers, bvh.node_uppers, bvh.item_lowers, bvh.item_uppers));
 }

@@ -474,16 +474,16 @@ LinearBVHBuilderGPU::LinearBVHBuilderGPU()
     , total_upper(NULL)
     , total_inv_edges(NULL)
 {
-    total_lower = (vec3*)
-    total_upper = (vec3*)
-    total_inv_edges = (vec3*)
+    total_lower = (vec3*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
+    total_upper = (vec3*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
+    total_inv_edges = (vec3*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
 }

 LinearBVHBuilderGPU::~LinearBVHBuilderGPU()
 {
-
-
-
+    wp_free_device(WP_CURRENT_CONTEXT, total_lower);
+    wp_free_device(WP_CURRENT_CONTEXT, total_upper);
+    wp_free_device(WP_CURRENT_CONTEXT, total_inv_edges);
 }


@@ -491,12 +491,12 @@ LinearBVHBuilderGPU::~LinearBVHBuilderGPU()
 void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* item_uppers, int num_items, bounds3* total_bounds)
 {
     // allocate temporary memory used during building
-    indices = (int*)
-    keys = (int*)
-    deltas = (int*)
-    range_lefts = (int*)
-    range_rights = (int*)
-    num_children = (int*)
+    indices = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2);  // *2 for radix sort
+    keys = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2);     // *2 for radix sort
+    deltas = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items);     // highest differentiating bit between keys for item i and i+1
+    range_lefts = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
+    range_rights = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
+    num_children = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);

     // if total bounds supplied by the host then we just
     // compute our edge length and upload it to the GPU directly
@@ -508,17 +508,17 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i

         vec3 inv_edges = vec3(1.0f/edges[0], 1.0f/edges[1], 1.0f/edges[2]);

-
-
-
+        wp_memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &total_bounds->lower[0], sizeof(vec3));
+        wp_memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &total_bounds->upper[0], sizeof(vec3));
+        wp_memcpy_h2d(WP_CURRENT_CONTEXT, total_inv_edges, &inv_edges[0], sizeof(vec3));
     }
     else
     {
         static vec3 upper(-FLT_MAX);
         static vec3 lower(FLT_MAX);

-
-
+        wp_memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &lower, sizeof(lower));
+        wp_memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &upper, sizeof(upper));

         // compute the total bounds on the GPU
         wp_launch_device(WP_CURRENT_CONTEXT, compute_total_bounds, num_items, (item_lowers, item_uppers, total_lower, total_upper, num_items));
@@ -532,7 +532,7 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i

     // sort items based on Morton key (note the 32-bit sort key corresponds to the template parameter to morton3, i.e. 3x9 bit keys combined)
     radix_sort_pairs_device(WP_CURRENT_CONTEXT, keys, indices, num_items);
-
+    wp_memcpy_d2d(WP_CURRENT_CONTEXT, bvh.primitive_indices, indices, sizeof(int) * num_items);

     // calculate deltas between adjacent keys
     wp_launch_device(WP_CURRENT_CONTEXT, compute_key_deltas, num_items, (keys, deltas, num_items-1));
@@ -541,20 +541,20 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
     wp_launch_device(WP_CURRENT_CONTEXT, build_leaves, num_items, (item_lowers, item_uppers, num_items, indices, range_lefts, range_rights, bvh.node_lowers, bvh.node_uppers));

     // reset children count, this is our atomic counter so we know when an internal node is complete, only used during building
-
+    wp_memset_device(WP_CURRENT_CONTEXT, num_children, 0, sizeof(int)*bvh.max_nodes);

     // build the tree and internal node bounds
     wp_launch_device(WP_CURRENT_CONTEXT, build_hierarchy, num_items, (num_items, bvh.root, deltas, num_children, bvh.primitive_indices, range_lefts, range_rights, bvh.node_parents, bvh.node_lowers, bvh.node_uppers));
     wp_launch_device(WP_CURRENT_CONTEXT, mark_packed_leaf_nodes, bvh.max_nodes, (bvh.max_nodes, range_lefts, range_rights, bvh.node_parents, bvh.node_lowers, bvh.node_uppers));

     // free temporary memory
-
-
-
+    wp_free_device(WP_CURRENT_CONTEXT, indices);
+    wp_free_device(WP_CURRENT_CONTEXT, keys);
+    wp_free_device(WP_CURRENT_CONTEXT, deltas);

-
-
-
+    wp_free_device(WP_CURRENT_CONTEXT, range_lefts);
+    wp_free_device(WP_CURRENT_CONTEXT, range_rights);
+    wp_free_device(WP_CURRENT_CONTEXT, num_children);

 }

@@ -562,8 +562,8 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
 template<typename T>
 T* make_device_buffer_of(void* context, T* host_buffer, size_t buffer_size)
 {
-    T* device_buffer = (T*)
-
+    T* device_buffer = (T*)wp_alloc_device(context, sizeof(T) * buffer_size);;
+    wp_memcpy_h2d(context, device_buffer, host_buffer, sizeof(T) * buffer_size);

     return device_buffer;
 }
@@ -662,8 +662,8 @@ void copy_host_tree_to_device(void* context, BVH& bvh_host, BVH& bvh_device_on_h
     bvh_device_on_host.num_items = bvh_host.num_items;
     bvh_device_on_host.max_depth = bvh_host.max_depth;

-    bvh_device_on_host.root = (int*)
-
+    bvh_device_on_host.root = (int*)wp_alloc_device(context, sizeof(int));
+    wp_memcpy_h2d(context, bvh_device_on_host.root, bvh_host.root, sizeof(int));
     bvh_device_on_host.context = context;

     bvh_device_on_host.node_lowers = make_device_buffer_of(context, bvh_host.node_lowers, bvh_host.max_nodes);
@@ -682,12 +682,12 @@ void bvh_create_device(void* context, vec3* lowers, vec3* uppers, int num_items,
         // copy bounds back to CPU
         std::vector<vec3> lowers_host(num_items);
         std::vector<vec3> uppers_host(num_items);
-
-
+        wp_memcpy_d2h(WP_CURRENT_CONTEXT, lowers_host.data(), lowers, sizeof(vec3) * num_items);
+        wp_memcpy_d2h(WP_CURRENT_CONTEXT, uppers_host.data(), uppers, sizeof(vec3) * num_items);

         // run CPU based constructor
         wp::BVH bvh_host;
-        bvh_create_host(lowers_host.data(), uppers_host.data(), num_items, constructor_type, bvh_host);
+        wp::bvh_create_host(lowers_host.data(), uppers_host.data(), num_items, constructor_type, bvh_host);

         // copy host tree to device
         wp::copy_host_tree_to_device(WP_CURRENT_CONTEXT, bvh_host, bvh_device_on_host);
@@ -695,26 +695,26 @@
         bvh_device_on_host.item_lowers = lowers;
         bvh_device_on_host.item_uppers = uppers;
         // node_counts is not allocated for host tree
-        bvh_device_on_host.node_counts = (int*)
-        bvh_destroy_host(bvh_host);
+        bvh_device_on_host.node_counts = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
+        wp::bvh_destroy_host(bvh_host);
     }
     else if (constructor_type == BVH_CONSTRUCTOR_LBVH)
     {
         bvh_device_on_host.num_items = num_items;
         bvh_device_on_host.max_nodes = 2 * num_items - 1;
         bvh_device_on_host.num_leaf_nodes = num_items;
-        bvh_device_on_host.node_lowers = (BVHPackedNodeHalf*)
-
-        bvh_device_on_host.node_uppers = (BVHPackedNodeHalf*)
-
-        bvh_device_on_host.node_parents = (int*)
-        bvh_device_on_host.node_counts = (int*)
-        bvh_device_on_host.root = (int*)
-        bvh_device_on_host.primitive_indices = (int*)
+        bvh_device_on_host.node_lowers = (BVHPackedNodeHalf*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
+        wp_memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_lowers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
+        bvh_device_on_host.node_uppers = (BVHPackedNodeHalf*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
+        wp_memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_uppers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
+        bvh_device_on_host.node_parents = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
+        bvh_device_on_host.node_counts = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
+        bvh_device_on_host.root = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int));
+        bvh_device_on_host.primitive_indices = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * num_items);
         bvh_device_on_host.item_lowers = lowers;
         bvh_device_on_host.item_uppers = uppers;

-        bvh_device_on_host.context = context ? context :
+        bvh_device_on_host.context = context ? context : wp_cuda_context_get_current();

         LinearBVHBuilderGPU builder;
         builder.build(bvh_device_on_host, lowers, uppers, num_items, NULL);
@@ -729,26 +729,26 @@ void bvh_destroy_device(BVH& bvh)
 {
     ContextGuard guard(bvh.context);

-
-
-
-
-
-
+    wp_free_device(WP_CURRENT_CONTEXT, bvh.node_lowers); bvh.node_lowers = NULL;
+    wp_free_device(WP_CURRENT_CONTEXT, bvh.node_uppers); bvh.node_uppers = NULL;
+    wp_free_device(WP_CURRENT_CONTEXT, bvh.node_parents); bvh.node_parents = NULL;
+    wp_free_device(WP_CURRENT_CONTEXT, bvh.node_counts); bvh.node_counts = NULL;
+    wp_free_device(WP_CURRENT_CONTEXT, bvh.primitive_indices); bvh.primitive_indices = NULL;
+    wp_free_device(WP_CURRENT_CONTEXT, bvh.root); bvh.root = NULL;
 }


 } // namespace wp


-void
+void wp_bvh_refit_device(uint64_t id)
 {
     wp::BVH bvh;
     if (bvh_get_descriptor(id, bvh))
     {
         ContextGuard guard(bvh.context);

-        bvh_refit_device(bvh);
+        wp::bvh_refit_device(bvh);
     }
 }

@@ -759,17 +759,17 @@ void bvh_refit_device(uint64_t id)
  * muted. However, the muted leaf nodes will still have the pointer to their parents, thus the up-tracing
  * can still work. We will only compute the bounding box of a leaf node if its parent is not a leaf node.
  */
-uint64_t
+uint64_t wp_bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, int num_items, int constructor_type)
 {
     ContextGuard guard(context);
     wp::BVH bvh_device_on_host;
     wp::BVH* bvh_device_ptr = nullptr;

-    bvh_create_device(WP_CURRENT_CONTEXT, lowers, uppers, num_items, constructor_type, bvh_device_on_host);
+    wp::bvh_create_device(WP_CURRENT_CONTEXT, lowers, uppers, num_items, constructor_type, bvh_device_on_host);

     // create device-side BVH descriptor
-    bvh_device_ptr = (wp::BVH*)
-
+    bvh_device_ptr = (wp::BVH*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::BVH));
+    wp_memcpy_h2d(WP_CURRENT_CONTEXT, bvh_device_ptr, &bvh_device_on_host, sizeof(wp::BVH));

     uint64_t bvh_id = (uint64_t)bvh_device_ptr;
     wp::bvh_add_descriptor(bvh_id, bvh_device_on_host);
@@ -777,7 +777,7 @@ uint64_t bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, in
 }


-void
+void wp_bvh_destroy_device(uint64_t id)
 {
     wp::BVH bvh;
     if (wp::bvh_get_descriptor(id, bvh))
@@ -786,6 +786,6 @@ void bvh_destroy_device(uint64_t id)
         wp::bvh_rem_descriptor(id);

         // free descriptor
-
+        wp_free_device(WP_CURRENT_CONTEXT, (void*)id);
     }
 }
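The hunks above all follow the same allocate/initialize/copy/free pattern built on the renamed wp_-prefixed device helpers. As a reading aid, here is a minimal sketch of that pattern using only the call shapes visible in this diff (wp_alloc_device(context, size), wp_memset_device(context, ptr, value, size), wp_memcpy_h2d(context, dst, src, size), wp_free_device(context, ptr)); the function name and the ctx/host_data parameters are made up for illustration.

    #include "warp.h"  // assumed to declare the wp_* device helpers, which bvh.cu calls directly

    // Sketch only: allocate a scratch buffer of n ints on the device, clear it,
    // upload host data, then release it once the kernels that consume it are done.
    void scratch_roundtrip(void* ctx, int* host_data, int n)
    {
        int* scratch = (int*)wp_alloc_device(ctx, sizeof(int) * n);   // device allocation
        wp_memset_device(ctx, scratch, 0, sizeof(int) * n);           // zero-initialize
        wp_memcpy_h2d(ctx, scratch, host_data, sizeof(int) * n);      // host -> device upload
        // ... launch kernels that read/write `scratch` here ...
        wp_free_device(ctx, scratch);                                 // free temporary memory
    }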
warp/native/bvh.h
CHANGED

@@ -357,7 +357,7 @@ CUDA_CALLABLE inline bvh_query_t bvh_query(
         BVHPackedNodeHalf node_lower = bvh_load_node(bvh.node_lowers, node_index);
         BVHPackedNodeHalf node_upper = bvh_load_node(bvh.node_uppers, node_index);

-
+        if (!bvh_query_intersection_test(query, reinterpret_cast<vec3&>(node_lower), reinterpret_cast<vec3&>(node_upper)))
         {
             continue;
         }
@@ -464,7 +464,7 @@ CUDA_CALLABLE inline bool bvh_query_next(bvh_query_t& query, int& index)
         wp::vec3 upper_pos(node_upper.x, node_upper.y, node_upper.z);
         wp::bounds3 current_bounds(lower_pos, upper_pos);

-
+        if (!bvh_query_intersection_test(query, reinterpret_cast<vec3&>(node_lower), reinterpret_cast<vec3&>(node_upper)))
         {
             continue;
         }
warp/native/clang/clang.cpp
CHANGED

@@ -175,7 +175,7 @@ static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file,

     clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
     bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
-    buffer.release();
+    (void)buffer.release();

     return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
 }
@@ -240,14 +240,14 @@ static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file,

     clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
     bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
-    buffer.release();
+    (void)buffer.release();

     return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
 }

 extern "C" {

-WP_API int
+WP_API int wp_compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug, bool verify_fp, bool fuse_fp)
 {
     initialize_llvm();

@@ -294,7 +294,7 @@ WP_API int compile_cpp(const char* cpp_src, const char *input_file, const char*
     return 0;
 }

-WP_API int
+WP_API int wp_compile_cuda(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug)
 {
     initialize_llvm();

@@ -355,7 +355,7 @@ WP_API int compile_cuda(const char* cpp_src, const char *input_file, const char*
 static llvm::orc::LLJIT* jit = nullptr;

 // Load an object file into an in-memory DLL named `module_name`
-WP_API int
+WP_API int wp_load_obj(const char* object_file, const char* module_name)
 {
     if(!jit)
     {
@@ -497,7 +497,7 @@ WP_API int load_obj(const char* object_file, const char* module_name)
     return 0;
 }

-WP_API int
+WP_API int wp_unload_obj(const char* module_name)
 {
     if(!jit) // If there's no JIT instance there are no object files loaded
     {
@@ -516,7 +516,7 @@ WP_API int unload_obj(const char* module_name)
     return 0;
 }

-WP_API uint64_t
+WP_API uint64_t wp_lookup(const char* dll_name, const char* function_name)
 {
     auto* dll = jit->getJITDylibByName(dll_name);

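The exported JIT entry points gain a wp_ prefix but keep the parameter lists shown above. The sketch below strings them together into a compile/load/lookup round trip; the declarations are copied from the hunks, while the file names, module name, symbol name, and the assumption that a zero return value means success (suggested by the surrounding `return 0;` lines) are illustrative.

    #include <cstdint>

    // Declarations as they appear in the diff above.
    extern "C" int wp_compile_cpp(const char* cpp_src, const char* input_file, const char* include_dir,
                                  const char* output_file, bool debug, bool verify_fp, bool fuse_fp);
    extern "C" int wp_load_obj(const char* object_file, const char* module_name);
    extern "C" uint64_t wp_lookup(const char* dll_name, const char* function_name);
    extern "C" int wp_unload_obj(const char* module_name);

    // Sketch only: compile a source string, load the resulting object into the in-memory
    // JIT under a module name, resolve a symbol, then unload. Paths and names are placeholders.
    int jit_roundtrip(const char* src)
    {
        if (wp_compile_cpp(src, "kernel.cpp", "include/", "kernel.o",
                           /*debug*/ false, /*verify_fp*/ false, /*fuse_fp*/ true) != 0)
            return -1;
        if (wp_load_obj("kernel.o", "kernel_module") != 0)
            return -1;
        uint64_t fn = wp_lookup("kernel_module", "my_kernel_entry");  // hypothetical symbol name
        // ... cast `fn` to the expected function pointer type and call it ...
        wp_unload_obj("kernel_module");
        return fn ? 0 : -1;
    }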
warp/native/coloring.cpp
CHANGED

@@ -35,6 +35,7 @@

 #include "warp.h"

+#include <climits>
 #include <iostream>
 #include <vector>
 #include <array>
@@ -338,9 +339,14 @@ public:

     int get_node_weight(int node_idx)
     {
+        if (node_idx < 0 || node_idx >= (int)node_weights.size()) {
+            fprintf(stderr, "The node_idx %d is out of range!\n", node_idx);
+            return INT_MIN;
+        }
         return node_weights[node_idx];
     }

+
     void add_node(int weight, int node_idx)
     {
         if (weight >= weight_buckets.size())
@@ -539,7 +545,7 @@ using namespace wp;

 extern "C"
 {
-    int
+    int wp_graph_coloring(int num_nodes, wp::array_t<int> edges, int algorithm, wp::array_t<int> node_colors)
     {
         if (node_colors.ndim != 1 || node_colors.shape[0] != num_nodes)
         {
@@ -594,7 +600,7 @@ extern "C"
         return num_colors;
     }

-    float
+    float wp_balance_coloring(int num_nodes, wp::array_t<int> edges, int num_colors,
         float target_max_min_ratio, wp::array_t<int> node_colors)
     {
         Graph graph(num_nodes, edges);
warp/native/crt.cpp
CHANGED

@@ -41,11 +41,11 @@ extern "C" WP_API void _wp_assert(const char* expression, const char* file, unsi
     fflush(stdout);
     fprintf(stderr,
         "Assertion failed: '%s'\n"
-        "At '%s:%
+        "At '%s:%u'\n",
         expression, file, line);
     fflush(stderr);

     // Now invoke the standard assert(), which may abort the program or break
     // into the debugger as decided by the runtime environment.
-    assert(false && "assert() failed");
+    assert(false && "assert() failed"); // cppcheck-suppress incorrectStringBooleanError
 }
warp/native/crt.h
CHANGED

@@ -110,11 +110,9 @@ extern "C" WP_API int _wp_isinf(double);
 #define SCHAR_MIN (-128)
 #define SCHAR_MAX 127
 #define UCHAR_MAX 255
-
-
-
-    CHAR_MAX = _JITIFY_CHAR_IS_UNSIGNED ? UCHAR_MAX : SCHAR_MAX,
-};
+#define _JITIFY_CHAR_IS_UNSIGNED ((char)-1 >= 0)
+#define CHAR_MIN (_JITIFY_CHAR_IS_UNSIGNED ? 0 : SCHAR_MIN)
+#define CHAR_MAX (_JITIFY_CHAR_IS_UNSIGNED ? UCHAR_MAX : SCHAR_MAX)
 #define SHRT_MIN (-32768)
 #define SHRT_MAX 32767
 #define USHRT_MAX 65535
warp/native/cuda_util.cpp
CHANGED

@@ -33,14 +33,14 @@
 #include <stack>

 // the minimum CUDA version required from the driver
-#define WP_CUDA_DRIVER_VERSION
+#define WP_CUDA_DRIVER_VERSION 12000

 // the minimum CUDA Toolkit version required to build Warp
-#define WP_CUDA_TOOLKIT_VERSION
+#define WP_CUDA_TOOLKIT_VERSION 12000

 // check if the CUDA Toolkit is too old
 #if CUDA_VERSION < WP_CUDA_TOOLKIT_VERSION
-#error Building Warp requires CUDA Toolkit version
+#error Building Warp requires CUDA Toolkit version 12.0 or higher
 #endif

 // Avoid including <cudaGLTypedefs.h>, which requires OpenGL headers to be installed.
@@ -56,11 +56,12 @@ typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResou

 // function pointers to driver API entry points
 // these are explicitly versioned according to cudaTypedefs.h from CUDA Toolkit WP_CUDA_TOOLKIT_VERSION
-
-
-#
-static PFN_cuGetProcAddress_v12000 pfn_cuGetProcAddress;
+
+#if CUDA_VERSION >= 13000
+#define PFN_cuGetProcAddress PFN_cuGetProcAddress_v12000
 #endif
+
+static PFN_cuGetProcAddress_v12000 pfn_cuGetProcAddress;
 static PFN_cuDriverGetVersion_v2020 pfn_cuDriverGetVersion;
 static PFN_cuGetErrorName_v6000 pfn_cuGetErrorName;
 static PFN_cuGetErrorString_v6000 pfn_cuGetErrorString;
@@ -100,6 +101,12 @@ static PFN_cuEventQuery_v2000 pfn_cuEventQuery;
 static PFN_cuEventRecord_v2000 pfn_cuEventRecord;
 static PFN_cuEventRecordWithFlags_v11010 pfn_cuEventRecordWithFlags;
 static PFN_cuEventSynchronize_v2000 pfn_cuEventSynchronize;
+#if CUDA_VERSION >= 12030
+// function used to add conditional graph nodes, not available in older CUDA versions
+static PFN_cuGraphAddNode_v12030 pfn_cuGraphAddNode;
+#endif
+static PFN_cuGraphNodeGetDependentNodes_v10000 pfn_cuGraphNodeGetDependentNodes;
+static PFN_cuGraphNodeGetType_v10000 pfn_cuGraphNodeGetType;
 static PFN_cuModuleLoadDataEx_v2010 pfn_cuModuleLoadDataEx;
 static PFN_cuModuleUnload_v2000 pfn_cuModuleUnload;
 static PFN_cuModuleGetFunction_v2000 pfn_cuModuleGetFunction;
@@ -163,7 +170,7 @@ bool init_cuda_driver()
 #if defined(_WIN32)
     static HMODULE hCudaDriver = LoadLibraryA("nvcuda.dll");
     if (hCudaDriver == NULL) {
-        fprintf(stderr, "Warp CUDA
+        fprintf(stderr, "Warp CUDA warning: Could not find or load the NVIDIA CUDA driver. Proceeding in CPU-only mode.\n");
         return false;
     }
     pfn_cuGetProcAddress = (PFN_cuGetProcAddress)GetProcAddress(hCudaDriver, "cuGetProcAddress");
@@ -173,7 +180,7 @@ bool init_cuda_driver()
         // WSL and possibly other systems might require the .1 suffix
         hCudaDriver = dlopen("libcuda.so.1", RTLD_NOW);
         if (hCudaDriver == NULL) {
-            fprintf(stderr, "Warp CUDA
+            fprintf(stderr, "Warp CUDA warning: Could not find or load the NVIDIA CUDA driver. Proceeding in CPU-only mode.\n");
             return false;
         }
     }
@@ -243,6 +250,12 @@ bool init_cuda_driver()
     get_driver_entry_point("cuEventRecord", 2000, &(void*&)pfn_cuEventRecord);
     get_driver_entry_point("cuEventRecordWithFlags", 11010, &(void*&)pfn_cuEventRecordWithFlags);
     get_driver_entry_point("cuEventSynchronize", 2000, &(void*&)pfn_cuEventSynchronize);
+#if CUDA_VERSION >= 12030
+    if (driver_version >= 12030)
+        get_driver_entry_point("cuGraphAddNode", 12030, &(void*&)pfn_cuGraphAddNode);
+#endif
+    get_driver_entry_point("cuGraphNodeGetDependentNodes", 10000, &(void*&)pfn_cuGraphNodeGetDependentNodes);
+    get_driver_entry_point("cuGraphNodeGetType", 10000, &(void*&)pfn_cuGraphNodeGetType);
     get_driver_entry_point("cuModuleLoadDataEx", 2010, &(void*&)pfn_cuModuleLoadDataEx);
     get_driver_entry_point("cuModuleUnload", 2000, &(void*&)pfn_cuModuleUnload);
     get_driver_entry_point("cuModuleGetFunction", 2000, &(void*&)pfn_cuModuleGetFunction);
@@ -332,7 +345,8 @@ bool get_graph_leaf_nodes(cudaGraph_t graph, std::vector<cudaGraphNode_t>& leaf_
     for (cudaGraphNode_t node : nodes)
     {
         size_t dependent_count;
-
+
+        if (!check_cu(cuGraphNodeGetDependentNodes_f(node, NULL, &dependent_count)))
             return false;

         if (dependent_count == 0)
@@ -553,6 +567,23 @@ CUresult cuEventSynchronize_f(CUevent event)
     return pfn_cuEventSynchronize ? pfn_cuEventSynchronize(event) : DRIVER_ENTRY_POINT_ERROR;
 }

+#if CUDA_VERSION >= 12030
+CUresult cuGraphAddNode_f(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUgraphNodeParams *nodeParams)
+{
+    return pfn_cuGraphAddNode ? pfn_cuGraphAddNode(phGraphNode, hGraph, dependencies, dependencyData, numDependencies, nodeParams) : DRIVER_ENTRY_POINT_ERROR;
+}
+#endif
+
+CUresult cuGraphNodeGetDependentNodes_f(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes)
+{
+    return pfn_cuGraphNodeGetDependentNodes ? pfn_cuGraphNodeGetDependentNodes(hNode, dependentNodes, numDependentNodes) : DRIVER_ENTRY_POINT_ERROR;
+}
+
+CUresult cuGraphNodeGetType_f(CUgraphNode hNode, CUgraphNodeType* type)
+{
+    return pfn_cuGraphNodeGetType ? pfn_cuGraphNodeGetType(hNode, type) : DRIVER_ENTRY_POINT_ERROR;
+}
+
 CUresult cuModuleLoadDataEx_f(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues)
 {
     return pfn_cuModuleLoadDataEx ? pfn_cuModuleLoadDataEx(module, image, numOptions, options, optionValues) : DRIVER_ENTRY_POINT_ERROR;
warp/native/cuda_util.h
CHANGED

@@ -38,19 +38,19 @@
 #define wp_launch_device(context, kernel, dim, args) { \
     if (dim) { \
     ContextGuard guard(context); \
-    cudaStream_t stream = (cudaStream_t)
+    cudaStream_t stream = (cudaStream_t)wp_cuda_stream_get_current(); \
     const int num_threads = 256; \
     const int num_blocks = (dim+num_threads-1)/num_threads; \
     begin_cuda_range(WP_TIMING_KERNEL_BUILTIN, stream, context, #kernel); \
     kernel<<<num_blocks, 256, 0, stream>>>args; \
-    check_cuda(
+    check_cuda(wp_cuda_context_check(WP_CURRENT_CONTEXT)); \
     end_cuda_range(WP_TIMING_KERNEL_BUILTIN, stream); }}
 #else
 // helper for launching kernels (no error checking)
 #define wp_launch_device(context, kernel, dim, args) { \
     if (dim) { \
     ContextGuard guard(context); \
-    cudaStream_t stream = (cudaStream_t)
+    cudaStream_t stream = (cudaStream_t)wp_cuda_stream_get_current(); \
     const int num_threads = 256; \
     const int num_blocks = (dim+num_threads-1)/num_threads; \
     begin_cuda_range(WP_TIMING_KERNEL_BUILTIN, stream, context, #kernel); \
@@ -99,6 +99,12 @@ CUresult cuEventQuery_f(CUevent event);
 CUresult cuEventRecord_f(CUevent event, CUstream stream);
 CUresult cuEventRecordWithFlags_f(CUevent event, CUstream stream, unsigned int flags);
 CUresult cuEventSynchronize_f(CUevent event);
+#if CUDA_VERSION >= 12030
+// function used to add conditional graph nodes, not available in older CUDA versions
+CUresult cuGraphAddNode_f(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUgraphNodeParams *nodeParams);
+#endif
+CUresult cuGraphNodeGetDependentNodes_f(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
+CUresult cuGraphNodeGetType_f(CUgraphNode hNode, CUgraphNodeType* type);
 CUresult cuModuleUnload_f(CUmodule hmod);
 CUresult cuModuleLoadDataEx_f(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
 CUresult cuModuleGetFunction_f(CUfunction *hfunc, CUmodule hmod, const char *name);
@@ -255,7 +261,7 @@ constexpr int WP_TIMING_GRAPH = 16; // graph launch
 #define begin_cuda_range(_flag, _stream, _context, _name) \
     CudaTimingRange _timing_range; \
     bool _timing_enabled; \
-    if ((g_cuda_timing_state->flags & _flag) && !
+    if ((g_cuda_timing_state->flags & _flag) && !wp_cuda_stream_is_capturing(_stream)) { \
         ContextGuard guard(_context, true); \
         _timing_enabled = true; \
         _timing_range.context = _context ? _context : get_current_context(); \