warp-lang: warp_lang-1.8.1-py3-none-manylinux_2_34_aarch64.whl → warp_lang-1.9.1-py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (141)
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +1904 -114
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +331 -101
  7. warp/builtins.py +1244 -160
  8. warp/codegen.py +317 -206
  9. warp/config.py +1 -1
  10. warp/context.py +1465 -789
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/examples/interop/example_jax_kernel.py +2 -1
  18. warp/fabric.py +1 -1
  19. warp/fem/cache.py +27 -19
  20. warp/fem/domain.py +2 -2
  21. warp/fem/field/nodal_field.py +2 -2
  22. warp/fem/field/virtual.py +264 -166
  23. warp/fem/geometry/geometry.py +5 -5
  24. warp/fem/integrate.py +129 -51
  25. warp/fem/space/restriction.py +4 -0
  26. warp/fem/space/shape/tet_shape_function.py +3 -10
  27. warp/jax_experimental/custom_call.py +25 -2
  28. warp/jax_experimental/ffi.py +22 -1
  29. warp/jax_experimental/xla_ffi.py +16 -7
  30. warp/marching_cubes.py +708 -0
  31. warp/native/array.h +99 -4
  32. warp/native/builtin.h +86 -9
  33. warp/native/bvh.cpp +64 -28
  34. warp/native/bvh.cu +58 -58
  35. warp/native/bvh.h +2 -2
  36. warp/native/clang/clang.cpp +7 -7
  37. warp/native/coloring.cpp +8 -2
  38. warp/native/crt.cpp +2 -2
  39. warp/native/crt.h +3 -5
  40. warp/native/cuda_util.cpp +41 -10
  41. warp/native/cuda_util.h +10 -4
  42. warp/native/exports.h +1842 -1908
  43. warp/native/fabric.h +2 -1
  44. warp/native/hashgrid.cpp +37 -37
  45. warp/native/hashgrid.cu +2 -2
  46. warp/native/initializer_array.h +1 -1
  47. warp/native/intersect.h +2 -2
  48. warp/native/mat.h +1910 -116
  49. warp/native/mathdx.cpp +43 -43
  50. warp/native/mesh.cpp +24 -24
  51. warp/native/mesh.cu +26 -26
  52. warp/native/mesh.h +4 -2
  53. warp/native/nanovdb/GridHandle.h +179 -12
  54. warp/native/nanovdb/HostBuffer.h +8 -7
  55. warp/native/nanovdb/NanoVDB.h +517 -895
  56. warp/native/nanovdb/NodeManager.h +323 -0
  57. warp/native/nanovdb/PNanoVDB.h +2 -2
  58. warp/native/quat.h +331 -14
  59. warp/native/range.h +7 -1
  60. warp/native/reduce.cpp +10 -10
  61. warp/native/reduce.cu +13 -14
  62. warp/native/runlength_encode.cpp +2 -2
  63. warp/native/runlength_encode.cu +5 -5
  64. warp/native/scan.cpp +3 -3
  65. warp/native/scan.cu +4 -4
  66. warp/native/sort.cpp +10 -10
  67. warp/native/sort.cu +40 -31
  68. warp/native/sort.h +2 -0
  69. warp/native/sparse.cpp +8 -8
  70. warp/native/sparse.cu +13 -13
  71. warp/native/spatial.h +366 -17
  72. warp/native/temp_buffer.h +2 -2
  73. warp/native/tile.h +471 -82
  74. warp/native/vec.h +328 -14
  75. warp/native/volume.cpp +54 -54
  76. warp/native/volume.cu +1 -1
  77. warp/native/volume.h +2 -1
  78. warp/native/volume_builder.cu +30 -37
  79. warp/native/warp.cpp +150 -149
  80. warp/native/warp.cu +377 -216
  81. warp/native/warp.h +227 -226
  82. warp/optim/linear.py +736 -271
  83. warp/render/imgui_manager.py +289 -0
  84. warp/render/render_opengl.py +99 -18
  85. warp/render/render_usd.py +1 -0
  86. warp/sim/graph_coloring.py +2 -2
  87. warp/sparse.py +558 -175
  88. warp/tests/aux_test_module_aot.py +7 -0
  89. warp/tests/cuda/test_async.py +3 -3
  90. warp/tests/cuda/test_conditional_captures.py +101 -0
  91. warp/tests/geometry/test_hash_grid.py +38 -0
  92. warp/tests/geometry/test_marching_cubes.py +233 -12
  93. warp/tests/interop/test_jax.py +608 -28
  94. warp/tests/sim/test_coloring.py +6 -6
  95. warp/tests/test_array.py +58 -5
  96. warp/tests/test_codegen.py +4 -3
  97. warp/tests/test_context.py +8 -15
  98. warp/tests/test_enum.py +136 -0
  99. warp/tests/test_examples.py +2 -2
  100. warp/tests/test_fem.py +49 -6
  101. warp/tests/test_fixedarray.py +229 -0
  102. warp/tests/test_func.py +18 -15
  103. warp/tests/test_future_annotations.py +7 -5
  104. warp/tests/test_linear_solvers.py +30 -0
  105. warp/tests/test_map.py +15 -1
  106. warp/tests/test_mat.py +1518 -378
  107. warp/tests/test_mat_assign_copy.py +178 -0
  108. warp/tests/test_mat_constructors.py +574 -0
  109. warp/tests/test_module_aot.py +287 -0
  110. warp/tests/test_print.py +69 -0
  111. warp/tests/test_quat.py +140 -34
  112. warp/tests/test_quat_assign_copy.py +145 -0
  113. warp/tests/test_reload.py +2 -1
  114. warp/tests/test_sparse.py +71 -0
  115. warp/tests/test_spatial.py +140 -34
  116. warp/tests/test_spatial_assign_copy.py +160 -0
  117. warp/tests/test_struct.py +43 -3
  118. warp/tests/test_tuple.py +96 -0
  119. warp/tests/test_types.py +61 -20
  120. warp/tests/test_vec.py +179 -34
  121. warp/tests/test_vec_assign_copy.py +143 -0
  122. warp/tests/tile/test_tile.py +245 -18
  123. warp/tests/tile/test_tile_cholesky.py +605 -0
  124. warp/tests/tile/test_tile_load.py +169 -0
  125. warp/tests/tile/test_tile_mathdx.py +2 -558
  126. warp/tests/tile/test_tile_matmul.py +1 -1
  127. warp/tests/tile/test_tile_mlp.py +1 -1
  128. warp/tests/tile/test_tile_shared_memory.py +5 -5
  129. warp/tests/unittest_suites.py +6 -0
  130. warp/tests/walkthrough_debug.py +1 -1
  131. warp/thirdparty/unittest_parallel.py +108 -9
  132. warp/types.py +571 -267
  133. warp/utils.py +68 -86
  134. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/METADATA +29 -69
  135. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/RECORD +138 -128
  136. warp/native/marching.cpp +0 -19
  137. warp/native/marching.cu +0 -514
  138. warp/native/marching.h +0 -19
  139. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/WHEEL +0 -0
  140. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/licenses/LICENSE.md +0 -0
  141. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/top_level.txt +0 -0
warp/native/bvh.cu CHANGED
@@ -155,7 +155,7 @@ void bvh_refit_device(BVH& bvh)
155
155
  ContextGuard guard(bvh.context);
156
156
 
157
157
  // clear child counters
158
- memset_device(WP_CURRENT_CONTEXT, bvh.node_counts, 0, sizeof(int) * bvh.max_nodes);
158
+ wp_memset_device(WP_CURRENT_CONTEXT, bvh.node_counts, 0, sizeof(int) * bvh.max_nodes);
159
159
  wp_launch_device(WP_CURRENT_CONTEXT, bvh_refit_kernel, bvh.num_leaf_nodes, (bvh.num_leaf_nodes, bvh.node_parents, bvh.node_counts, bvh.primitive_indices, bvh.node_lowers, bvh.node_uppers, bvh.item_lowers, bvh.item_uppers));
160
160
  }
161
161
 
@@ -474,16 +474,16 @@ LinearBVHBuilderGPU::LinearBVHBuilderGPU()
474
474
  , total_upper(NULL)
475
475
  , total_inv_edges(NULL)
476
476
  {
477
- total_lower = (vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
478
- total_upper = (vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
479
- total_inv_edges = (vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
477
+ total_lower = (vec3*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
478
+ total_upper = (vec3*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
479
+ total_inv_edges = (vec3*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
480
480
  }
481
481
 
482
482
  LinearBVHBuilderGPU::~LinearBVHBuilderGPU()
483
483
  {
484
- free_device(WP_CURRENT_CONTEXT, total_lower);
485
- free_device(WP_CURRENT_CONTEXT, total_upper);
486
- free_device(WP_CURRENT_CONTEXT, total_inv_edges);
484
+ wp_free_device(WP_CURRENT_CONTEXT, total_lower);
485
+ wp_free_device(WP_CURRENT_CONTEXT, total_upper);
486
+ wp_free_device(WP_CURRENT_CONTEXT, total_inv_edges);
487
487
  }
488
488
 
489
489
 
@@ -491,12 +491,12 @@ LinearBVHBuilderGPU::~LinearBVHBuilderGPU()
491
491
  void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* item_uppers, int num_items, bounds3* total_bounds)
492
492
  {
493
493
  // allocate temporary memory used during building
494
- indices = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2); // *2 for radix sort
495
- keys = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2); // *2 for radix sort
496
- deltas = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items); // highest differentiating bit between keys for item i and i+1
497
- range_lefts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
498
- range_rights = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
499
- num_children = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
494
+ indices = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2); // *2 for radix sort
495
+ keys = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2); // *2 for radix sort
496
+ deltas = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items); // highest differentiating bit between keys for item i and i+1
497
+ range_lefts = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
498
+ range_rights = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
499
+ num_children = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
500
500
 
501
501
  // if total bounds supplied by the host then we just
502
502
  // compute our edge length and upload it to the GPU directly
@@ -508,17 +508,17 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
508
508
 
509
509
  vec3 inv_edges = vec3(1.0f/edges[0], 1.0f/edges[1], 1.0f/edges[2]);
510
510
 
511
- memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &total_bounds->lower[0], sizeof(vec3));
512
- memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &total_bounds->upper[0], sizeof(vec3));
513
- memcpy_h2d(WP_CURRENT_CONTEXT, total_inv_edges, &inv_edges[0], sizeof(vec3));
511
+ wp_memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &total_bounds->lower[0], sizeof(vec3));
512
+ wp_memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &total_bounds->upper[0], sizeof(vec3));
513
+ wp_memcpy_h2d(WP_CURRENT_CONTEXT, total_inv_edges, &inv_edges[0], sizeof(vec3));
514
514
  }
515
515
  else
516
516
  {
517
517
  static vec3 upper(-FLT_MAX);
518
518
  static vec3 lower(FLT_MAX);
519
519
 
520
- memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &lower, sizeof(lower));
521
- memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &upper, sizeof(upper));
520
+ wp_memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &lower, sizeof(lower));
521
+ wp_memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &upper, sizeof(upper));
522
522
 
523
523
  // compute the total bounds on the GPU
524
524
  wp_launch_device(WP_CURRENT_CONTEXT, compute_total_bounds, num_items, (item_lowers, item_uppers, total_lower, total_upper, num_items));
@@ -532,7 +532,7 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
532
532
 
533
533
  // sort items based on Morton key (note the 32-bit sort key corresponds to the template parameter to morton3, i.e. 3x9 bit keys combined)
534
534
  radix_sort_pairs_device(WP_CURRENT_CONTEXT, keys, indices, num_items);
535
- memcpy_d2d(WP_CURRENT_CONTEXT, bvh.primitive_indices, indices, sizeof(int) * num_items);
535
+ wp_memcpy_d2d(WP_CURRENT_CONTEXT, bvh.primitive_indices, indices, sizeof(int) * num_items);
536
536
 
537
537
  // calculate deltas between adjacent keys
538
538
  wp_launch_device(WP_CURRENT_CONTEXT, compute_key_deltas, num_items, (keys, deltas, num_items-1));
@@ -541,20 +541,20 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
541
541
  wp_launch_device(WP_CURRENT_CONTEXT, build_leaves, num_items, (item_lowers, item_uppers, num_items, indices, range_lefts, range_rights, bvh.node_lowers, bvh.node_uppers));
542
542
 
543
543
  // reset children count, this is our atomic counter so we know when an internal node is complete, only used during building
544
- memset_device(WP_CURRENT_CONTEXT, num_children, 0, sizeof(int)*bvh.max_nodes);
544
+ wp_memset_device(WP_CURRENT_CONTEXT, num_children, 0, sizeof(int)*bvh.max_nodes);
545
545
 
546
546
  // build the tree and internal node bounds
547
547
  wp_launch_device(WP_CURRENT_CONTEXT, build_hierarchy, num_items, (num_items, bvh.root, deltas, num_children, bvh.primitive_indices, range_lefts, range_rights, bvh.node_parents, bvh.node_lowers, bvh.node_uppers));
548
548
  wp_launch_device(WP_CURRENT_CONTEXT, mark_packed_leaf_nodes, bvh.max_nodes, (bvh.max_nodes, range_lefts, range_rights, bvh.node_parents, bvh.node_lowers, bvh.node_uppers));
549
549
 
550
550
  // free temporary memory
551
- free_device(WP_CURRENT_CONTEXT, indices);
552
- free_device(WP_CURRENT_CONTEXT, keys);
553
- free_device(WP_CURRENT_CONTEXT, deltas);
551
+ wp_free_device(WP_CURRENT_CONTEXT, indices);
552
+ wp_free_device(WP_CURRENT_CONTEXT, keys);
553
+ wp_free_device(WP_CURRENT_CONTEXT, deltas);
554
554
 
555
- free_device(WP_CURRENT_CONTEXT, range_lefts);
556
- free_device(WP_CURRENT_CONTEXT, range_rights);
557
- free_device(WP_CURRENT_CONTEXT, num_children);
555
+ wp_free_device(WP_CURRENT_CONTEXT, range_lefts);
556
+ wp_free_device(WP_CURRENT_CONTEXT, range_rights);
557
+ wp_free_device(WP_CURRENT_CONTEXT, num_children);
558
558
 
559
559
  }
560
560
 
@@ -562,8 +562,8 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
562
562
  template<typename T>
563
563
  T* make_device_buffer_of(void* context, T* host_buffer, size_t buffer_size)
564
564
  {
565
- T* device_buffer = (T*)alloc_device(context, sizeof(T) * buffer_size);;
566
- memcpy_h2d(context, device_buffer, host_buffer, sizeof(T) * buffer_size);
565
+ T* device_buffer = (T*)wp_alloc_device(context, sizeof(T) * buffer_size);;
566
+ wp_memcpy_h2d(context, device_buffer, host_buffer, sizeof(T) * buffer_size);
567
567
 
568
568
  return device_buffer;
569
569
  }
@@ -662,8 +662,8 @@ void copy_host_tree_to_device(void* context, BVH& bvh_host, BVH& bvh_device_on_h
662
662
  bvh_device_on_host.num_items = bvh_host.num_items;
663
663
  bvh_device_on_host.max_depth = bvh_host.max_depth;
664
664
 
665
- bvh_device_on_host.root = (int*)alloc_device(context, sizeof(int));
666
- memcpy_h2d(context, bvh_device_on_host.root, bvh_host.root, sizeof(int));
665
+ bvh_device_on_host.root = (int*)wp_alloc_device(context, sizeof(int));
666
+ wp_memcpy_h2d(context, bvh_device_on_host.root, bvh_host.root, sizeof(int));
667
667
  bvh_device_on_host.context = context;
668
668
 
669
669
  bvh_device_on_host.node_lowers = make_device_buffer_of(context, bvh_host.node_lowers, bvh_host.max_nodes);
@@ -682,12 +682,12 @@ void bvh_create_device(void* context, vec3* lowers, vec3* uppers, int num_items,
682
682
  // copy bounds back to CPU
683
683
  std::vector<vec3> lowers_host(num_items);
684
684
  std::vector<vec3> uppers_host(num_items);
685
- memcpy_d2h(WP_CURRENT_CONTEXT, lowers_host.data(), lowers, sizeof(vec3) * num_items);
686
- memcpy_d2h(WP_CURRENT_CONTEXT, uppers_host.data(), uppers, sizeof(vec3) * num_items);
685
+ wp_memcpy_d2h(WP_CURRENT_CONTEXT, lowers_host.data(), lowers, sizeof(vec3) * num_items);
686
+ wp_memcpy_d2h(WP_CURRENT_CONTEXT, uppers_host.data(), uppers, sizeof(vec3) * num_items);
687
687
 
688
688
  // run CPU based constructor
689
689
  wp::BVH bvh_host;
690
- bvh_create_host(lowers_host.data(), uppers_host.data(), num_items, constructor_type, bvh_host);
690
+ wp::bvh_create_host(lowers_host.data(), uppers_host.data(), num_items, constructor_type, bvh_host);
691
691
 
692
692
  // copy host tree to device
693
693
  wp::copy_host_tree_to_device(WP_CURRENT_CONTEXT, bvh_host, bvh_device_on_host);
@@ -695,26 +695,26 @@ void bvh_create_device(void* context, vec3* lowers, vec3* uppers, int num_items,
695
695
  bvh_device_on_host.item_lowers = lowers;
696
696
  bvh_device_on_host.item_uppers = uppers;
697
697
  // node_counts is not allocated for host tree
698
- bvh_device_on_host.node_counts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
699
- bvh_destroy_host(bvh_host);
698
+ bvh_device_on_host.node_counts = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
699
+ wp::bvh_destroy_host(bvh_host);
700
700
  }
701
701
  else if (constructor_type == BVH_CONSTRUCTOR_LBVH)
702
702
  {
703
703
  bvh_device_on_host.num_items = num_items;
704
704
  bvh_device_on_host.max_nodes = 2 * num_items - 1;
705
705
  bvh_device_on_host.num_leaf_nodes = num_items;
706
- bvh_device_on_host.node_lowers = (BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
707
- memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_lowers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
708
- bvh_device_on_host.node_uppers = (BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
709
- memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_uppers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
710
- bvh_device_on_host.node_parents = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
711
- bvh_device_on_host.node_counts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
712
- bvh_device_on_host.root = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int));
713
- bvh_device_on_host.primitive_indices = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * num_items);
706
+ bvh_device_on_host.node_lowers = (BVHPackedNodeHalf*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
707
+ wp_memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_lowers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
708
+ bvh_device_on_host.node_uppers = (BVHPackedNodeHalf*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
709
+ wp_memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_uppers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
710
+ bvh_device_on_host.node_parents = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
711
+ bvh_device_on_host.node_counts = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
712
+ bvh_device_on_host.root = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int));
713
+ bvh_device_on_host.primitive_indices = (int*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * num_items);
714
714
  bvh_device_on_host.item_lowers = lowers;
715
715
  bvh_device_on_host.item_uppers = uppers;
716
716
 
717
- bvh_device_on_host.context = context ? context : cuda_context_get_current();
717
+ bvh_device_on_host.context = context ? context : wp_cuda_context_get_current();
718
718
 
719
719
  LinearBVHBuilderGPU builder;
720
720
  builder.build(bvh_device_on_host, lowers, uppers, num_items, NULL);
@@ -729,26 +729,26 @@ void bvh_destroy_device(BVH& bvh)
729
729
  {
730
730
  ContextGuard guard(bvh.context);
731
731
 
732
- free_device(WP_CURRENT_CONTEXT, bvh.node_lowers); bvh.node_lowers = NULL;
733
- free_device(WP_CURRENT_CONTEXT, bvh.node_uppers); bvh.node_uppers = NULL;
734
- free_device(WP_CURRENT_CONTEXT, bvh.node_parents); bvh.node_parents = NULL;
735
- free_device(WP_CURRENT_CONTEXT, bvh.node_counts); bvh.node_counts = NULL;
736
- free_device(WP_CURRENT_CONTEXT, bvh.primitive_indices); bvh.primitive_indices = NULL;
737
- free_device(WP_CURRENT_CONTEXT, bvh.root); bvh.root = NULL;
732
+ wp_free_device(WP_CURRENT_CONTEXT, bvh.node_lowers); bvh.node_lowers = NULL;
733
+ wp_free_device(WP_CURRENT_CONTEXT, bvh.node_uppers); bvh.node_uppers = NULL;
734
+ wp_free_device(WP_CURRENT_CONTEXT, bvh.node_parents); bvh.node_parents = NULL;
735
+ wp_free_device(WP_CURRENT_CONTEXT, bvh.node_counts); bvh.node_counts = NULL;
736
+ wp_free_device(WP_CURRENT_CONTEXT, bvh.primitive_indices); bvh.primitive_indices = NULL;
737
+ wp_free_device(WP_CURRENT_CONTEXT, bvh.root); bvh.root = NULL;
738
738
  }
739
739
 
740
740
 
741
741
  } // namespace wp
742
742
 
743
743
 
744
- void bvh_refit_device(uint64_t id)
744
+ void wp_bvh_refit_device(uint64_t id)
745
745
  {
746
746
  wp::BVH bvh;
747
747
  if (bvh_get_descriptor(id, bvh))
748
748
  {
749
749
  ContextGuard guard(bvh.context);
750
750
 
751
- bvh_refit_device(bvh);
751
+ wp::bvh_refit_device(bvh);
752
752
  }
753
753
  }
754
754
 
@@ -759,17 +759,17 @@ void bvh_refit_device(uint64_t id)
759
759
  * muted. However, the muted leaf nodes will still have the pointer to their parents, thus the up-tracing
760
760
  * can still work. We will only compute the bounding box of a leaf node if its parent is not a leaf node.
761
761
  */
762
- uint64_t bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, int num_items, int constructor_type)
762
+ uint64_t wp_bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, int num_items, int constructor_type)
763
763
  {
764
764
  ContextGuard guard(context);
765
765
  wp::BVH bvh_device_on_host;
766
766
  wp::BVH* bvh_device_ptr = nullptr;
767
767
 
768
- bvh_create_device(WP_CURRENT_CONTEXT, lowers, uppers, num_items, constructor_type, bvh_device_on_host);
768
+ wp::bvh_create_device(WP_CURRENT_CONTEXT, lowers, uppers, num_items, constructor_type, bvh_device_on_host);
769
769
 
770
770
  // create device-side BVH descriptor
771
- bvh_device_ptr = (wp::BVH*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::BVH));
772
- memcpy_h2d(WP_CURRENT_CONTEXT, bvh_device_ptr, &bvh_device_on_host, sizeof(wp::BVH));
771
+ bvh_device_ptr = (wp::BVH*)wp_alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::BVH));
772
+ wp_memcpy_h2d(WP_CURRENT_CONTEXT, bvh_device_ptr, &bvh_device_on_host, sizeof(wp::BVH));
773
773
 
774
774
  uint64_t bvh_id = (uint64_t)bvh_device_ptr;
775
775
  wp::bvh_add_descriptor(bvh_id, bvh_device_on_host);
@@ -777,7 +777,7 @@ uint64_t bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, in
777
777
  }
778
778
 
779
779
 
780
- void bvh_destroy_device(uint64_t id)
780
+ void wp_bvh_destroy_device(uint64_t id)
781
781
  {
782
782
  wp::BVH bvh;
783
783
  if (wp::bvh_get_descriptor(id, bvh))
@@ -786,6 +786,6 @@ void bvh_destroy_device(uint64_t id)
786
786
  wp::bvh_rem_descriptor(id);
787
787
 
788
788
  // free descriptor
789
- free_device(WP_CURRENT_CONTEXT, (void*)id);
789
+ wp_free_device(WP_CURRENT_CONTEXT, (void*)id);
790
790
  }
791
791
  }
warp/native/bvh.h CHANGED
@@ -357,7 +357,7 @@ CUDA_CALLABLE inline bvh_query_t bvh_query(
357
357
  BVHPackedNodeHalf node_lower = bvh_load_node(bvh.node_lowers, node_index);
358
358
  BVHPackedNodeHalf node_upper = bvh_load_node(bvh.node_uppers, node_index);
359
359
 
360
- if (!bvh_query_intersection_test(query, (vec3&)node_lower, (vec3&)node_upper))
360
+ if (!bvh_query_intersection_test(query, reinterpret_cast<vec3&>(node_lower), reinterpret_cast<vec3&>(node_upper)))
361
361
  {
362
362
  continue;
363
363
  }
@@ -464,7 +464,7 @@ CUDA_CALLABLE inline bool bvh_query_next(bvh_query_t& query, int& index)
464
464
  wp::vec3 upper_pos(node_upper.x, node_upper.y, node_upper.z);
465
465
  wp::bounds3 current_bounds(lower_pos, upper_pos);
466
466
 
467
- if (!bvh_query_intersection_test(query, (vec3&)node_lower, (vec3&)node_upper))
467
+ if (!bvh_query_intersection_test(query, reinterpret_cast<vec3&>(node_lower), reinterpret_cast<vec3&>(node_upper)))
468
468
  {
469
469
  continue;
470
470
  }
@@ -175,7 +175,7 @@ static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file,
175
175
 
176
176
  clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
177
177
  bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
178
- buffer.release();
178
+ (void)buffer.release();
179
179
 
180
180
  return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
181
181
  }
@@ -240,14 +240,14 @@ static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file,
240
240
 
241
241
  clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
242
242
  bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
243
- buffer.release();
243
+ (void)buffer.release();
244
244
 
245
245
  return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
246
246
  }
247
247
 
248
248
  extern "C" {
249
249
 
250
- WP_API int compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug, bool verify_fp, bool fuse_fp)
250
+ WP_API int wp_compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug, bool verify_fp, bool fuse_fp)
251
251
  {
252
252
  initialize_llvm();
253
253
 
@@ -294,7 +294,7 @@ WP_API int compile_cpp(const char* cpp_src, const char *input_file, const char*
294
294
  return 0;
295
295
  }
296
296
 
297
- WP_API int compile_cuda(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug)
297
+ WP_API int wp_compile_cuda(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug)
298
298
  {
299
299
  initialize_llvm();
300
300
 
@@ -355,7 +355,7 @@ WP_API int compile_cuda(const char* cpp_src, const char *input_file, const char*
355
355
  static llvm::orc::LLJIT* jit = nullptr;
356
356
 
357
357
  // Load an object file into an in-memory DLL named `module_name`
358
- WP_API int load_obj(const char* object_file, const char* module_name)
358
+ WP_API int wp_load_obj(const char* object_file, const char* module_name)
359
359
  {
360
360
  if(!jit)
361
361
  {
@@ -497,7 +497,7 @@ WP_API int load_obj(const char* object_file, const char* module_name)
497
497
  return 0;
498
498
  }
499
499
 
500
- WP_API int unload_obj(const char* module_name)
500
+ WP_API int wp_unload_obj(const char* module_name)
501
501
  {
502
502
  if(!jit) // If there's no JIT instance there are no object files loaded
503
503
  {
@@ -516,7 +516,7 @@ WP_API int unload_obj(const char* module_name)
516
516
  return 0;
517
517
  }
518
518
 
519
- WP_API uint64_t lookup(const char* dll_name, const char* function_name)
519
+ WP_API uint64_t wp_lookup(const char* dll_name, const char* function_name)
520
520
  {
521
521
  auto* dll = jit->getJITDylibByName(dll_name);
522
522
 
warp/native/coloring.cpp CHANGED
@@ -35,6 +35,7 @@
35
35
 
36
36
  #include "warp.h"
37
37
 
38
+ #include <climits>
38
39
  #include <iostream>
39
40
  #include <vector>
40
41
  #include <array>
@@ -338,9 +339,14 @@ public:
338
339
 
339
340
  int get_node_weight(int node_idx)
340
341
  {
342
+ if (node_idx < 0 || node_idx >= (int)node_weights.size()) {
343
+ fprintf(stderr, "The node_idx %d is out of range!\n", node_idx);
344
+ return INT_MIN;
345
+ }
341
346
  return node_weights[node_idx];
342
347
  }
343
348
 
349
+
344
350
  void add_node(int weight, int node_idx)
345
351
  {
346
352
  if (weight >= weight_buckets.size())
@@ -539,7 +545,7 @@ using namespace wp;
539
545
 
540
546
  extern "C"
541
547
  {
542
- int graph_coloring(int num_nodes, wp::array_t<int> edges, int algorithm, wp::array_t<int> node_colors)
548
+ int wp_graph_coloring(int num_nodes, wp::array_t<int> edges, int algorithm, wp::array_t<int> node_colors)
543
549
  {
544
550
  if (node_colors.ndim != 1 || node_colors.shape[0] != num_nodes)
545
551
  {
@@ -594,7 +600,7 @@ extern "C"
594
600
  return num_colors;
595
601
  }
596
602
 
597
- float balance_coloring(int num_nodes, wp::array_t<int> edges, int num_colors,
603
+ float wp_balance_coloring(int num_nodes, wp::array_t<int> edges, int num_colors,
598
604
  float target_max_min_ratio, wp::array_t<int> node_colors)
599
605
  {
600
606
  Graph graph(num_nodes, edges);
warp/native/crt.cpp CHANGED
@@ -41,11 +41,11 @@ extern "C" WP_API void _wp_assert(const char* expression, const char* file, unsi
41
41
  fflush(stdout);
42
42
  fprintf(stderr,
43
43
  "Assertion failed: '%s'\n"
44
- "At '%s:%d'\n",
44
+ "At '%s:%u'\n",
45
45
  expression, file, line);
46
46
  fflush(stderr);
47
47
 
48
48
  // Now invoke the standard assert(), which may abort the program or break
49
49
  // into the debugger as decided by the runtime environment.
50
- assert(false && "assert() failed");
50
+ assert(false && "assert() failed"); // cppcheck-suppress incorrectStringBooleanError
51
51
  }
warp/native/crt.h CHANGED
@@ -110,11 +110,9 @@ extern "C" WP_API int _wp_isinf(double);
110
110
  #define SCHAR_MIN (-128)
111
111
  #define SCHAR_MAX 127
112
112
  #define UCHAR_MAX 255
113
- enum {
114
- _JITIFY_CHAR_IS_UNSIGNED = (char)-1 >= 0,
115
- CHAR_MIN = _JITIFY_CHAR_IS_UNSIGNED ? 0 : SCHAR_MIN,
116
- CHAR_MAX = _JITIFY_CHAR_IS_UNSIGNED ? UCHAR_MAX : SCHAR_MAX,
117
- };
113
+ #define _JITIFY_CHAR_IS_UNSIGNED ((char)-1 >= 0)
114
+ #define CHAR_MIN (_JITIFY_CHAR_IS_UNSIGNED ? 0 : SCHAR_MIN)
115
+ #define CHAR_MAX (_JITIFY_CHAR_IS_UNSIGNED ? UCHAR_MAX : SCHAR_MAX)
118
116
  #define SHRT_MIN (-32768)
119
117
  #define SHRT_MAX 32767
120
118
  #define USHRT_MAX 65535
warp/native/cuda_util.cpp CHANGED
@@ -33,14 +33,14 @@
33
33
  #include <stack>
34
34
 
35
35
  // the minimum CUDA version required from the driver
36
- #define WP_CUDA_DRIVER_VERSION 11040
36
+ #define WP_CUDA_DRIVER_VERSION 12000
37
37
 
38
38
  // the minimum CUDA Toolkit version required to build Warp
39
- #define WP_CUDA_TOOLKIT_VERSION 11050
39
+ #define WP_CUDA_TOOLKIT_VERSION 12000
40
40
 
41
41
  // check if the CUDA Toolkit is too old
42
42
  #if CUDA_VERSION < WP_CUDA_TOOLKIT_VERSION
43
- #error Building Warp requires CUDA Toolkit version 11.5 or higher
43
+ #error Building Warp requires CUDA Toolkit version 12.0 or higher
44
44
  #endif
45
45
 
46
46
  // Avoid including <cudaGLTypedefs.h>, which requires OpenGL headers to be installed.
@@ -56,11 +56,12 @@ typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResou
56
56
 
57
57
  // function pointers to driver API entry points
58
58
  // these are explicitly versioned according to cudaTypedefs.h from CUDA Toolkit WP_CUDA_TOOLKIT_VERSION
59
- #if CUDA_VERSION < 12000
60
- static PFN_cuGetProcAddress_v11030 pfn_cuGetProcAddress;
61
- #else
62
- static PFN_cuGetProcAddress_v12000 pfn_cuGetProcAddress;
59
+
60
+ #if CUDA_VERSION >= 13000
61
+ #define PFN_cuGetProcAddress PFN_cuGetProcAddress_v12000
63
62
  #endif
63
+
64
+ static PFN_cuGetProcAddress_v12000 pfn_cuGetProcAddress;
64
65
  static PFN_cuDriverGetVersion_v2020 pfn_cuDriverGetVersion;
65
66
  static PFN_cuGetErrorName_v6000 pfn_cuGetErrorName;
66
67
  static PFN_cuGetErrorString_v6000 pfn_cuGetErrorString;
@@ -100,6 +101,12 @@ static PFN_cuEventQuery_v2000 pfn_cuEventQuery;
100
101
  static PFN_cuEventRecord_v2000 pfn_cuEventRecord;
101
102
  static PFN_cuEventRecordWithFlags_v11010 pfn_cuEventRecordWithFlags;
102
103
  static PFN_cuEventSynchronize_v2000 pfn_cuEventSynchronize;
104
+ #if CUDA_VERSION >= 12030
105
+ // function used to add conditional graph nodes, not available in older CUDA versions
106
+ static PFN_cuGraphAddNode_v12030 pfn_cuGraphAddNode;
107
+ #endif
108
+ static PFN_cuGraphNodeGetDependentNodes_v10000 pfn_cuGraphNodeGetDependentNodes;
109
+ static PFN_cuGraphNodeGetType_v10000 pfn_cuGraphNodeGetType;
103
110
  static PFN_cuModuleLoadDataEx_v2010 pfn_cuModuleLoadDataEx;
104
111
  static PFN_cuModuleUnload_v2000 pfn_cuModuleUnload;
105
112
  static PFN_cuModuleGetFunction_v2000 pfn_cuModuleGetFunction;
@@ -163,7 +170,7 @@ bool init_cuda_driver()
163
170
  #if defined(_WIN32)
164
171
  static HMODULE hCudaDriver = LoadLibraryA("nvcuda.dll");
165
172
  if (hCudaDriver == NULL) {
166
- fprintf(stderr, "Warp CUDA error: Could not open nvcuda.dll.\n");
173
+ fprintf(stderr, "Warp CUDA warning: Could not find or load the NVIDIA CUDA driver. Proceeding in CPU-only mode.\n");
167
174
  return false;
168
175
  }
169
176
  pfn_cuGetProcAddress = (PFN_cuGetProcAddress)GetProcAddress(hCudaDriver, "cuGetProcAddress");
@@ -173,7 +180,7 @@ bool init_cuda_driver()
173
180
  // WSL and possibly other systems might require the .1 suffix
174
181
  hCudaDriver = dlopen("libcuda.so.1", RTLD_NOW);
175
182
  if (hCudaDriver == NULL) {
176
- fprintf(stderr, "Warp CUDA error: Could not open libcuda.so.\n");
183
+ fprintf(stderr, "Warp CUDA warning: Could not find or load the NVIDIA CUDA driver. Proceeding in CPU-only mode.\n");
177
184
  return false;
178
185
  }
179
186
  }
@@ -243,6 +250,12 @@ bool init_cuda_driver()
243
250
  get_driver_entry_point("cuEventRecord", 2000, &(void*&)pfn_cuEventRecord);
244
251
  get_driver_entry_point("cuEventRecordWithFlags", 11010, &(void*&)pfn_cuEventRecordWithFlags);
245
252
  get_driver_entry_point("cuEventSynchronize", 2000, &(void*&)pfn_cuEventSynchronize);
253
+ #if CUDA_VERSION >= 12030
254
+ if (driver_version >= 12030)
255
+ get_driver_entry_point("cuGraphAddNode", 12030, &(void*&)pfn_cuGraphAddNode);
256
+ #endif
257
+ get_driver_entry_point("cuGraphNodeGetDependentNodes", 10000, &(void*&)pfn_cuGraphNodeGetDependentNodes);
258
+ get_driver_entry_point("cuGraphNodeGetType", 10000, &(void*&)pfn_cuGraphNodeGetType);
246
259
  get_driver_entry_point("cuModuleLoadDataEx", 2010, &(void*&)pfn_cuModuleLoadDataEx);
247
260
  get_driver_entry_point("cuModuleUnload", 2000, &(void*&)pfn_cuModuleUnload);
248
261
  get_driver_entry_point("cuModuleGetFunction", 2000, &(void*&)pfn_cuModuleGetFunction);
@@ -332,7 +345,8 @@ bool get_graph_leaf_nodes(cudaGraph_t graph, std::vector<cudaGraphNode_t>& leaf_
332
345
  for (cudaGraphNode_t node : nodes)
333
346
  {
334
347
  size_t dependent_count;
335
- if (!check_cuda(cudaGraphNodeGetDependentNodes(node, NULL, &dependent_count)))
348
+
349
+ if (!check_cu(cuGraphNodeGetDependentNodes_f(node, NULL, &dependent_count)))
336
350
  return false;
337
351
 
338
352
  if (dependent_count == 0)
@@ -553,6 +567,23 @@ CUresult cuEventSynchronize_f(CUevent event)
553
567
  return pfn_cuEventSynchronize ? pfn_cuEventSynchronize(event) : DRIVER_ENTRY_POINT_ERROR;
554
568
  }
555
569
 
570
+ #if CUDA_VERSION >= 12030
571
+ CUresult cuGraphAddNode_f(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUgraphNodeParams *nodeParams)
572
+ {
573
+ return pfn_cuGraphAddNode ? pfn_cuGraphAddNode(phGraphNode, hGraph, dependencies, dependencyData, numDependencies, nodeParams) : DRIVER_ENTRY_POINT_ERROR;
574
+ }
575
+ #endif
576
+
577
+ CUresult cuGraphNodeGetDependentNodes_f(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes)
578
+ {
579
+ return pfn_cuGraphNodeGetDependentNodes ? pfn_cuGraphNodeGetDependentNodes(hNode, dependentNodes, numDependentNodes) : DRIVER_ENTRY_POINT_ERROR;
580
+ }
581
+
582
+ CUresult cuGraphNodeGetType_f(CUgraphNode hNode, CUgraphNodeType* type)
583
+ {
584
+ return pfn_cuGraphNodeGetType ? pfn_cuGraphNodeGetType(hNode, type) : DRIVER_ENTRY_POINT_ERROR;
585
+ }
586
+
556
587
  CUresult cuModuleLoadDataEx_f(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues)
557
588
  {
558
589
  return pfn_cuModuleLoadDataEx ? pfn_cuModuleLoadDataEx(module, image, numOptions, options, optionValues) : DRIVER_ENTRY_POINT_ERROR;
warp/native/cuda_util.h CHANGED
@@ -38,19 +38,19 @@
38
38
  #define wp_launch_device(context, kernel, dim, args) { \
39
39
  if (dim) { \
40
40
  ContextGuard guard(context); \
41
- cudaStream_t stream = (cudaStream_t)cuda_stream_get_current(); \
41
+ cudaStream_t stream = (cudaStream_t)wp_cuda_stream_get_current(); \
42
42
  const int num_threads = 256; \
43
43
  const int num_blocks = (dim+num_threads-1)/num_threads; \
44
44
  begin_cuda_range(WP_TIMING_KERNEL_BUILTIN, stream, context, #kernel); \
45
45
  kernel<<<num_blocks, 256, 0, stream>>>args; \
46
- check_cuda(cuda_context_check(WP_CURRENT_CONTEXT)); \
46
+ check_cuda(wp_cuda_context_check(WP_CURRENT_CONTEXT)); \
47
47
  end_cuda_range(WP_TIMING_KERNEL_BUILTIN, stream); }}
48
48
  #else
49
49
  // helper for launching kernels (no error checking)
50
50
  #define wp_launch_device(context, kernel, dim, args) { \
51
51
  if (dim) { \
52
52
  ContextGuard guard(context); \
53
- cudaStream_t stream = (cudaStream_t)cuda_stream_get_current(); \
53
+ cudaStream_t stream = (cudaStream_t)wp_cuda_stream_get_current(); \
54
54
  const int num_threads = 256; \
55
55
  const int num_blocks = (dim+num_threads-1)/num_threads; \
56
56
  begin_cuda_range(WP_TIMING_KERNEL_BUILTIN, stream, context, #kernel); \
@@ -99,6 +99,12 @@ CUresult cuEventQuery_f(CUevent event);
99
99
  CUresult cuEventRecord_f(CUevent event, CUstream stream);
100
100
  CUresult cuEventRecordWithFlags_f(CUevent event, CUstream stream, unsigned int flags);
101
101
  CUresult cuEventSynchronize_f(CUevent event);
102
+ #if CUDA_VERSION >= 12030
103
+ // function used to add conditional graph nodes, not available in older CUDA versions
104
+ CUresult cuGraphAddNode_f(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUgraphNodeParams *nodeParams);
105
+ #endif
106
+ CUresult cuGraphNodeGetDependentNodes_f(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
107
+ CUresult cuGraphNodeGetType_f(CUgraphNode hNode, CUgraphNodeType* type);
102
108
  CUresult cuModuleUnload_f(CUmodule hmod);
103
109
  CUresult cuModuleLoadDataEx_f(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
104
110
  CUresult cuModuleGetFunction_f(CUfunction *hfunc, CUmodule hmod, const char *name);
@@ -255,7 +261,7 @@ constexpr int WP_TIMING_GRAPH = 16; // graph launch
255
261
  #define begin_cuda_range(_flag, _stream, _context, _name) \
256
262
  CudaTimingRange _timing_range; \
257
263
  bool _timing_enabled; \
258
- if ((g_cuda_timing_state->flags & _flag) && !cuda_stream_is_capturing(_stream)) { \
264
+ if ((g_cuda_timing_state->flags & _flag) && !wp_cuda_stream_is_capturing(_stream)) { \
259
265
  ContextGuard guard(_context, true); \
260
266
  _timing_enabled = true; \
261
267
  _timing_range.context = _context ? _context : get_current_context(); \