warp_lang-1.8.0-py3-none-macosx_10_13_universal2.whl → warp_lang-1.9.0-py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (153)
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +482 -110
  3. warp/bin/libwarp-clang.dylib +0 -0
  4. warp/bin/libwarp.dylib +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +48 -63
  7. warp/builtins.py +955 -137
  8. warp/codegen.py +327 -209
  9. warp/config.py +1 -1
  10. warp/context.py +1363 -800
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/examples/interop/example_jax_callable.py +34 -4
  18. warp/examples/interop/example_jax_kernel.py +27 -1
  19. warp/fabric.py +1 -1
  20. warp/fem/cache.py +27 -19
  21. warp/fem/domain.py +2 -2
  22. warp/fem/field/nodal_field.py +2 -2
  23. warp/fem/field/virtual.py +266 -166
  24. warp/fem/geometry/geometry.py +5 -5
  25. warp/fem/integrate.py +200 -91
  26. warp/fem/space/restriction.py +4 -0
  27. warp/fem/space/shape/tet_shape_function.py +3 -10
  28. warp/jax_experimental/custom_call.py +1 -1
  29. warp/jax_experimental/ffi.py +203 -54
  30. warp/marching_cubes.py +708 -0
  31. warp/native/array.h +103 -8
  32. warp/native/builtin.h +90 -9
  33. warp/native/bvh.cpp +64 -28
  34. warp/native/bvh.cu +58 -58
  35. warp/native/bvh.h +2 -2
  36. warp/native/clang/clang.cpp +7 -7
  37. warp/native/coloring.cpp +13 -3
  38. warp/native/crt.cpp +2 -2
  39. warp/native/crt.h +3 -5
  40. warp/native/cuda_util.cpp +42 -11
  41. warp/native/cuda_util.h +10 -4
  42. warp/native/exports.h +1842 -1908
  43. warp/native/fabric.h +2 -1
  44. warp/native/hashgrid.cpp +37 -37
  45. warp/native/hashgrid.cu +2 -2
  46. warp/native/initializer_array.h +1 -1
  47. warp/native/intersect.h +4 -4
  48. warp/native/mat.h +1913 -119
  49. warp/native/mathdx.cpp +43 -43
  50. warp/native/mesh.cpp +24 -24
  51. warp/native/mesh.cu +26 -26
  52. warp/native/mesh.h +5 -3
  53. warp/native/nanovdb/GridHandle.h +179 -12
  54. warp/native/nanovdb/HostBuffer.h +8 -7
  55. warp/native/nanovdb/NanoVDB.h +517 -895
  56. warp/native/nanovdb/NodeManager.h +323 -0
  57. warp/native/nanovdb/PNanoVDB.h +2 -2
  58. warp/native/quat.h +337 -16
  59. warp/native/rand.h +7 -7
  60. warp/native/range.h +7 -1
  61. warp/native/reduce.cpp +10 -10
  62. warp/native/reduce.cu +13 -14
  63. warp/native/runlength_encode.cpp +2 -2
  64. warp/native/runlength_encode.cu +5 -5
  65. warp/native/scan.cpp +3 -3
  66. warp/native/scan.cu +4 -4
  67. warp/native/sort.cpp +10 -10
  68. warp/native/sort.cu +22 -22
  69. warp/native/sparse.cpp +8 -8
  70. warp/native/sparse.cu +14 -14
  71. warp/native/spatial.h +366 -17
  72. warp/native/svd.h +23 -8
  73. warp/native/temp_buffer.h +2 -2
  74. warp/native/tile.h +303 -70
  75. warp/native/tile_radix_sort.h +5 -1
  76. warp/native/tile_reduce.h +16 -25
  77. warp/native/tuple.h +2 -2
  78. warp/native/vec.h +385 -18
  79. warp/native/volume.cpp +54 -54
  80. warp/native/volume.cu +1 -1
  81. warp/native/volume.h +2 -1
  82. warp/native/volume_builder.cu +30 -37
  83. warp/native/warp.cpp +150 -149
  84. warp/native/warp.cu +337 -193
  85. warp/native/warp.h +227 -226
  86. warp/optim/linear.py +736 -271
  87. warp/render/imgui_manager.py +289 -0
  88. warp/render/render_opengl.py +137 -57
  89. warp/render/render_usd.py +0 -1
  90. warp/sim/collide.py +1 -2
  91. warp/sim/graph_coloring.py +2 -2
  92. warp/sim/integrator_vbd.py +10 -2
  93. warp/sparse.py +559 -176
  94. warp/tape.py +2 -0
  95. warp/tests/aux_test_module_aot.py +7 -0
  96. warp/tests/cuda/test_async.py +3 -3
  97. warp/tests/cuda/test_conditional_captures.py +101 -0
  98. warp/tests/geometry/test_marching_cubes.py +233 -12
  99. warp/tests/sim/test_cloth.py +89 -6
  100. warp/tests/sim/test_coloring.py +82 -7
  101. warp/tests/test_array.py +56 -5
  102. warp/tests/test_assert.py +53 -0
  103. warp/tests/test_atomic_cas.py +127 -114
  104. warp/tests/test_codegen.py +3 -2
  105. warp/tests/test_context.py +8 -15
  106. warp/tests/test_enum.py +136 -0
  107. warp/tests/test_examples.py +2 -2
  108. warp/tests/test_fem.py +45 -2
  109. warp/tests/test_fixedarray.py +229 -0
  110. warp/tests/test_func.py +18 -15
  111. warp/tests/test_future_annotations.py +7 -5
  112. warp/tests/test_linear_solvers.py +30 -0
  113. warp/tests/test_map.py +1 -1
  114. warp/tests/test_mat.py +1540 -378
  115. warp/tests/test_mat_assign_copy.py +178 -0
  116. warp/tests/test_mat_constructors.py +574 -0
  117. warp/tests/test_module_aot.py +287 -0
  118. warp/tests/test_print.py +69 -0
  119. warp/tests/test_quat.py +162 -34
  120. warp/tests/test_quat_assign_copy.py +145 -0
  121. warp/tests/test_reload.py +2 -1
  122. warp/tests/test_sparse.py +103 -0
  123. warp/tests/test_spatial.py +140 -34
  124. warp/tests/test_spatial_assign_copy.py +160 -0
  125. warp/tests/test_static.py +48 -0
  126. warp/tests/test_struct.py +43 -3
  127. warp/tests/test_tape.py +38 -0
  128. warp/tests/test_types.py +0 -20
  129. warp/tests/test_vec.py +216 -441
  130. warp/tests/test_vec_assign_copy.py +143 -0
  131. warp/tests/test_vec_constructors.py +325 -0
  132. warp/tests/tile/test_tile.py +206 -152
  133. warp/tests/tile/test_tile_cholesky.py +605 -0
  134. warp/tests/tile/test_tile_load.py +169 -0
  135. warp/tests/tile/test_tile_mathdx.py +2 -558
  136. warp/tests/tile/test_tile_matmul.py +179 -0
  137. warp/tests/tile/test_tile_mlp.py +1 -1
  138. warp/tests/tile/test_tile_reduce.py +100 -11
  139. warp/tests/tile/test_tile_shared_memory.py +16 -16
  140. warp/tests/tile/test_tile_sort.py +59 -55
  141. warp/tests/unittest_suites.py +16 -0
  142. warp/tests/walkthrough_debug.py +1 -1
  143. warp/thirdparty/unittest_parallel.py +108 -9
  144. warp/types.py +554 -264
  145. warp/utils.py +68 -86
  146. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
  147. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/RECORD +150 -138
  148. warp/native/marching.cpp +0 -19
  149. warp/native/marching.cu +0 -514
  150. warp/native/marching.h +0 -19
  151. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
  152. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
  153. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
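
The dominant change in the native sources below is a wholesale symbol rename: every entry point exported by libwarp gains a wp_ prefix in 1.9.0 (cuda_driver_version() becomes wp_cuda_driver_version(), alloc_device() becomes wp_alloc_device(), and so on), which is why exports.h and warp.h account for roughly two thousand changed lines each. As a minimal sketch of what the rename means for code that loads the library directly — the loader below is illustrative and not part of the package; only the symbol names are taken from this diff:

    #include <dlfcn.h>
    #include <cstdio>

    int main()
    {
        // the wheel ships the library as warp/bin/libwarp.dylib; the path is an assumption
        void* lib = dlopen("warp/bin/libwarp.dylib", RTLD_NOW);
        if (!lib)
            return 1;

        // 1.8.0 exported "cuda_driver_version"; 1.9.0 exports "wp_cuda_driver_version"
        typedef int (*version_fn)();
        version_fn get_driver_version = (version_fn)dlsym(lib, "wp_cuda_driver_version");
        if (get_driver_version)
            printf("CUDA driver version: %d\n", get_driver_version());

        dlclose(lib);
        return 0;
    }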
warp/native/warp.cu CHANGED
@@ -168,7 +168,7 @@ struct ContextInfo
 {
     DeviceInfo* device_info = NULL;

-    // the current stream, managed from Python (see cuda_context_set_stream() and cuda_context_get_stream())
+    // the current stream, managed from Python (see wp_cuda_context_set_stream() and wp_cuda_context_get_stream())
     CUstream stream = NULL;

     // conditional graph node support, loaded on demand if the driver supports it (CUDA 12.4+)
@@ -237,11 +237,11 @@ static std::unordered_map<CUstream, StreamInfo> g_streams;

 // Ongoing graph captures registered using wp.capture_begin().
 // This maps the capture id to the stream where capture was started.
-// See cuda_graph_begin_capture(), cuda_graph_end_capture(), and free_device_async().
+// See wp_cuda_graph_begin_capture(), wp_cuda_graph_end_capture(), and wp_free_device_async().
 static std::unordered_map<uint64_t, CaptureInfo*> g_captures;

 // Memory allocated during graph capture requires special handling.
-// See alloc_device_async() and free_device_async().
+// See wp_alloc_device_async() and wp_free_device_async().
 static std::unordered_map<void*, GraphAllocInfo> g_graph_allocs;

 // Memory that cannot be freed immediately gets queued here.
@@ -252,12 +252,12 @@ static std::vector<FreeInfo> g_deferred_free_list;
 // Call unload_deferred_modules() to release.
 static std::vector<ModuleInfo> g_deferred_module_list;

-void cuda_set_context_restore_policy(bool always_restore)
+void wp_cuda_set_context_restore_policy(bool always_restore)
 {
     ContextGuard::always_restore = always_restore;
 }

-int cuda_get_context_restore_policy()
+int wp_cuda_get_context_restore_policy()
 {
     return int(ContextGuard::always_restore);
 }
@@ -309,7 +309,13 @@ int cuda_init()
         check_cu(cuDeviceGetAttribute_f(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
         check_cu(cuDeviceGetAttribute_f(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
         g_devices[i].arch = 10 * major + minor;
-
+#ifdef CUDA_VERSION
+#if CUDA_VERSION < 13000
+        if (g_devices[i].arch == 110) {
+            g_devices[i].arch = 101; // Thor SM change
+        }
+#endif
+#endif
         g_device_map[device] = &g_devices[i];
     }
     else
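
Warp encodes a device's compute capability as 10 * major + minor, so Thor-class devices that report SM 11.0 under CUDA 13 show up as arch 110; the new block above maps that back to 101 when the library was built against a pre-13.0 toolkit, presumably because older toolkits know the same silicon by its SM 10.1 designation. An illustrative restatement of the encoding (nothing here beyond what the hunk shows):

    // mirrors the arch handling in cuda_init() above; illustrative only
    int encode_arch(int major, int minor)
    {
        return 10 * major + minor;
    }
    // encode_arch(11, 0) == 110; when CUDA_VERSION < 13000 the code above stores 101
    // instead, so later compilation targets use the pre-CUDA-13 Thor SM numbering.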
@@ -342,7 +348,7 @@ static inline CUcontext get_current_context()

 static inline CUstream get_current_stream(void* context=NULL)
 {
-    return static_cast<CUstream>(cuda_context_get_stream(context));
+    return static_cast<CUstream>(wp_cuda_context_get_stream(context));
 }

 static ContextInfo* get_context_info(CUcontext ctx)
@@ -475,7 +481,7 @@ static int unload_deferred_modules(void* context = NULL)
         const ModuleInfo& module_info = *it;
         if (module_info.context == context || !context)
         {
-            cuda_unload_module(module_info.context, module_info.module);
+            wp_cuda_unload_module(module_info.context, module_info.module);
             ++num_unloaded_modules;
             it = g_deferred_module_list.erase(it);
         }
@@ -529,41 +535,41 @@ static inline const char* get_cuda_kernel_name(void* kernel)
 }


-void* alloc_pinned(size_t s)
+void* wp_alloc_pinned(size_t s)
 {
     void* ptr = NULL;
     check_cuda(cudaMallocHost(&ptr, s));
     return ptr;
 }

-void free_pinned(void* ptr)
+void wp_free_pinned(void* ptr)
 {
     cudaFreeHost(ptr);
 }

-void* alloc_device(void* context, size_t s)
+void* wp_alloc_device(void* context, size_t s)
 {
-    int ordinal = cuda_context_get_device_ordinal(context);
+    int ordinal = wp_cuda_context_get_device_ordinal(context);

     // use stream-ordered allocator if available
-    if (cuda_device_is_mempool_supported(ordinal))
-        return alloc_device_async(context, s);
+    if (wp_cuda_device_is_mempool_supported(ordinal))
+        return wp_alloc_device_async(context, s);
     else
-        return alloc_device_default(context, s);
+        return wp_alloc_device_default(context, s);
 }

-void free_device(void* context, void* ptr)
+void wp_free_device(void* context, void* ptr)
 {
-    int ordinal = cuda_context_get_device_ordinal(context);
+    int ordinal = wp_cuda_context_get_device_ordinal(context);

     // use stream-ordered allocator if available
-    if (cuda_device_is_mempool_supported(ordinal))
-        free_device_async(context, ptr);
+    if (wp_cuda_device_is_mempool_supported(ordinal))
+        wp_free_device_async(context, ptr);
     else
-        free_device_default(context, ptr);
+        wp_free_device_default(context, ptr);
 }

-void* alloc_device_default(void* context, size_t s)
+void* wp_alloc_device_default(void* context, size_t s)
 {
     ContextGuard guard(context);

@@ -573,7 +579,7 @@ void* alloc_device_default(void* context, size_t s)
     return ptr;
 }

-void free_device_default(void* context, void* ptr)
+void wp_free_device_default(void* context, void* ptr)
 {
     ContextGuard guard(context);

@@ -589,7 +595,7 @@ void free_device_default(void* context, void* ptr)
     }
 }

-void* alloc_device_async(void* context, size_t s)
+void* wp_alloc_device_async(void* context, size_t s)
 {
     // stream-ordered allocations don't rely on the current context,
     // but we set the context here for consistent behaviour
@@ -607,7 +613,7 @@ void* alloc_device_async(void* context, size_t s)
     if (ptr)
     {
         // if the stream is capturing, the allocation requires special handling
-        if (cuda_stream_is_capturing(stream))
+        if (wp_cuda_stream_is_capturing(stream))
         {
             // check if this is a known capture
             uint64_t capture_id = get_capture_id(stream);
@@ -628,7 +634,7 @@ void* alloc_device_async(void* context, size_t s)
     return ptr;
 }

-void free_device_async(void* context, void* ptr)
+void wp_free_device_async(void* context, void* ptr)
 {
     // stream-ordered allocators generally don't rely on the current context,
     // but we set the context here for consistent behaviour
@@ -726,7 +732,7 @@ void free_device_async(void* context, void* ptr)
     }
 }

-bool memcpy_h2d(void* context, void* dest, void* src, size_t n, void* stream)
+bool wp_memcpy_h2d(void* context, void* dest, void* src, size_t n, void* stream)
 {
     ContextGuard guard(context);

@@ -745,7 +751,7 @@ bool memcpy_h2d(void* context, void* dest, void* src, size_t n, void* stream)
     return result;
 }

-bool memcpy_d2h(void* context, void* dest, void* src, size_t n, void* stream)
+bool wp_memcpy_d2h(void* context, void* dest, void* src, size_t n, void* stream)
 {
     ContextGuard guard(context);

@@ -764,7 +770,7 @@ bool memcpy_d2h(void* context, void* dest, void* src, size_t n, void* stream)
     return result;
 }

-bool memcpy_d2d(void* context, void* dest, void* src, size_t n, void* stream)
+bool wp_memcpy_d2d(void* context, void* dest, void* src, size_t n, void* stream)
 {
     ContextGuard guard(context);

@@ -783,7 +789,7 @@ bool memcpy_d2d(void* context, void* dest, void* src, size_t n, void* stream)
     return result;
 }

-bool memcpy_p2p(void* dst_context, void* dst, void* src_context, void* src, size_t n, void* stream)
+bool wp_memcpy_p2p(void* dst_context, void* dst, void* src_context, void* src, size_t n, void* stream)
 {
     // ContextGuard guard(context);

@@ -803,7 +809,7 @@ bool memcpy_p2p(void* dst_context, void* dst, void* src_context, void* src, size
     // because cudaMemPoolGetAccess() cannot be called during graph capture.
     // - CUDA will report error 1 (invalid argument) if cudaMemcpyAsync() is called but mempool access is not enabled.

-    if (!cuda_stream_is_capturing(stream))
+    if (!wp_cuda_stream_is_capturing(stream))
     {
         begin_cuda_range(WP_TIMING_MEMCPY, cuda_stream, get_stream_context(stream), "memcpy PtoP");

@@ -890,7 +896,7 @@ __global__ void memset_kernel(int* dest, int value, size_t n)
     }
 }

-void memset_device(void* context, void* dest, int value, size_t n)
+void wp_memset_device(void* context, void* dest, int value, size_t n)
 {
     ContextGuard guard(context);

@@ -934,7 +940,7 @@ __global__ void memtile_value_kernel(T* dst, T value, size_t n)
     }
 }

-void memtile_device(void* context, void* dst, const void* src, size_t srcsize, size_t n)
+void wp_memtile_device(void* context, void* dst, const void* src, size_t srcsize, size_t n)
 {
     ContextGuard guard(context);

@@ -970,12 +976,12 @@ void memtile_device(void* context, void* dst, const void* src, size_t srcsize, s

         // copy value to device memory
         // TODO: use a persistent stream-local staging buffer to avoid allocs?
-        void* src_devptr = alloc_device(WP_CURRENT_CONTEXT, srcsize);
+        void* src_devptr = wp_alloc_device(WP_CURRENT_CONTEXT, srcsize);
         check_cuda(cudaMemcpyAsync(src_devptr, src, srcsize, cudaMemcpyHostToDevice, get_current_stream()));

         wp_launch_device(WP_CURRENT_CONTEXT, memtile_kernel, n, (dst, src_devptr, srcsize, n));

-        free_device(WP_CURRENT_CONTEXT, src_devptr);
+        wp_free_device(WP_CURRENT_CONTEXT, src_devptr);

     }
 }
@@ -1202,7 +1208,7 @@ static __global__ void array_copy_fabric_indexed_to_fabric_indexed_kernel(wp::in
 }


-WP_API bool array_copy_device(void* context, void* dst, void* src, int dst_type, int src_type, int elem_size)
+WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_type, int src_type, int elem_size)
 {
     if (!src || !dst)
         return false;
@@ -1594,7 +1600,7 @@ static __global__ void array_fill_fabric_indexed_kernel(wp::indexedfabricarray_t
 }


-WP_API void array_fill_device(void* context, void* arr_ptr, int arr_type, const void* value_ptr, int value_size)
+WP_API void wp_array_fill_device(void* context, void* arr_ptr, int arr_type, const void* value_ptr, int value_size)
 {
     if (!arr_ptr || !value_ptr)
         return;
@@ -1650,7 +1656,7 @@ WP_API void array_fill_device(void* context, void* arr_ptr, int arr_type, const

     // copy value to device memory
     // TODO: use a persistent stream-local staging buffer to avoid allocs?
-    void* value_devptr = alloc_device(WP_CURRENT_CONTEXT, value_size);
+    void* value_devptr = wp_alloc_device(WP_CURRENT_CONTEXT, value_size);
     check_cuda(cudaMemcpyAsync(value_devptr, value_ptr, value_size, cudaMemcpyHostToDevice, get_current_stream()));

     // handle fabric arrays
@@ -1708,20 +1714,20 @@ WP_API void array_fill_device(void* context, void* arr_ptr, int arr_type, const
         return;
     }

-    free_device(WP_CURRENT_CONTEXT, value_devptr);
+    wp_free_device(WP_CURRENT_CONTEXT, value_devptr);
 }

-void array_scan_int_device(uint64_t in, uint64_t out, int len, bool inclusive)
+void wp_array_scan_int_device(uint64_t in, uint64_t out, int len, bool inclusive)
 {
     scan_device((const int*)in, (int*)out, len, inclusive);
 }

-void array_scan_float_device(uint64_t in, uint64_t out, int len, bool inclusive)
+void wp_array_scan_float_device(uint64_t in, uint64_t out, int len, bool inclusive)
 {
     scan_device((const float*)in, (float*)out, len, inclusive);
 }

-int cuda_driver_version()
+int wp_cuda_driver_version()
 {
     int version;
     if (check_cu(cuDriverGetVersion_f(&version)))
@@ -1730,17 +1736,17 @@ int cuda_driver_version()
     return 0;
 }

-int cuda_toolkit_version()
+int wp_cuda_toolkit_version()
 {
     return CUDA_VERSION;
 }

-bool cuda_driver_is_initialized()
+bool wp_cuda_driver_is_initialized()
 {
     return is_cuda_driver_initialized();
 }

-int nvrtc_supported_arch_count()
+int wp_nvrtc_supported_arch_count()
 {
     int count;
     if (check_nvrtc(nvrtcGetNumSupportedArchs(&count)))
@@ -1749,7 +1755,7 @@ int nvrtc_supported_arch_count()
     return 0;
 }

-void nvrtc_supported_archs(int* archs)
+void wp_nvrtc_supported_archs(int* archs)
 {
     if (archs)
     {
@@ -1757,14 +1763,14 @@ void nvrtc_supported_archs(int* archs)
     }
 }

-int cuda_device_get_count()
+int wp_cuda_device_get_count()
 {
     int count = 0;
     check_cu(cuDeviceGetCount_f(&count));
     return count;
 }

-void* cuda_device_get_primary_context(int ordinal)
+void* wp_cuda_device_get_primary_context(int ordinal)
 {
     if (ordinal >= 0 && ordinal < int(g_devices.size()))
     {
@@ -1780,75 +1786,75 @@ void* cuda_device_get_primary_context(int ordinal)
     return NULL;
 }

-const char* cuda_device_get_name(int ordinal)
+const char* wp_cuda_device_get_name(int ordinal)
 {
     if (ordinal >= 0 && ordinal < int(g_devices.size()))
         return g_devices[ordinal].name;
     return NULL;
 }

-int cuda_device_get_arch(int ordinal)
+int wp_cuda_device_get_arch(int ordinal)
 {
     if (ordinal >= 0 && ordinal < int(g_devices.size()))
         return g_devices[ordinal].arch;
     return 0;
 }

-int cuda_device_get_sm_count(int ordinal)
+int wp_cuda_device_get_sm_count(int ordinal)
 {
     if (ordinal >= 0 && ordinal < int(g_devices.size()))
         return g_devices[ordinal].sm_count;
     return 0;
 }

-void cuda_device_get_uuid(int ordinal, char uuid[16])
+void wp_cuda_device_get_uuid(int ordinal, char uuid[16])
 {
     memcpy(uuid, g_devices[ordinal].uuid.bytes, sizeof(char)*16);
 }

-int cuda_device_get_pci_domain_id(int ordinal)
+int wp_cuda_device_get_pci_domain_id(int ordinal)
 {
     if (ordinal >= 0 && ordinal < int(g_devices.size()))
         return g_devices[ordinal].pci_domain_id;
     return -1;
 }

-int cuda_device_get_pci_bus_id(int ordinal)
+int wp_cuda_device_get_pci_bus_id(int ordinal)
 {
     if (ordinal >= 0 && ordinal < int(g_devices.size()))
         return g_devices[ordinal].pci_bus_id;
     return -1;
 }

-int cuda_device_get_pci_device_id(int ordinal)
+int wp_cuda_device_get_pci_device_id(int ordinal)
 {
     if (ordinal >= 0 && ordinal < int(g_devices.size()))
         return g_devices[ordinal].pci_device_id;
     return -1;
 }

-int cuda_device_is_uva(int ordinal)
+int wp_cuda_device_is_uva(int ordinal)
 {
     if (ordinal >= 0 && ordinal < int(g_devices.size()))
         return g_devices[ordinal].is_uva;
     return 0;
 }

-int cuda_device_is_mempool_supported(int ordinal)
+int wp_cuda_device_is_mempool_supported(int ordinal)
 {
     if (ordinal >= 0 && ordinal < int(g_devices.size()))
         return g_devices[ordinal].is_mempool_supported;
     return 0;
 }

-int cuda_device_is_ipc_supported(int ordinal)
+int wp_cuda_device_is_ipc_supported(int ordinal)
 {
     if (ordinal >= 0 && ordinal < int(g_devices.size()))
         return g_devices[ordinal].is_ipc_supported;
     return 0;
 }

-int cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold)
+int wp_cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold)
 {
     if (ordinal < 0 || ordinal > int(g_devices.size()))
     {
@@ -1875,7 +1881,7 @@ int cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold)
     return 1; // success
 }

-uint64_t cuda_device_get_mempool_release_threshold(int ordinal)
+uint64_t wp_cuda_device_get_mempool_release_threshold(int ordinal)
 {
     if (ordinal < 0 || ordinal > int(g_devices.size()))
     {
@@ -1903,7 +1909,7 @@ uint64_t cuda_device_get_mempool_release_threshold(int ordinal)
     return threshold;
 }

-uint64_t cuda_device_get_mempool_used_mem_current(int ordinal)
+uint64_t wp_cuda_device_get_mempool_used_mem_current(int ordinal)
 {
     if (ordinal < 0 || ordinal > int(g_devices.size()))
     {
@@ -1931,7 +1937,7 @@ uint64_t cuda_device_get_mempool_used_mem_current(int ordinal)
     return mem_used;
 }

-uint64_t cuda_device_get_mempool_used_mem_high(int ordinal)
+uint64_t wp_cuda_device_get_mempool_used_mem_high(int ordinal)
 {
     if (ordinal < 0 || ordinal > int(g_devices.size()))
     {
@@ -1959,7 +1965,7 @@ uint64_t cuda_device_get_mempool_used_mem_high(int ordinal)
     return mem_high_water_mark;
 }

-void cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_mem)
+void wp_cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_mem)
 {
     // use temporary storage if user didn't specify pointers
     size_t tmp_free_mem, tmp_total_mem;
@@ -1996,12 +2002,12 @@ void cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_me
 }


-void* cuda_context_get_current()
+void* wp_cuda_context_get_current()
 {
     return get_current_context();
 }

-void cuda_context_set_current(void* context)
+void wp_cuda_context_set_current(void* context)
 {
     CUcontext ctx = static_cast<CUcontext>(context);
     CUcontext prev_ctx = NULL;
@@ -2012,18 +2018,18 @@ void cuda_context_set_current(void* context)
     }
 }

-void cuda_context_push_current(void* context)
+void wp_cuda_context_push_current(void* context)
 {
     check_cu(cuCtxPushCurrent_f(static_cast<CUcontext>(context)));
 }

-void cuda_context_pop_current()
+void wp_cuda_context_pop_current()
 {
     CUcontext context;
     check_cu(cuCtxPopCurrent_f(&context));
 }

-void* cuda_context_create(int device_ordinal)
+void* wp_cuda_context_create(int device_ordinal)
 {
     CUcontext ctx = NULL;
     CUdevice device;
@@ -2032,15 +2038,15 @@ void* cuda_context_create(int device_ordinal)
     return ctx;
 }

-void cuda_context_destroy(void* context)
+void wp_cuda_context_destroy(void* context)
 {
     if (context)
     {
         CUcontext ctx = static_cast<CUcontext>(context);

         // ensure this is not the current context
-        if (ctx == cuda_context_get_current())
-            cuda_context_set_current(NULL);
+        if (ctx == wp_cuda_context_get_current())
+            wp_cuda_context_set_current(NULL);

         // release the cached info about this context
         ContextInfo* info = get_context_info(ctx);
@@ -2059,7 +2065,7 @@ void cuda_context_destroy(void* context)
     }
 }

-void cuda_context_synchronize(void* context)
+void wp_cuda_context_synchronize(void* context)
 {
     ContextGuard guard(context);

@@ -2073,10 +2079,10 @@ void cuda_context_synchronize(void* context)

     unload_deferred_modules(context);

-    // check_cuda(cudaDeviceGraphMemTrim(cuda_context_get_device_ordinal(context)));
+    // check_cuda(cudaDeviceGraphMemTrim(wp_cuda_context_get_device_ordinal(context)));
 }

-uint64_t cuda_context_check(void* context)
+uint64_t wp_cuda_context_check(void* context)
 {
     ContextGuard guard(context);

@@ -2098,13 +2104,13 @@ uint64_t cuda_context_check(void* context)
 }


-int cuda_context_get_device_ordinal(void* context)
+int wp_cuda_context_get_device_ordinal(void* context)
 {
     ContextInfo* info = get_context_info(static_cast<CUcontext>(context));
     return info && info->device_info ? info->device_info->ordinal : -1;
 }

-int cuda_context_is_primary(void* context)
+int wp_cuda_context_is_primary(void* context)
 {
     CUcontext ctx = static_cast<CUcontext>(context);
     ContextInfo* context_info = get_context_info(ctx);
@@ -2131,7 +2137,7 @@ int cuda_context_is_primary(void* context)
     return 0;
 }

-void* cuda_context_get_stream(void* context)
+void* wp_cuda_context_get_stream(void* context)
 {
     ContextInfo* info = get_context_info(static_cast<CUcontext>(context));
     if (info)
@@ -2141,7 +2147,7 @@ void* cuda_context_get_stream(void* context)
     return NULL;
 }

-void cuda_context_set_stream(void* context, void* stream, int sync)
+void wp_cuda_context_set_stream(void* context, void* stream, int sync)
 {
     ContextInfo* context_info = get_context_info(static_cast<CUcontext>(context));
     if (context_info)
@@ -2165,7 +2171,7 @@ void cuda_context_set_stream(void* context, void* stream, int sync)
     }
 }

-int cuda_is_peer_access_supported(int target_ordinal, int peer_ordinal)
+int wp_cuda_is_peer_access_supported(int target_ordinal, int peer_ordinal)
 {
     int num_devices = int(g_devices.size());

@@ -2190,7 +2196,7 @@ int cuda_is_peer_access_supported(int target_ordinal, int peer_ordinal)
     return can_access;
 }

-int cuda_is_peer_access_enabled(void* target_context, void* peer_context)
+int wp_cuda_is_peer_access_enabled(void* target_context, void* peer_context)
 {
     if (!target_context || !peer_context)
     {
@@ -2201,8 +2207,8 @@ int cuda_is_peer_access_enabled(void* target_context, void* peer_context)
     if (target_context == peer_context)
         return 1;

-    int target_ordinal = cuda_context_get_device_ordinal(target_context);
-    int peer_ordinal = cuda_context_get_device_ordinal(peer_context);
+    int target_ordinal = wp_cuda_context_get_device_ordinal(target_context);
+    int peer_ordinal = wp_cuda_context_get_device_ordinal(peer_context);

     // check if peer access is supported
     int can_access = 0;
@@ -2235,7 +2241,7 @@ int cuda_is_peer_access_enabled(void* target_context, void* peer_context)
     }
 }

-int cuda_set_peer_access_enabled(void* target_context, void* peer_context, int enable)
+int wp_cuda_set_peer_access_enabled(void* target_context, void* peer_context, int enable)
 {
     if (!target_context || !peer_context)
     {
@@ -2246,8 +2252,8 @@ int cuda_set_peer_access_enabled(void* target_context, void* peer_context, int e
     if (target_context == peer_context)
         return 1; // no-op

-    int target_ordinal = cuda_context_get_device_ordinal(target_context);
-    int peer_ordinal = cuda_context_get_device_ordinal(peer_context);
+    int target_ordinal = wp_cuda_context_get_device_ordinal(target_context);
+    int peer_ordinal = wp_cuda_context_get_device_ordinal(peer_context);

     // check if peer access is supported
     int can_access = 0;
@@ -2292,7 +2298,7 @@ int cuda_set_peer_access_enabled(void* target_context, void* peer_context, int e
     return 1; // success
 }

-int cuda_is_mempool_access_enabled(int target_ordinal, int peer_ordinal)
+int wp_cuda_is_mempool_access_enabled(int target_ordinal, int peer_ordinal)
 {
     int num_devices = int(g_devices.size());

@@ -2328,7 +2334,7 @@ int cuda_is_mempool_access_enabled(int target_ordinal, int peer_ordinal)
     return 0;
 }

-int cuda_set_mempool_access_enabled(int target_ordinal, int peer_ordinal, int enable)
+int wp_cuda_set_mempool_access_enabled(int target_ordinal, int peer_ordinal, int enable)
 {
     int num_devices = int(g_devices.size());

@@ -2374,13 +2380,13 @@ int cuda_set_mempool_access_enabled(int target_ordinal, int peer_ordinal, int en
     return 1; // success
 }

-void cuda_ipc_get_mem_handle(void* ptr, char* out_buffer) {
+void wp_cuda_ipc_get_mem_handle(void* ptr, char* out_buffer) {
    CUipcMemHandle memHandle;
    check_cu(cuIpcGetMemHandle_f(&memHandle, (CUdeviceptr)ptr));
    memcpy(out_buffer, memHandle.reserved, CU_IPC_HANDLE_SIZE);
 }

-void* cuda_ipc_open_mem_handle(void* context, char* handle) {
+void* wp_cuda_ipc_open_mem_handle(void* context, char* handle) {
    ContextGuard guard(context);

    CUipcMemHandle memHandle;
@@ -2395,11 +2401,11 @@ void* cuda_ipc_open_mem_handle(void* context, char* handle) {
    return NULL;
 }

-void cuda_ipc_close_mem_handle(void* ptr) {
+void wp_cuda_ipc_close_mem_handle(void* ptr) {
    check_cu(cuIpcCloseMemHandle_f((CUdeviceptr) ptr));
 }

-void cuda_ipc_get_event_handle(void* context, void* event, char* out_buffer) {
+void wp_cuda_ipc_get_event_handle(void* context, void* event, char* out_buffer) {
    ContextGuard guard(context);

    CUipcEventHandle eventHandle;
@@ -2407,7 +2413,7 @@ void cuda_ipc_get_event_handle(void* context, void* event, char* out_buffer) {
    memcpy(out_buffer, eventHandle.reserved, CU_IPC_HANDLE_SIZE);
 }

-void* cuda_ipc_open_event_handle(void* context, char* handle) {
+void* wp_cuda_ipc_open_event_handle(void* context, char* handle) {
    ContextGuard guard(context);

    CUipcEventHandle eventHandle;
@@ -2421,31 +2427,31 @@ void* cuda_ipc_open_event_handle(void* context, char* handle) {
    return NULL;
 }

-void* cuda_stream_create(void* context, int priority)
+void* wp_cuda_stream_create(void* context, int priority)
 {
     ContextGuard guard(context, true);

     CUstream stream;
     if (check_cu(cuStreamCreateWithPriority_f(&stream, CU_STREAM_DEFAULT, priority)))
     {
-        cuda_stream_register(WP_CURRENT_CONTEXT, stream);
+        wp_cuda_stream_register(WP_CURRENT_CONTEXT, stream);
         return stream;
     }
     else
         return NULL;
 }

-void cuda_stream_destroy(void* context, void* stream)
+void wp_cuda_stream_destroy(void* context, void* stream)
 {
     if (!stream)
         return;

-    cuda_stream_unregister(context, stream);
+    wp_cuda_stream_unregister(context, stream);

     check_cu(cuStreamDestroy_f(static_cast<CUstream>(stream)));
 }

-int cuda_stream_query(void* stream)
+int wp_cuda_stream_query(void* stream)
 {
     CUresult res = cuStreamQuery_f(static_cast<CUstream>(stream));

@@ -2458,7 +2464,7 @@ int cuda_stream_query(void* stream)
     return res;
 }

-void cuda_stream_register(void* context, void* stream)
+void wp_cuda_stream_register(void* context, void* stream)
 {
     if (!stream)
         return;
@@ -2470,7 +2476,7 @@ void cuda_stream_register(void* context, void* stream)
     check_cu(cuEventCreate_f(&stream_info.cached_event, CU_EVENT_DISABLE_TIMING));
 }

-void cuda_stream_unregister(void* context, void* stream)
+void wp_cuda_stream_unregister(void* context, void* stream)
 {
     if (!stream)
         return;
@@ -2494,28 +2500,28 @@ void cuda_stream_unregister(void* context, void* stream)
     }
 }

-void* cuda_stream_get_current()
+void* wp_cuda_stream_get_current()
 {
     return get_current_stream();
 }

-void cuda_stream_synchronize(void* stream)
+void wp_cuda_stream_synchronize(void* stream)
 {
     check_cu(cuStreamSynchronize_f(static_cast<CUstream>(stream)));
 }

-void cuda_stream_wait_event(void* stream, void* event)
+void wp_cuda_stream_wait_event(void* stream, void* event)
 {
     check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
 }

-void cuda_stream_wait_stream(void* stream, void* other_stream, void* event)
+void wp_cuda_stream_wait_stream(void* stream, void* other_stream, void* event)
 {
     check_cu(cuEventRecord_f(static_cast<CUevent>(event), static_cast<CUstream>(other_stream)));
     check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
 }

-int cuda_stream_is_capturing(void* stream)
+int wp_cuda_stream_is_capturing(void* stream)
 {
     cudaStreamCaptureStatus status = cudaStreamCaptureStatusNone;
     check_cuda(cudaStreamIsCapturing(static_cast<cudaStream_t>(stream), &status));
@@ -2523,12 +2529,12 @@ int cuda_stream_is_capturing(void* stream)
     return int(status != cudaStreamCaptureStatusNone);
 }

-uint64_t cuda_stream_get_capture_id(void* stream)
+uint64_t wp_cuda_stream_get_capture_id(void* stream)
 {
     return get_capture_id(static_cast<CUstream>(stream));
 }

-int cuda_stream_get_priority(void* stream)
+int wp_cuda_stream_get_priority(void* stream)
 {
     int priority = 0;
     check_cuda(cuStreamGetPriority_f(static_cast<CUstream>(stream), &priority));
@@ -2536,7 +2542,7 @@ int cuda_stream_get_priority(void* stream)
     return priority;
 }

-void* cuda_event_create(void* context, unsigned flags)
+void* wp_cuda_event_create(void* context, unsigned flags)
 {
     ContextGuard guard(context, true);

@@ -2547,12 +2553,12 @@ void* cuda_event_create(void* context, unsigned flags)
     return NULL;
 }

-void cuda_event_destroy(void* event)
+void wp_cuda_event_destroy(void* event)
 {
     check_cu(cuEventDestroy_f(static_cast<CUevent>(event)));
 }

-int cuda_event_query(void* event)
+int wp_cuda_event_query(void* event)
 {
     CUresult res = cuEventQuery_f(static_cast<CUevent>(event));

@@ -2565,9 +2571,9 @@ int cuda_event_query(void* event)
     return res;
 }

-void cuda_event_record(void* event, void* stream, bool timing)
+void wp_cuda_event_record(void* event, void* stream, bool timing)
 {
-    if (timing && !g_captures.empty() && cuda_stream_is_capturing(stream))
+    if (timing && !g_captures.empty() && wp_cuda_stream_is_capturing(stream))
     {
         // record timing event during graph capture
         check_cu(cuEventRecordWithFlags_f(static_cast<CUevent>(event), static_cast<CUstream>(stream), CU_EVENT_RECORD_EXTERNAL));
@@ -2578,12 +2584,12 @@ void cuda_event_record(void* event, void* stream, bool timing)
     }
 }

-void cuda_event_synchronize(void* event)
+void wp_cuda_event_synchronize(void* event)
 {
     check_cu(cuEventSynchronize_f(static_cast<CUevent>(event)));
 }

-float cuda_event_elapsed_time(void* start_event, void* end_event)
+float wp_cuda_event_elapsed_time(void* start_event, void* end_event)
 {
     float elapsed = 0.0f;
     cudaEvent_t start = static_cast<cudaEvent_t>(start_event);
@@ -2592,7 +2598,7 @@ float cuda_event_elapsed_time(void* start_event, void* end_event)
     return elapsed;
 }

-bool cuda_graph_begin_capture(void* context, void* stream, int external)
+bool wp_cuda_graph_begin_capture(void* context, void* stream, int external)
 {
     ContextGuard guard(context);

@@ -2639,7 +2645,7 @@ bool cuda_graph_begin_capture(void* context, void* stream, int external)
     return true;
 }

-bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
+bool wp_cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
 {
     ContextGuard guard(context);

@@ -2774,14 +2780,14 @@ bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
     return true;
 }

-bool capture_debug_dot_print(void* graph, const char *path, uint32_t flags)
+bool wp_capture_debug_dot_print(void* graph, const char *path, uint32_t flags)
 {
     if (!check_cuda(cudaGraphDebugDotPrint((cudaGraph_t)graph, path, flags)))
         return false;
     return true;
 }

-bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret)
+bool wp_cuda_graph_create_exec(void* context, void* stream, void* graph, void** graph_exec_ret)
 {
     ContextGuard guard(context);

@@ -2789,6 +2795,13 @@ bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret)
     if (!check_cuda(cudaGraphInstantiateWithFlags(&graph_exec, (cudaGraph_t)graph, cudaGraphInstantiateFlagAutoFreeOnLaunch)))
         return false;

+    // Usually uploading the graph explicitly is optional, but when updating graph nodes (e.g., indirect dispatch)
+    // then the upload is required because otherwise the graph nodes that get updated might not yet be uploaded, which
+    // results in undefined behavior.
+    CUstream cuda_stream = static_cast<CUstream>(stream);
+    if (!check_cuda(cudaGraphUpload(graph_exec, cuda_stream)))
+        return false;
+
     if (graph_exec_ret)
         *graph_exec_ret = graph_exec;

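Note that wp_cuda_graph_create_exec() also gained a stream parameter so the instantiated graph can be uploaded before the function returns. A minimal host-side sketch of the same instantiate-then-upload sequence (assuming an already-captured cudaGraph_t and a valid cudaStream_t; illustrative, not taken from the diff):

    #include <cuda_runtime.h>

    bool create_and_upload(cudaGraph_t graph, cudaStream_t stream, cudaGraphExec_t* exec_out)
    {
        cudaGraphExec_t graph_exec = NULL;
        if (cudaGraphInstantiateWithFlags(&graph_exec, graph, cudaGraphInstantiateFlagAutoFreeOnLaunch) != cudaSuccess)
            return false;
        // pre-upload device state so later node updates (e.g., indirect dispatch) are safe
        if (cudaGraphUpload(graph_exec, stream) != cudaSuccess)
            return false;
        *exec_out = graph_exec;
        return true;
    }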
@@ -2927,7 +2940,7 @@ static CUfunction get_conditional_kernel(void* context, const char* name)
     return kernel;
 }

-bool cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
+bool wp_cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
 {
     ContextGuard guard(context);

@@ -2937,7 +2950,7 @@ bool cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
     return true;
 }

-bool cuda_graph_resume_capture(void* context, void* stream, void* graph)
+bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
 {
     ContextGuard guard(context);

@@ -2963,7 +2976,7 @@ bool cuda_graph_resume_capture(void* context, void* stream, void* graph)
 // https://developer.nvidia.com/blog/dynamic-control-flow-in-cuda-graphs-with-conditional-nodes/
 // condition is a gpu pointer
 // if_graph_ret and else_graph_ret should be NULL if not needed
-bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
+bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
 {
     bool has_if = if_graph_ret != NULL;
     bool has_else = else_graph_ret != NULL;
@@ -2978,21 +2991,21 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
     CUstream cuda_stream = static_cast<CUstream>(stream);

     // Get the current stream capturing graph
-    cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone;
+    CUstreamCaptureStatus capture_status = CU_STREAM_CAPTURE_STATUS_NONE;
     cudaGraph_t cuda_graph = NULL;
     const cudaGraphNode_t* capture_deps = NULL;
     size_t dep_count = 0;
-    if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
+    if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
         return false;

     // abort if not capturing
-    if (!cuda_graph || capture_status != cudaStreamCaptureStatusActive)
+    if (!cuda_graph || capture_status != CU_STREAM_CAPTURE_STATUS_ACTIVE)
     {
         wp::set_error_string("Stream is not capturing");
         return false;
     }

-    //int driver_version = cuda_driver_version();
+    //int driver_version = wp_cuda_driver_version();

     // IF-ELSE nodes are only supported with CUDA 12.8+
     // Somehow child graphs produce wrong results when an else branch is used
@@ -3000,7 +3013,7 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
     if (num_branches == 1 /*|| driver_version >= 12080*/)
     {
         cudaGraphConditionalHandle handle;
-        cudaGraphConditionalHandleCreate(&handle, cuda_graph);
+        check_cuda(cudaGraphConditionalHandleCreate(&handle, cuda_graph));

         // run a kernel to set the condition handle from the condition pointer
         // (need to negate the condition if only the else branch is used)
@@ -3020,22 +3033,23 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
         kernel_args[0] = &handle;
         kernel_args[1] = &condition;

-        if (!check_cuda(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
+        if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
            return false;

-        if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
+        if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
            return false;

         // create conditional node
-        cudaGraphNode_t condition_node;
-        cudaGraphNodeParams condition_params = { cudaGraphNodeTypeConditional };
+        CUgraphNode condition_node;
+        CUgraphNodeParams condition_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
         condition_params.conditional.handle = handle;
-        condition_params.conditional.type = cudaGraphCondTypeIf;
+        condition_params.conditional.type = CU_GRAPH_COND_TYPE_IF;
         condition_params.conditional.size = num_branches;
-        if (!check_cuda(cudaGraphAddNode(&condition_node, cuda_graph, capture_deps, dep_count, &condition_params)))
+        condition_params.conditional.ctx = get_current_context();
+        if (!check_cu(cuGraphAddNode_f(&condition_node, cuda_graph, capture_deps, NULL, dep_count, &condition_params)))
             return false;

-        if (!check_cuda(cudaStreamUpdateCaptureDependencies(cuda_stream, &condition_node, 1, cudaStreamSetCaptureDependencies)))
+        if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &condition_node, 1, cudaStreamSetCaptureDependencies)))
             return false;

         if (num_branches == 1)
@@ -3055,8 +3069,8 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
     {
         // Create IF node followed by an additional IF node with negated condition
         cudaGraphConditionalHandle if_handle, else_handle;
-        cudaGraphConditionalHandleCreate(&if_handle, cuda_graph);
-        cudaGraphConditionalHandleCreate(&else_handle, cuda_graph);
+        check_cuda(cudaGraphConditionalHandleCreate(&if_handle, cuda_graph));
+        check_cuda(cudaGraphConditionalHandleCreate(&else_handle, cuda_graph));

         CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_else_handles_kernel");
         if (!kernel)
@@ -3073,26 +3087,28 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
         if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
             return false;

-        if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
+        if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
            return false;

-        cudaGraphNode_t if_node;
-        cudaGraphNodeParams if_params = { cudaGraphNodeTypeConditional };
+        CUgraphNode if_node;
+        CUgraphNodeParams if_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
         if_params.conditional.handle = if_handle;
-        if_params.conditional.type = cudaGraphCondTypeIf;
+        if_params.conditional.type = CU_GRAPH_COND_TYPE_IF;
         if_params.conditional.size = 1;
-        if (!check_cuda(cudaGraphAddNode(&if_node, cuda_graph, capture_deps, dep_count, &if_params)))
+        if_params.conditional.ctx = get_current_context();
+        if (!check_cu(cuGraphAddNode_f(&if_node, cuda_graph, capture_deps, NULL, dep_count, &if_params)))
             return false;

-        cudaGraphNode_t else_node;
-        cudaGraphNodeParams else_params = { cudaGraphNodeTypeConditional };
+        CUgraphNode else_node;
+        CUgraphNodeParams else_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
         else_params.conditional.handle = else_handle;
-        else_params.conditional.type = cudaGraphCondTypeIf;
+        else_params.conditional.type = CU_GRAPH_COND_TYPE_IF;
         else_params.conditional.size = 1;
-        if (!check_cuda(cudaGraphAddNode(&else_node, cuda_graph, &if_node, 1, &else_params)))
+        else_params.conditional.ctx = get_current_context();
+        if (!check_cu(cuGraphAddNode_f(&else_node, cuda_graph, &if_node, NULL, 1, &else_params)))
             return false;

-        if (!check_cuda(cudaStreamUpdateCaptureDependencies(cuda_stream, &else_node, 1, cudaStreamSetCaptureDependencies)))
+        if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &else_node, 1, cudaStreamSetCaptureDependencies)))
             return false;

         *if_graph_ret = if_params.conditional.phGraph_out[0];
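
The pattern in the hunks above recurs through the rest of the conditional-node code: 1.9.0 moves from the CUDA runtime graph API to the driver API (cudaGraphNodeParams/cudaGraphAddNode become CUgraphNodeParams/cuGraphAddNode_f), the driver call takes an additional dependency edge-data argument (passed as NULL), and the conditional parameters gain an explicit ctx field filled with the current CUcontext. A condensed illustrative sketch of the driver-API sequence (assumes a valid capturing graph, dependency list, and conditional handle; cuGraphAddNode is called directly here rather than through Warp's _f dynamic-loading wrappers):

    #include <cuda.h>

    bool add_if_node(CUgraph graph, const CUgraphNode* deps, size_t ndeps,
                     CUgraphConditionalHandle handle, CUcontext ctx, CUgraph* body_out)
    {
        CUgraphNode node;
        CUgraphNodeParams params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
        params.conditional.handle = handle;
        params.conditional.type = CU_GRAPH_COND_TYPE_IF;
        params.conditional.size = 1;
        params.conditional.ctx = ctx;  // the driver API makes the owning context explicit
        // the extra NULL argument is the per-dependency edge data, absent from cudaGraphAddNode
        if (cuGraphAddNode(&node, graph, deps, NULL, ndeps, &params) != CUDA_SUCCESS)
            return false;
        *body_out = params.conditional.phGraph_out[0];  // body graph to populate, as above
        return true;
    }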
@@ -3102,21 +3118,143 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
3102
3118
  return true;
3103
3119
  }
3104
3120
 
3105
- bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
3121
+ // graph node type names for intelligible error reporting
3122
+ static const char* get_graph_node_type_name(CUgraphNodeType type)
3123
+ {
3124
+ static const std::unordered_map<CUgraphNodeType, const char*> names
3125
+ {
3126
+ {CU_GRAPH_NODE_TYPE_KERNEL, "kernel launch"},
3127
+ {CU_GRAPH_NODE_TYPE_MEMCPY, "memcpy"},
3128
+ {CU_GRAPH_NODE_TYPE_MEMSET, "memset"},
3129
+ {CU_GRAPH_NODE_TYPE_HOST, "host execution"},
3130
+ {CU_GRAPH_NODE_TYPE_GRAPH, "graph launch"},
3131
+ {CU_GRAPH_NODE_TYPE_EMPTY, "empty node"},
3132
+ {CU_GRAPH_NODE_TYPE_WAIT_EVENT, "event wait"},
3133
+ {CU_GRAPH_NODE_TYPE_EVENT_RECORD, "event record"},
3134
+ {CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL, "semaphore signal"},
3135
+ {CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT, "semaphore wait"},
3136
+ {CU_GRAPH_NODE_TYPE_MEM_ALLOC, "memory allocation"},
3137
+ {CU_GRAPH_NODE_TYPE_MEM_FREE, "memory deallocation"},
3138
+ {CU_GRAPH_NODE_TYPE_BATCH_MEM_OP, "batched mem op"},
3139
+ {CU_GRAPH_NODE_TYPE_CONDITIONAL, "conditional node"},
3140
+ };
3141
+
3142
+ auto it = names.find(type);
3143
+ if (it != names.end())
3144
+ return it->second;
3145
+ else
3146
+ return "unknown node";
3147
+ }
3148
+
3149
+ // check if a graph can be launched as a child graph
3150
+ static bool is_valid_child_graph(void* child_graph)
3151
+ {
3152
+ // disallowed child graph nodes according to the documentation of cuGraphAddChildGraphNode()
3153
+ static const std::unordered_set<CUgraphNodeType> disallowed_nodes
3154
+ {
3155
+ CU_GRAPH_NODE_TYPE_MEM_ALLOC,
3156
+ CU_GRAPH_NODE_TYPE_MEM_FREE,
3157
+ CU_GRAPH_NODE_TYPE_CONDITIONAL,
3158
+ };
3159
+
3160
+ if (!child_graph)
3161
+ {
3162
+ wp::set_error_string("Child graph is null");
3163
+ return false;
3164
+ }
3165
+
3166
+ size_t num_nodes = 0;
3167
+ if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)child_graph, NULL, &num_nodes)))
3168
+ return false;
3169
+ std::vector<cudaGraphNode_t> nodes(num_nodes);
3170
+ if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)child_graph, nodes.data(), &num_nodes)))
3171
+ return false;
3172
+
3173
+ for (size_t i = 0; i < num_nodes; i++)
3174
+ {
3175
+ // note: we use the driver API to get the node type, otherwise some nodes are not recognized correctly
3176
+ CUgraphNodeType node_type;
3177
+ check_cu(cuGraphNodeGetType_f(nodes[i], &node_type));
3178
+ auto it = disallowed_nodes.find(node_type);
3179
+ if (it != disallowed_nodes.end())
3180
+ {
3181
+ wp::set_error_string("Child graph contains an unsupported operation (%s)", get_graph_node_type_name(node_type));
3182
+ return false;
3183
+ }
3184
+ }
3185
+
3186
+ return true;
3187
+ }
3188
+
3189
+ // check if a graph can be used as a conditional body graph
3190
+ // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#condtional-node-body-graph-requirements
3191
+ bool wp_cuda_graph_check_conditional_body(void* body_graph)
3106
3192
  {
3193
+ static const std::unordered_set<CUgraphNodeType> allowed_nodes
3194
+ {
3195
+ CU_GRAPH_NODE_TYPE_MEMCPY,
3196
+ CU_GRAPH_NODE_TYPE_MEMSET,
3197
+ CU_GRAPH_NODE_TYPE_KERNEL,
3198
+ CU_GRAPH_NODE_TYPE_GRAPH,
3199
+ CU_GRAPH_NODE_TYPE_EMPTY,
3200
+ CU_GRAPH_NODE_TYPE_CONDITIONAL,
3201
+ };
3202
+
3203
+ if (!body_graph)
3204
+ {
3205
+ wp::set_error_string("Conditional body graph is null");
3206
+ return false;
3207
+ }
3208
+
3209
+ size_t num_nodes = 0;
3210
+ if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)body_graph, NULL, &num_nodes)))
3211
+ return false;
3212
+ std::vector<cudaGraphNode_t> nodes(num_nodes);
3213
+ if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)body_graph, nodes.data(), &num_nodes)))
3214
+ return false;
3215
+
3216
+ for (size_t i = 0; i < num_nodes; i++)
3217
+ {
3218
+ // note: we use the driver API to get the node type, otherwise some nodes are not recognized correctly
3219
+ CUgraphNodeType node_type;
3220
+ check_cu(cuGraphNodeGetType_f(nodes[i], &node_type));
3221
+ if (allowed_nodes.find(node_type) == allowed_nodes.end())
3222
+ {
3223
+ wp::set_error_string("Conditional body graph contains an unsupported operation (%s)", get_graph_node_type_name(node_type));
3224
+ return false;
3225
+ }
3226
+ else if (node_type == CU_GRAPH_NODE_TYPE_GRAPH)
3227
+ {
3228
+ // check nested child graphs recursively
3229
+ cudaGraph_t child_graph = NULL;
3230
+ if (!check_cuda(cudaGraphChildGraphNodeGetGraph(nodes[i], &child_graph)))
3231
+ return false;
3232
+ if (!wp_cuda_graph_check_conditional_body(child_graph))
3233
+ return false;
3234
+ }
3235
+ }
3236
+
3237
+ return true;
3238
+ }
3239
+
+bool wp_cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
+{
+    if (!is_valid_child_graph(child_graph))
+        return false;
+
     ContextGuard guard(context);
 
     CUstream cuda_stream = static_cast<CUstream>(stream);
 
     // Get the current stream capturing graph
-    cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone;
+    CUstreamCaptureStatus capture_status = CU_STREAM_CAPTURE_STATUS_NONE;
     void* cuda_graph = NULL;
-    const cudaGraphNode_t* capture_deps = NULL;
+    const CUgraphNode* capture_deps = NULL;
     size_t dep_count = 0;
-    if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, (cudaGraph_t*)&cuda_graph, &capture_deps, &dep_count)))
+    if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, (cudaGraph_t*)&cuda_graph, &capture_deps, &dep_count)))
         return false;
 
-    if (!cuda_graph_pause_capture(context, cuda_stream, &cuda_graph))
+    if (!wp_cuda_graph_pause_capture(context, cuda_stream, &cuda_graph))
         return false;
 
     cudaGraphNode_t body_node;
@@ -3126,16 +3264,16 @@ bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_grap
                                    static_cast<cudaGraph_t>(child_graph))))
         return false;
 
-    if (!cuda_graph_resume_capture(context, cuda_stream, cuda_graph))
+    if (!wp_cuda_graph_resume_capture(context, cuda_stream, cuda_graph))
         return false;
 
-    if (!check_cuda(cudaStreamUpdateCaptureDependencies(cuda_stream, &body_node, 1, cudaStreamSetCaptureDependencies)))
+    if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &body_node, 1, cudaStreamSetCaptureDependencies)))
         return false;
 
     return true;
 }
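A sketch of the intended call sequence (illustrative only; Warp's Python layer drives this internally): while `stream` is being captured, a previously built graph is spliced in as a single child-graph node, and later captured work depends on it. `prebuilt_graph` and the launch helpers are placeholders.

    check_cuda(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    launch_pre_work(stream);                                            // captured as usual
    wp_cuda_graph_insert_child_graph(context, stream, prebuilt_graph);  // validated, then spliced
    launch_post_work(stream);                                           // depends on the child node
    cudaGraph_t captured = NULL;
    check_cuda(cudaStreamEndCapture(stream, &captured));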
 
-bool cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
+bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
 {
     // if there's no body, it's a no-op
     if (!body_graph_ret)
@@ -3146,15 +3284,15 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
     CUstream cuda_stream = static_cast<CUstream>(stream);
 
     // Get the current stream capturing graph
-    cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone;
+    CUstreamCaptureStatus capture_status = CU_STREAM_CAPTURE_STATUS_NONE;
     cudaGraph_t cuda_graph = NULL;
     const cudaGraphNode_t* capture_deps = NULL;
     size_t dep_count = 0;
-    if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
+    if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
         return false;
 
     // abort if not capturing
-    if (!cuda_graph || capture_status != cudaStreamCaptureStatusActive)
+    if (!cuda_graph || capture_status != CU_STREAM_CAPTURE_STATUS_ACTIVE)
     {
         wp::set_error_string("Stream is not capturing");
         return false;
@@ -3179,19 +3317,20 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
     if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
         return false;
 
-    if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
+    if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
         return false;
 
     // insert conditional graph node
-    cudaGraphNode_t while_node;
-    cudaGraphNodeParams while_params = { cudaGraphNodeTypeConditional };
+    CUgraphNode while_node;
+    CUgraphNodeParams while_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
     while_params.conditional.handle = handle;
-    while_params.conditional.type = cudaGraphCondTypeWhile;
+    while_params.conditional.type = CU_GRAPH_COND_TYPE_WHILE;
     while_params.conditional.size = 1;
-    if (!check_cuda(cudaGraphAddNode(&while_node, cuda_graph, capture_deps, dep_count, &while_params)))
+    while_params.conditional.ctx = get_current_context();
+    if (!check_cu(cuGraphAddNode_f(&while_node, cuda_graph, capture_deps, NULL, dep_count, &while_params)))
         return false;
 
-    if (!check_cuda(cudaStreamUpdateCaptureDependencies(cuda_stream, &while_node, 1, cudaStreamSetCaptureDependencies)))
+    if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &while_node, 1, cudaStreamSetCaptureDependencies)))
         return false;
 
     *body_graph_ret = while_params.conditional.phGraph_out[0];
@@ -3200,7 +3339,7 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
     return true;
 }
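Putting the pieces together, a hedged sketch of the capture-time pattern these exports enable; the body-capture step is only indicated, and `body_stream` is a placeholder. The key detail is that the body must refresh the conditional handle as its final step so the WHILE node knows whether to iterate again.

    void* body_graph = NULL;
    uint64_t handle = 0;
    if (!wp_cuda_graph_insert_while(context, stream, condition, &body_graph, &handle))
        return false;
    // ... capture the loop body into `body_graph` (e.g. via cudaStreamBeginCaptureToGraph) ...
    // as the body's last step, re-evaluate *condition into the handle:
    wp_cuda_graph_set_condition(context, body_stream, condition, handle);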
 
-bool cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
+bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
 {
     ContextGuard guard(context);
 
@@ -3227,37 +3366,43 @@ bool cuda_graph_set_condition(void* context, void* stream, int* condition, uint6
 #else
 // stubs for conditional graph node API if CUDA toolkit is too old.
 
-bool cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
+bool wp_cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
+{
+    wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
+    return false;
+}
+
+bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
 }
 
-bool cuda_graph_resume_capture(void* context, void* stream, void* graph)
+bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
 }
 
-bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
+bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
 }
 
-bool cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
+bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
 }
 
-bool cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
+bool wp_cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
 }
 
-bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
+bool wp_cuda_graph_check_conditional_body(void* body_graph)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
@@ -3266,7 +3411,7 @@ bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_grap
 #endif // support for conditional graph nodes
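The stubs above are compiled in when the toolkit predates conditional graph nodes. Schematically, the guard structure looks like the following; the exact condition Warp tests is not visible in this hunk, and `CUDA_VERSION` is the standard macro from cuda.h, where 12.4 encodes as 12040:

    #if CUDA_VERSION >= 12040
        // real implementations using CUgraphNodeParams, CU_GRAPH_COND_TYPE_WHILE, ...
    #else
        // stubs that record an error via wp::set_error_string() and return false
    #endif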
 
 
-bool cuda_graph_launch(void* graph_exec, void* stream)
+bool wp_cuda_graph_launch(void* graph_exec, void* stream)
 {
     // TODO: allow naming graphs?
     begin_cuda_range(WP_TIMING_GRAPH, stream, get_stream_context(stream), "graph");
@@ -3278,14 +3423,14 @@ bool cuda_graph_launch(void* graph_exec, void* stream)
     return result;
 }
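For context, a compact sketch of the lifecycle around this launch helper, filling in the steps this hunk elides with standard CUDA runtime calls (instantiation happens elsewhere in this file; error handling is omitted, and the destroy helpers appear just below):

    cudaGraphExec_t exec = NULL;
    check_cuda(cudaGraphInstantiate(&exec, captured_graph, 0));  // CUDA 12 three-argument form
    wp_cuda_graph_launch(exec, stream);                          // timed via WP_TIMING_GRAPH
    wp_cuda_graph_exec_destroy(context, exec);
    wp_cuda_graph_destroy(context, captured_graph);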
 
-bool cuda_graph_destroy(void* context, void* graph)
+bool wp_cuda_graph_destroy(void* context, void* graph)
 {
     ContextGuard guard(context);
 
     return check_cuda(cudaGraphDestroy((cudaGraph_t)graph));
 }
 
-bool cuda_graph_exec_destroy(void* context, void* graph_exec)
+bool wp_cuda_graph_exec_destroy(void* context, void* graph_exec)
 {
     ContextGuard guard(context);
 
@@ -3337,7 +3482,7 @@ bool write_file(const char* data, size_t size, std::string filename, const char*
 }
 #endif
 
-size_t cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, bool lineinfo, bool compile_time_trace, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes, int* ltoir_input_types)
+size_t wp_cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, bool lineinfo, bool compile_time_trace, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes, int* ltoir_input_types)
 {
     // use file extension to determine whether to output PTX or CUBIN
     const char* output_ext = strrchr(output_path, '.');
@@ -3393,9 +3538,9 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
     {
         opts.push_back("--define-macro=_DEBUG");
         opts.push_back("--generate-line-info");
-
-        // disabling since it causes issues with `Unresolved extern function 'cudaGetParameterBufferV2'
-        //opts.push_back("--device-debug");
+#ifndef _WIN32
+        opts.push_back("--device-debug"); // -G
+#endif
     }
     else
     {
@@ -3665,7 +3810,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
     }
 }
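The debug branch now passes `--device-debug` to NVRTC on non-Windows builds instead of leaving it commented out. A self-contained illustration of that option set, independent of Warp's wrapper (the real option lists in this function are much longer):

    #include <nvrtc.h>
    #include <vector>

    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, cuda_src, "example.cu", 0, NULL, NULL);
    std::vector<const char*> opts = { "--define-macro=_DEBUG", "--generate-line-info" };
    #ifndef _WIN32
    opts.push_back("--device-debug");  // -G: full device-side debug info
    #endif
    nvrtcResult res = nvrtcCompileProgram(prog, (int)opts.size(), opts.data());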
 
-bool cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size)
+bool wp_cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size)
 {
 
     CHECK_ANY(ltoir_output_path != nullptr);
@@ -3711,7 +3856,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
     return res;
 }
 
-bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int arrangement_A, int arrangement_B, int arrangement_C, int num_threads)
+bool wp_cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int arrangement_A, int arrangement_B, int arrangement_C, int num_threads)
 {
 
     CHECK_ANY(ltoir_output_path != nullptr);
@@ -3756,7 +3901,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
     return res;
 }
 
-bool cuda_compile_solver(const char* fatbin_output_path, const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int NRHS, int function, int side, int diag, int precision, int arrangement_A, int arrangement_B, int fill_mode, int num_threads)
+bool wp_cuda_compile_solver(const char* fatbin_output_path, const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int NRHS, int function, int side, int diag, int precision, int arrangement_A, int arrangement_B, int fill_mode, int num_threads)
 {
 
     CHECK_ANY(ltoir_output_path != nullptr);
@@ -3819,7 +3964,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
 
 #endif
 
-void* cuda_load_module(void* context, const char* path)
+void* wp_cuda_load_module(void* context, const char* path)
 {
     ContextGuard guard(context);
 
@@ -3938,7 +4083,7 @@ void* cuda_load_module(void* context, const char* path)
     return module;
 }
 
-void cuda_unload_module(void* context, void* module)
+void wp_cuda_unload_module(void* context, void* module)
 {
     // ensure there are no graph captures in progress
     if (g_captures.empty())
@@ -3957,7 +4102,7 @@ void cuda_unload_module(void* context, void* module)
 }
 
 
-int cuda_get_max_shared_memory(void* context)
+int wp_cuda_get_max_shared_memory(void* context)
 {
     ContextInfo* info = get_context_info(context);
     if (!info)
@@ -3967,7 +4112,7 @@ int cuda_get_max_shared_memory(void* context)
     return max_smem_bytes;
 }
 
-bool cuda_configure_kernel_shared_memory(void* kernel, int size)
+bool wp_cuda_configure_kernel_shared_memory(void* kernel, int size)
 {
     int requested_smem_bytes = size;
 
@@ -3979,7 +4124,7 @@ bool cuda_configure_kernel_shared_memory(void* kernel, int size)
     return true;
 }
 
-void* cuda_get_kernel(void* context, void* module, const char* name)
+void* wp_cuda_get_kernel(void* context, void* module, const char* name)
 {
     ContextGuard guard(context);
 
@@ -3994,7 +4139,7 @@ void* cuda_get_kernel(void* context, void* module, const char* name)
     return kernel;
 }
 
-size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, int block_dim, int shared_memory_bytes, void** args, void* stream)
+size_t wp_cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, int block_dim, int shared_memory_bytes, void** args, void* stream)
 {
     ContextGuard guard(context);
 
@@ -4048,21 +4193,21 @@ size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_block
     return res;
 }
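Taken together, the module and kernel exports keep the familiar load → lookup → launch shape. A hedged caller-side sketch; the module path, kernel name, sizes, and argument packing are placeholders (Warp's Python driver marshals parameters differently):

    void* module = wp_cuda_load_module(context, "kernels.ptx");               // PTX or CUBIN
    void* kernel = wp_cuda_get_kernel(context, module, "my_kernel_forward");  // placeholder name
    int n = 1 << 20;
    void* data = NULL;  // stands in for a device pointer
    void* args[] = { &n, &data };
    wp_cuda_launch_kernel(context, kernel, /*dim=*/n, /*max_blocks=*/0,
                          /*block_dim=*/256, /*shared_memory_bytes=*/0, args, stream);
    wp_cuda_unload_module(context, module);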
 
-void cuda_graphics_map(void* context, void* resource)
+void wp_cuda_graphics_map(void* context, void* resource)
 {
     ContextGuard guard(context);
 
     check_cu(cuGraphicsMapResources_f(1, (CUgraphicsResource*)resource, get_current_stream()));
 }
 
-void cuda_graphics_unmap(void* context, void* resource)
+void wp_cuda_graphics_unmap(void* context, void* resource)
 {
     ContextGuard guard(context);
 
     check_cu(cuGraphicsUnmapResources_f(1, (CUgraphicsResource*)resource, get_current_stream()));
 }
 
-void cuda_graphics_device_ptr_and_size(void* context, void* resource, uint64_t* ptr, size_t* size)
+void wp_cuda_graphics_device_ptr_and_size(void* context, void* resource, uint64_t* ptr, size_t* size)
 {
     ContextGuard guard(context);
 
@@ -4074,7 +4219,7 @@ void cuda_graphics_device_ptr_and_size(void* context, void* resource, uint64_t*
     *size = bytes;
 }
 
-void* cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsigned int flags)
+void* wp_cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsigned int flags)
 {
     ContextGuard guard(context);
 
@@ -4089,7 +4234,7 @@ void* cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsign
     return resource;
 }
 
-void cuda_graphics_unregister_resource(void* context, void* resource)
+void wp_cuda_graphics_unregister_resource(void* context, void* resource)
 {
     ContextGuard guard(context);
 
@@ -4098,25 +4243,25 @@ void cuda_graphics_unregister_resource(void* context, void* resource)
     delete res;
 }
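These five exports form the usual CUDA/OpenGL interop cycle. A hedged round-trip sketch; `gl_vbo_id` comes from the application, and the `0` flags value is the standard `cudaGraphicsRegisterFlagsNone`:

    void* res = wp_cuda_graphics_register_gl_buffer(context, gl_vbo_id, 0);  // once, at startup
    wp_cuda_graphics_map(context, res);                                      // each frame
    uint64_t dev_ptr = 0;
    size_t bytes = 0;
    wp_cuda_graphics_device_ptr_and_size(context, res, &dev_ptr, &bytes);    // CUDA writes here
    wp_cuda_graphics_unmap(context, res);                                    // hand back to GL
    wp_cuda_graphics_unregister_resource(context, res);                      // once, at shutdown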
 
-void cuda_timing_begin(int flags)
+void wp_cuda_timing_begin(int flags)
 {
     g_cuda_timing_state = new CudaTimingState(flags, g_cuda_timing_state);
 }
 
-int cuda_timing_get_result_count()
+int wp_cuda_timing_get_result_count()
 {
     if (g_cuda_timing_state)
         return int(g_cuda_timing_state->ranges.size());
     return 0;
 }
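The timing API is a begin/count/end protocol around a stack of timing states (nested sessions push and pop `g_cuda_timing_state`). A hedged caller-side sketch, assuming `timing_result_t` and the timing flags from Warp's headers; the workload is a placeholder, and `wp_cuda_timing_end` is defined just below:

    wp_cuda_timing_begin(timing_flags);         // push a new timing state
    run_workload(stream);                       // placeholder for the work being timed
    int n = wp_cuda_timing_get_result_count();  // ranges recorded so far
    std::vector<timing_result_t> results(n);
    wp_cuda_timing_end(results.data(), n);      // resolve events and pop the state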
 
-void cuda_timing_end(timing_result_t* results, int size)
+void wp_cuda_timing_end(timing_result_t* results, int size)
 {
     if (!g_cuda_timing_state)
         return;
 
     // number of results to write to the user buffer
-    int count = std::min(cuda_timing_get_result_count(), size);
+    int count = std::min(wp_cuda_timing_get_result_count(), size);
 
     // compute timings and write results
     for (int i = 0; i < count; i++)
@@ -4150,7 +4295,6 @@ void cuda_timing_end(timing_result_t* results, int size)
 #include "reduce.cu"
 #include "runlength_encode.cu"
 #include "scan.cu"
-#include "marching.cu"
 #include "sparse.cu"
 #include "volume.cu"
 #include "volume_builder.cu"