warp-lang 1.8.1__py3-none-manylinux_2_34_aarch64.whl → 1.9.0__py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134)
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +482 -110
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +47 -67
  7. warp/builtins.py +955 -137
  8. warp/codegen.py +312 -206
  9. warp/config.py +1 -1
  10. warp/context.py +1249 -784
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/fabric.py +1 -1
  18. warp/fem/cache.py +27 -19
  19. warp/fem/domain.py +2 -2
  20. warp/fem/field/nodal_field.py +2 -2
  21. warp/fem/field/virtual.py +264 -166
  22. warp/fem/geometry/geometry.py +5 -5
  23. warp/fem/integrate.py +129 -51
  24. warp/fem/space/restriction.py +4 -0
  25. warp/fem/space/shape/tet_shape_function.py +3 -10
  26. warp/jax_experimental/custom_call.py +1 -1
  27. warp/jax_experimental/ffi.py +2 -1
  28. warp/marching_cubes.py +708 -0
  29. warp/native/array.h +99 -4
  30. warp/native/builtin.h +82 -5
  31. warp/native/bvh.cpp +64 -28
  32. warp/native/bvh.cu +58 -58
  33. warp/native/bvh.h +2 -2
  34. warp/native/clang/clang.cpp +7 -7
  35. warp/native/coloring.cpp +8 -2
  36. warp/native/crt.cpp +2 -2
  37. warp/native/crt.h +3 -5
  38. warp/native/cuda_util.cpp +41 -10
  39. warp/native/cuda_util.h +10 -4
  40. warp/native/exports.h +1842 -1908
  41. warp/native/fabric.h +2 -1
  42. warp/native/hashgrid.cpp +37 -37
  43. warp/native/hashgrid.cu +2 -2
  44. warp/native/initializer_array.h +1 -1
  45. warp/native/intersect.h +2 -2
  46. warp/native/mat.h +1910 -116
  47. warp/native/mathdx.cpp +43 -43
  48. warp/native/mesh.cpp +24 -24
  49. warp/native/mesh.cu +26 -26
  50. warp/native/mesh.h +4 -2
  51. warp/native/nanovdb/GridHandle.h +179 -12
  52. warp/native/nanovdb/HostBuffer.h +8 -7
  53. warp/native/nanovdb/NanoVDB.h +517 -895
  54. warp/native/nanovdb/NodeManager.h +323 -0
  55. warp/native/nanovdb/PNanoVDB.h +2 -2
  56. warp/native/quat.h +331 -14
  57. warp/native/range.h +7 -1
  58. warp/native/reduce.cpp +10 -10
  59. warp/native/reduce.cu +13 -14
  60. warp/native/runlength_encode.cpp +2 -2
  61. warp/native/runlength_encode.cu +5 -5
  62. warp/native/scan.cpp +3 -3
  63. warp/native/scan.cu +4 -4
  64. warp/native/sort.cpp +10 -10
  65. warp/native/sort.cu +22 -22
  66. warp/native/sparse.cpp +8 -8
  67. warp/native/sparse.cu +13 -13
  68. warp/native/spatial.h +366 -17
  69. warp/native/temp_buffer.h +2 -2
  70. warp/native/tile.h +283 -69
  71. warp/native/vec.h +381 -14
  72. warp/native/volume.cpp +54 -54
  73. warp/native/volume.cu +1 -1
  74. warp/native/volume.h +2 -1
  75. warp/native/volume_builder.cu +30 -37
  76. warp/native/warp.cpp +150 -149
  77. warp/native/warp.cu +323 -192
  78. warp/native/warp.h +227 -226
  79. warp/optim/linear.py +736 -271
  80. warp/render/imgui_manager.py +289 -0
  81. warp/render/render_opengl.py +85 -6
  82. warp/sim/graph_coloring.py +2 -2
  83. warp/sparse.py +558 -175
  84. warp/tests/aux_test_module_aot.py +7 -0
  85. warp/tests/cuda/test_async.py +3 -3
  86. warp/tests/cuda/test_conditional_captures.py +101 -0
  87. warp/tests/geometry/test_marching_cubes.py +233 -12
  88. warp/tests/sim/test_coloring.py +6 -6
  89. warp/tests/test_array.py +56 -5
  90. warp/tests/test_codegen.py +3 -2
  91. warp/tests/test_context.py +8 -15
  92. warp/tests/test_enum.py +136 -0
  93. warp/tests/test_examples.py +2 -2
  94. warp/tests/test_fem.py +45 -2
  95. warp/tests/test_fixedarray.py +229 -0
  96. warp/tests/test_func.py +18 -15
  97. warp/tests/test_future_annotations.py +7 -5
  98. warp/tests/test_linear_solvers.py +30 -0
  99. warp/tests/test_map.py +1 -1
  100. warp/tests/test_mat.py +1518 -378
  101. warp/tests/test_mat_assign_copy.py +178 -0
  102. warp/tests/test_mat_constructors.py +574 -0
  103. warp/tests/test_module_aot.py +287 -0
  104. warp/tests/test_print.py +69 -0
  105. warp/tests/test_quat.py +140 -34
  106. warp/tests/test_quat_assign_copy.py +145 -0
  107. warp/tests/test_reload.py +2 -1
  108. warp/tests/test_sparse.py +71 -0
  109. warp/tests/test_spatial.py +140 -34
  110. warp/tests/test_spatial_assign_copy.py +160 -0
  111. warp/tests/test_struct.py +43 -3
  112. warp/tests/test_types.py +0 -20
  113. warp/tests/test_vec.py +179 -34
  114. warp/tests/test_vec_assign_copy.py +143 -0
  115. warp/tests/tile/test_tile.py +184 -18
  116. warp/tests/tile/test_tile_cholesky.py +605 -0
  117. warp/tests/tile/test_tile_load.py +169 -0
  118. warp/tests/tile/test_tile_mathdx.py +2 -558
  119. warp/tests/tile/test_tile_matmul.py +1 -1
  120. warp/tests/tile/test_tile_mlp.py +1 -1
  121. warp/tests/tile/test_tile_shared_memory.py +5 -5
  122. warp/tests/unittest_suites.py +6 -0
  123. warp/tests/walkthrough_debug.py +1 -1
  124. warp/thirdparty/unittest_parallel.py +108 -9
  125. warp/types.py +554 -264
  126. warp/utils.py +68 -86
  127. {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
  128. {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/RECORD +131 -121
  129. warp/native/marching.cpp +0 -19
  130. warp/native/marching.cu +0 -514
  131. warp/native/marching.h +0 -19
  132. {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
  133. {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
  134. {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
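Most of the churn in warp/native/warp.cu below comes from renaming the exported native entry points with a wp_ prefix (for example, cuda_device_get_count() becomes wp_cuda_device_get_count()) while leaving signatures unchanged. As a hedged illustration only, here is what that rename means for a hypothetical out-of-tree C++ caller that loads warp.so directly; the declaration mirrors a signature visible in the diff, but the loader itself is illustrative and not part of Warp (Warp's own Python bindings go through ctypes):

#include <dlfcn.h>
#include <cstdio>

// 1.8.1 exported:  int cuda_device_get_count();
// 1.9.0 exports:   int wp_cuda_device_get_count();
typedef int (*wp_cuda_device_get_count_t)();

int main()
{
    // warp/bin/warp.so is the native library shipped inside the wheel
    void* lib = dlopen("warp/bin/warp.so", RTLD_NOW);
    if (!lib)
        return 1;

    // resolving the old, unprefixed symbol fails against the 1.9.0 binary;
    // callers must switch to the wp_-prefixed name
    auto fn = (wp_cuda_device_get_count_t)dlsym(lib, "wp_cuda_device_get_count");
    if (fn)
        printf("CUDA devices: %d\n", fn());

    dlclose(lib);
    return 0;
}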
warp/native/warp.cu CHANGED
@@ -168,7 +168,7 @@ struct ContextInfo
168
168
  {
169
169
  DeviceInfo* device_info = NULL;
170
170
 
171
- // the current stream, managed from Python (see cuda_context_set_stream() and cuda_context_get_stream())
171
+ // the current stream, managed from Python (see wp_cuda_context_set_stream() and wp_cuda_context_get_stream())
172
172
  CUstream stream = NULL;
173
173
 
174
174
  // conditional graph node support, loaded on demand if the driver supports it (CUDA 12.4+)
@@ -237,11 +237,11 @@ static std::unordered_map<CUstream, StreamInfo> g_streams;
237
237
 
238
238
  // Ongoing graph captures registered using wp.capture_begin().
239
239
  // This maps the capture id to the stream where capture was started.
240
- // See cuda_graph_begin_capture(), cuda_graph_end_capture(), and free_device_async().
240
+ // See wp_cuda_graph_begin_capture(), wp_cuda_graph_end_capture(), and wp_free_device_async().
241
241
  static std::unordered_map<uint64_t, CaptureInfo*> g_captures;
242
242
 
243
243
  // Memory allocated during graph capture requires special handling.
244
- // See alloc_device_async() and free_device_async().
244
+ // See wp_alloc_device_async() and wp_free_device_async().
245
245
  static std::unordered_map<void*, GraphAllocInfo> g_graph_allocs;
246
246
 
247
247
  // Memory that cannot be freed immediately gets queued here.
@@ -252,12 +252,12 @@ static std::vector<FreeInfo> g_deferred_free_list;
252
252
  // Call unload_deferred_modules() to release.
253
253
  static std::vector<ModuleInfo> g_deferred_module_list;
254
254
 
255
- void cuda_set_context_restore_policy(bool always_restore)
255
+ void wp_cuda_set_context_restore_policy(bool always_restore)
256
256
  {
257
257
  ContextGuard::always_restore = always_restore;
258
258
  }
259
259
 
260
- int cuda_get_context_restore_policy()
260
+ int wp_cuda_get_context_restore_policy()
261
261
  {
262
262
  return int(ContextGuard::always_restore);
263
263
  }
@@ -348,7 +348,7 @@ static inline CUcontext get_current_context()
348
348
 
349
349
  static inline CUstream get_current_stream(void* context=NULL)
350
350
  {
351
- return static_cast<CUstream>(cuda_context_get_stream(context));
351
+ return static_cast<CUstream>(wp_cuda_context_get_stream(context));
352
352
  }
353
353
 
354
354
  static ContextInfo* get_context_info(CUcontext ctx)
@@ -481,7 +481,7 @@ static int unload_deferred_modules(void* context = NULL)
481
481
  const ModuleInfo& module_info = *it;
482
482
  if (module_info.context == context || !context)
483
483
  {
484
- cuda_unload_module(module_info.context, module_info.module);
484
+ wp_cuda_unload_module(module_info.context, module_info.module);
485
485
  ++num_unloaded_modules;
486
486
  it = g_deferred_module_list.erase(it);
487
487
  }
@@ -535,41 +535,41 @@ static inline const char* get_cuda_kernel_name(void* kernel)
535
535
  }
536
536
 
537
537
 
538
- void* alloc_pinned(size_t s)
538
+ void* wp_alloc_pinned(size_t s)
539
539
  {
540
540
  void* ptr = NULL;
541
541
  check_cuda(cudaMallocHost(&ptr, s));
542
542
  return ptr;
543
543
  }
544
544
 
545
- void free_pinned(void* ptr)
545
+ void wp_free_pinned(void* ptr)
546
546
  {
547
547
  cudaFreeHost(ptr);
548
548
  }
549
549
 
550
- void* alloc_device(void* context, size_t s)
550
+ void* wp_alloc_device(void* context, size_t s)
551
551
  {
552
- int ordinal = cuda_context_get_device_ordinal(context);
552
+ int ordinal = wp_cuda_context_get_device_ordinal(context);
553
553
 
554
554
  // use stream-ordered allocator if available
555
- if (cuda_device_is_mempool_supported(ordinal))
556
- return alloc_device_async(context, s);
555
+ if (wp_cuda_device_is_mempool_supported(ordinal))
556
+ return wp_alloc_device_async(context, s);
557
557
  else
558
- return alloc_device_default(context, s);
558
+ return wp_alloc_device_default(context, s);
559
559
  }
560
560
 
561
- void free_device(void* context, void* ptr)
561
+ void wp_free_device(void* context, void* ptr)
562
562
  {
563
- int ordinal = cuda_context_get_device_ordinal(context);
563
+ int ordinal = wp_cuda_context_get_device_ordinal(context);
564
564
 
565
565
  // use stream-ordered allocator if available
566
- if (cuda_device_is_mempool_supported(ordinal))
567
- free_device_async(context, ptr);
566
+ if (wp_cuda_device_is_mempool_supported(ordinal))
567
+ wp_free_device_async(context, ptr);
568
568
  else
569
- free_device_default(context, ptr);
569
+ wp_free_device_default(context, ptr);
570
570
  }
571
571
 
572
- void* alloc_device_default(void* context, size_t s)
572
+ void* wp_alloc_device_default(void* context, size_t s)
573
573
  {
574
574
  ContextGuard guard(context);
575
575
 
@@ -579,7 +579,7 @@ void* alloc_device_default(void* context, size_t s)
579
579
  return ptr;
580
580
  }
581
581
 
582
- void free_device_default(void* context, void* ptr)
582
+ void wp_free_device_default(void* context, void* ptr)
583
583
  {
584
584
  ContextGuard guard(context);
585
585
 
@@ -595,7 +595,7 @@ void free_device_default(void* context, void* ptr)
595
595
  }
596
596
  }
597
597
 
598
- void* alloc_device_async(void* context, size_t s)
598
+ void* wp_alloc_device_async(void* context, size_t s)
599
599
  {
600
600
  // stream-ordered allocations don't rely on the current context,
601
601
  // but we set the context here for consistent behaviour
@@ -613,7 +613,7 @@ void* alloc_device_async(void* context, size_t s)
613
613
  if (ptr)
614
614
  {
615
615
  // if the stream is capturing, the allocation requires special handling
616
- if (cuda_stream_is_capturing(stream))
616
+ if (wp_cuda_stream_is_capturing(stream))
617
617
  {
618
618
  // check if this is a known capture
619
619
  uint64_t capture_id = get_capture_id(stream);
@@ -634,7 +634,7 @@ void* alloc_device_async(void* context, size_t s)
634
634
  return ptr;
635
635
  }
636
636
 
637
- void free_device_async(void* context, void* ptr)
637
+ void wp_free_device_async(void* context, void* ptr)
638
638
  {
639
639
  // stream-ordered allocators generally don't rely on the current context,
640
640
  // but we set the context here for consistent behaviour
@@ -732,7 +732,7 @@ void free_device_async(void* context, void* ptr)
732
732
  }
733
733
  }
734
734
 
735
- bool memcpy_h2d(void* context, void* dest, void* src, size_t n, void* stream)
735
+ bool wp_memcpy_h2d(void* context, void* dest, void* src, size_t n, void* stream)
736
736
  {
737
737
  ContextGuard guard(context);
738
738
 
@@ -751,7 +751,7 @@ bool memcpy_h2d(void* context, void* dest, void* src, size_t n, void* stream)
751
751
  return result;
752
752
  }
753
753
 
754
- bool memcpy_d2h(void* context, void* dest, void* src, size_t n, void* stream)
754
+ bool wp_memcpy_d2h(void* context, void* dest, void* src, size_t n, void* stream)
755
755
  {
756
756
  ContextGuard guard(context);
757
757
 
@@ -770,7 +770,7 @@ bool memcpy_d2h(void* context, void* dest, void* src, size_t n, void* stream)
770
770
  return result;
771
771
  }
772
772
 
773
- bool memcpy_d2d(void* context, void* dest, void* src, size_t n, void* stream)
773
+ bool wp_memcpy_d2d(void* context, void* dest, void* src, size_t n, void* stream)
774
774
  {
775
775
  ContextGuard guard(context);
776
776
 
@@ -789,7 +789,7 @@ bool memcpy_d2d(void* context, void* dest, void* src, size_t n, void* stream)
789
789
  return result;
790
790
  }
791
791
 
792
- bool memcpy_p2p(void* dst_context, void* dst, void* src_context, void* src, size_t n, void* stream)
792
+ bool wp_memcpy_p2p(void* dst_context, void* dst, void* src_context, void* src, size_t n, void* stream)
793
793
  {
794
794
  // ContextGuard guard(context);
795
795
 
@@ -809,7 +809,7 @@ bool memcpy_p2p(void* dst_context, void* dst, void* src_context, void* src, size
809
809
  // because cudaMemPoolGetAccess() cannot be called during graph capture.
810
810
  // - CUDA will report error 1 (invalid argument) if cudaMemcpyAsync() is called but mempool access is not enabled.
811
811
 
812
- if (!cuda_stream_is_capturing(stream))
812
+ if (!wp_cuda_stream_is_capturing(stream))
813
813
  {
814
814
  begin_cuda_range(WP_TIMING_MEMCPY, cuda_stream, get_stream_context(stream), "memcpy PtoP");
815
815
 
@@ -896,7 +896,7 @@ __global__ void memset_kernel(int* dest, int value, size_t n)
896
896
  }
897
897
  }
898
898
 
899
- void memset_device(void* context, void* dest, int value, size_t n)
899
+ void wp_memset_device(void* context, void* dest, int value, size_t n)
900
900
  {
901
901
  ContextGuard guard(context);
902
902
 
@@ -940,7 +940,7 @@ __global__ void memtile_value_kernel(T* dst, T value, size_t n)
940
940
  }
941
941
  }
942
942
 
943
- void memtile_device(void* context, void* dst, const void* src, size_t srcsize, size_t n)
943
+ void wp_memtile_device(void* context, void* dst, const void* src, size_t srcsize, size_t n)
944
944
  {
945
945
  ContextGuard guard(context);
946
946
 
@@ -976,12 +976,12 @@ void memtile_device(void* context, void* dst, const void* src, size_t srcsize, s
976
976
 
977
977
  // copy value to device memory
978
978
  // TODO: use a persistent stream-local staging buffer to avoid allocs?
979
- void* src_devptr = alloc_device(WP_CURRENT_CONTEXT, srcsize);
979
+ void* src_devptr = wp_alloc_device(WP_CURRENT_CONTEXT, srcsize);
980
980
  check_cuda(cudaMemcpyAsync(src_devptr, src, srcsize, cudaMemcpyHostToDevice, get_current_stream()));
981
981
 
982
982
  wp_launch_device(WP_CURRENT_CONTEXT, memtile_kernel, n, (dst, src_devptr, srcsize, n));
983
983
 
984
- free_device(WP_CURRENT_CONTEXT, src_devptr);
984
+ wp_free_device(WP_CURRENT_CONTEXT, src_devptr);
985
985
 
986
986
  }
987
987
  }
@@ -1208,7 +1208,7 @@ static __global__ void array_copy_fabric_indexed_to_fabric_indexed_kernel(wp::in
1208
1208
  }
1209
1209
 
1210
1210
 
1211
- WP_API bool array_copy_device(void* context, void* dst, void* src, int dst_type, int src_type, int elem_size)
1211
+ WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_type, int src_type, int elem_size)
1212
1212
  {
1213
1213
  if (!src || !dst)
1214
1214
  return false;
@@ -1600,7 +1600,7 @@ static __global__ void array_fill_fabric_indexed_kernel(wp::indexedfabricarray_t
1600
1600
  }
1601
1601
 
1602
1602
 
1603
- WP_API void array_fill_device(void* context, void* arr_ptr, int arr_type, const void* value_ptr, int value_size)
1603
+ WP_API void wp_array_fill_device(void* context, void* arr_ptr, int arr_type, const void* value_ptr, int value_size)
1604
1604
  {
1605
1605
  if (!arr_ptr || !value_ptr)
1606
1606
  return;
@@ -1656,7 +1656,7 @@ WP_API void array_fill_device(void* context, void* arr_ptr, int arr_type, const
1656
1656
 
1657
1657
  // copy value to device memory
1658
1658
  // TODO: use a persistent stream-local staging buffer to avoid allocs?
1659
- void* value_devptr = alloc_device(WP_CURRENT_CONTEXT, value_size);
1659
+ void* value_devptr = wp_alloc_device(WP_CURRENT_CONTEXT, value_size);
1660
1660
  check_cuda(cudaMemcpyAsync(value_devptr, value_ptr, value_size, cudaMemcpyHostToDevice, get_current_stream()));
1661
1661
 
1662
1662
  // handle fabric arrays
@@ -1714,20 +1714,20 @@ WP_API void array_fill_device(void* context, void* arr_ptr, int arr_type, const
1714
1714
  return;
1715
1715
  }
1716
1716
 
1717
- free_device(WP_CURRENT_CONTEXT, value_devptr);
1717
+ wp_free_device(WP_CURRENT_CONTEXT, value_devptr);
1718
1718
  }
1719
1719
 
1720
- void array_scan_int_device(uint64_t in, uint64_t out, int len, bool inclusive)
1720
+ void wp_array_scan_int_device(uint64_t in, uint64_t out, int len, bool inclusive)
1721
1721
  {
1722
1722
  scan_device((const int*)in, (int*)out, len, inclusive);
1723
1723
  }
1724
1724
 
1725
- void array_scan_float_device(uint64_t in, uint64_t out, int len, bool inclusive)
1725
+ void wp_array_scan_float_device(uint64_t in, uint64_t out, int len, bool inclusive)
1726
1726
  {
1727
1727
  scan_device((const float*)in, (float*)out, len, inclusive);
1728
1728
  }
1729
1729
 
1730
- int cuda_driver_version()
1730
+ int wp_cuda_driver_version()
1731
1731
  {
1732
1732
  int version;
1733
1733
  if (check_cu(cuDriverGetVersion_f(&version)))
@@ -1736,17 +1736,17 @@ int cuda_driver_version()
1736
1736
  return 0;
1737
1737
  }
1738
1738
 
1739
- int cuda_toolkit_version()
1739
+ int wp_cuda_toolkit_version()
1740
1740
  {
1741
1741
  return CUDA_VERSION;
1742
1742
  }
1743
1743
 
1744
- bool cuda_driver_is_initialized()
1744
+ bool wp_cuda_driver_is_initialized()
1745
1745
  {
1746
1746
  return is_cuda_driver_initialized();
1747
1747
  }
1748
1748
 
1749
- int nvrtc_supported_arch_count()
1749
+ int wp_nvrtc_supported_arch_count()
1750
1750
  {
1751
1751
  int count;
1752
1752
  if (check_nvrtc(nvrtcGetNumSupportedArchs(&count)))
@@ -1755,7 +1755,7 @@ int nvrtc_supported_arch_count()
1755
1755
  return 0;
1756
1756
  }
1757
1757
 
1758
- void nvrtc_supported_archs(int* archs)
1758
+ void wp_nvrtc_supported_archs(int* archs)
1759
1759
  {
1760
1760
  if (archs)
1761
1761
  {
@@ -1763,14 +1763,14 @@ void nvrtc_supported_archs(int* archs)
1763
1763
  }
1764
1764
  }
1765
1765
 
1766
- int cuda_device_get_count()
1766
+ int wp_cuda_device_get_count()
1767
1767
  {
1768
1768
  int count = 0;
1769
1769
  check_cu(cuDeviceGetCount_f(&count));
1770
1770
  return count;
1771
1771
  }
1772
1772
 
1773
- void* cuda_device_get_primary_context(int ordinal)
1773
+ void* wp_cuda_device_get_primary_context(int ordinal)
1774
1774
  {
1775
1775
  if (ordinal >= 0 && ordinal < int(g_devices.size()))
1776
1776
  {
@@ -1786,75 +1786,75 @@ void* cuda_device_get_primary_context(int ordinal)
1786
1786
  return NULL;
1787
1787
  }
1788
1788
 
1789
- const char* cuda_device_get_name(int ordinal)
1789
+ const char* wp_cuda_device_get_name(int ordinal)
1790
1790
  {
1791
1791
  if (ordinal >= 0 && ordinal < int(g_devices.size()))
1792
1792
  return g_devices[ordinal].name;
1793
1793
  return NULL;
1794
1794
  }
1795
1795
 
1796
- int cuda_device_get_arch(int ordinal)
1796
+ int wp_cuda_device_get_arch(int ordinal)
1797
1797
  {
1798
1798
  if (ordinal >= 0 && ordinal < int(g_devices.size()))
1799
1799
  return g_devices[ordinal].arch;
1800
1800
  return 0;
1801
1801
  }
1802
1802
 
1803
- int cuda_device_get_sm_count(int ordinal)
1803
+ int wp_cuda_device_get_sm_count(int ordinal)
1804
1804
  {
1805
1805
  if (ordinal >= 0 && ordinal < int(g_devices.size()))
1806
1806
  return g_devices[ordinal].sm_count;
1807
1807
  return 0;
1808
1808
  }
1809
1809
 
1810
- void cuda_device_get_uuid(int ordinal, char uuid[16])
1810
+ void wp_cuda_device_get_uuid(int ordinal, char uuid[16])
1811
1811
  {
1812
1812
  memcpy(uuid, g_devices[ordinal].uuid.bytes, sizeof(char)*16);
1813
1813
  }
1814
1814
 
1815
- int cuda_device_get_pci_domain_id(int ordinal)
1815
+ int wp_cuda_device_get_pci_domain_id(int ordinal)
1816
1816
  {
1817
1817
  if (ordinal >= 0 && ordinal < int(g_devices.size()))
1818
1818
  return g_devices[ordinal].pci_domain_id;
1819
1819
  return -1;
1820
1820
  }
1821
1821
 
1822
- int cuda_device_get_pci_bus_id(int ordinal)
1822
+ int wp_cuda_device_get_pci_bus_id(int ordinal)
1823
1823
  {
1824
1824
  if (ordinal >= 0 && ordinal < int(g_devices.size()))
1825
1825
  return g_devices[ordinal].pci_bus_id;
1826
1826
  return -1;
1827
1827
  }
1828
1828
 
1829
- int cuda_device_get_pci_device_id(int ordinal)
1829
+ int wp_cuda_device_get_pci_device_id(int ordinal)
1830
1830
  {
1831
1831
  if (ordinal >= 0 && ordinal < int(g_devices.size()))
1832
1832
  return g_devices[ordinal].pci_device_id;
1833
1833
  return -1;
1834
1834
  }
1835
1835
 
1836
- int cuda_device_is_uva(int ordinal)
1836
+ int wp_cuda_device_is_uva(int ordinal)
1837
1837
  {
1838
1838
  if (ordinal >= 0 && ordinal < int(g_devices.size()))
1839
1839
  return g_devices[ordinal].is_uva;
1840
1840
  return 0;
1841
1841
  }
1842
1842
 
1843
- int cuda_device_is_mempool_supported(int ordinal)
1843
+ int wp_cuda_device_is_mempool_supported(int ordinal)
1844
1844
  {
1845
1845
  if (ordinal >= 0 && ordinal < int(g_devices.size()))
1846
1846
  return g_devices[ordinal].is_mempool_supported;
1847
1847
  return 0;
1848
1848
  }
1849
1849
 
1850
- int cuda_device_is_ipc_supported(int ordinal)
1850
+ int wp_cuda_device_is_ipc_supported(int ordinal)
1851
1851
  {
1852
1852
  if (ordinal >= 0 && ordinal < int(g_devices.size()))
1853
1853
  return g_devices[ordinal].is_ipc_supported;
1854
1854
  return 0;
1855
1855
  }
1856
1856
 
1857
- int cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold)
1857
+ int wp_cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold)
1858
1858
  {
1859
1859
  if (ordinal < 0 || ordinal > int(g_devices.size()))
1860
1860
  {
@@ -1881,7 +1881,7 @@ int cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold)
1881
1881
  return 1; // success
1882
1882
  }
1883
1883
 
1884
- uint64_t cuda_device_get_mempool_release_threshold(int ordinal)
1884
+ uint64_t wp_cuda_device_get_mempool_release_threshold(int ordinal)
1885
1885
  {
1886
1886
  if (ordinal < 0 || ordinal > int(g_devices.size()))
1887
1887
  {
@@ -1909,7 +1909,7 @@ uint64_t cuda_device_get_mempool_release_threshold(int ordinal)
1909
1909
  return threshold;
1910
1910
  }
1911
1911
 
1912
- uint64_t cuda_device_get_mempool_used_mem_current(int ordinal)
1912
+ uint64_t wp_cuda_device_get_mempool_used_mem_current(int ordinal)
1913
1913
  {
1914
1914
  if (ordinal < 0 || ordinal > int(g_devices.size()))
1915
1915
  {
@@ -1937,7 +1937,7 @@ uint64_t cuda_device_get_mempool_used_mem_current(int ordinal)
1937
1937
  return mem_used;
1938
1938
  }
1939
1939
 
1940
- uint64_t cuda_device_get_mempool_used_mem_high(int ordinal)
1940
+ uint64_t wp_cuda_device_get_mempool_used_mem_high(int ordinal)
1941
1941
  {
1942
1942
  if (ordinal < 0 || ordinal > int(g_devices.size()))
1943
1943
  {
@@ -1965,7 +1965,7 @@ uint64_t cuda_device_get_mempool_used_mem_high(int ordinal)
1965
1965
  return mem_high_water_mark;
1966
1966
  }
1967
1967
 
1968
- void cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_mem)
1968
+ void wp_cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_mem)
1969
1969
  {
1970
1970
  // use temporary storage if user didn't specify pointers
1971
1971
  size_t tmp_free_mem, tmp_total_mem;
@@ -2002,12 +2002,12 @@ void cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_me
2002
2002
  }
2003
2003
 
2004
2004
 
2005
- void* cuda_context_get_current()
2005
+ void* wp_cuda_context_get_current()
2006
2006
  {
2007
2007
  return get_current_context();
2008
2008
  }
2009
2009
 
2010
- void cuda_context_set_current(void* context)
2010
+ void wp_cuda_context_set_current(void* context)
2011
2011
  {
2012
2012
  CUcontext ctx = static_cast<CUcontext>(context);
2013
2013
  CUcontext prev_ctx = NULL;
@@ -2018,18 +2018,18 @@ void cuda_context_set_current(void* context)
2018
2018
  }
2019
2019
  }
2020
2020
 
2021
- void cuda_context_push_current(void* context)
2021
+ void wp_cuda_context_push_current(void* context)
2022
2022
  {
2023
2023
  check_cu(cuCtxPushCurrent_f(static_cast<CUcontext>(context)));
2024
2024
  }
2025
2025
 
2026
- void cuda_context_pop_current()
2026
+ void wp_cuda_context_pop_current()
2027
2027
  {
2028
2028
  CUcontext context;
2029
2029
  check_cu(cuCtxPopCurrent_f(&context));
2030
2030
  }
2031
2031
 
2032
- void* cuda_context_create(int device_ordinal)
2032
+ void* wp_cuda_context_create(int device_ordinal)
2033
2033
  {
2034
2034
  CUcontext ctx = NULL;
2035
2035
  CUdevice device;
@@ -2038,15 +2038,15 @@ void* cuda_context_create(int device_ordinal)
2038
2038
  return ctx;
2039
2039
  }
2040
2040
 
2041
- void cuda_context_destroy(void* context)
2041
+ void wp_cuda_context_destroy(void* context)
2042
2042
  {
2043
2043
  if (context)
2044
2044
  {
2045
2045
  CUcontext ctx = static_cast<CUcontext>(context);
2046
2046
 
2047
2047
  // ensure this is not the current context
2048
- if (ctx == cuda_context_get_current())
2049
- cuda_context_set_current(NULL);
2048
+ if (ctx == wp_cuda_context_get_current())
2049
+ wp_cuda_context_set_current(NULL);
2050
2050
 
2051
2051
  // release the cached info about this context
2052
2052
  ContextInfo* info = get_context_info(ctx);
@@ -2065,7 +2065,7 @@ void cuda_context_destroy(void* context)
2065
2065
  }
2066
2066
  }
2067
2067
 
2068
- void cuda_context_synchronize(void* context)
2068
+ void wp_cuda_context_synchronize(void* context)
2069
2069
  {
2070
2070
  ContextGuard guard(context);
2071
2071
 
@@ -2079,10 +2079,10 @@ void cuda_context_synchronize(void* context)
2079
2079
 
2080
2080
  unload_deferred_modules(context);
2081
2081
 
2082
- // check_cuda(cudaDeviceGraphMemTrim(cuda_context_get_device_ordinal(context)));
2082
+ // check_cuda(cudaDeviceGraphMemTrim(wp_cuda_context_get_device_ordinal(context)));
2083
2083
  }
2084
2084
 
2085
- uint64_t cuda_context_check(void* context)
2085
+ uint64_t wp_cuda_context_check(void* context)
2086
2086
  {
2087
2087
  ContextGuard guard(context);
2088
2088
 
@@ -2104,13 +2104,13 @@ uint64_t cuda_context_check(void* context)
2104
2104
  }
2105
2105
 
2106
2106
 
2107
- int cuda_context_get_device_ordinal(void* context)
2107
+ int wp_cuda_context_get_device_ordinal(void* context)
2108
2108
  {
2109
2109
  ContextInfo* info = get_context_info(static_cast<CUcontext>(context));
2110
2110
  return info && info->device_info ? info->device_info->ordinal : -1;
2111
2111
  }
2112
2112
 
2113
- int cuda_context_is_primary(void* context)
2113
+ int wp_cuda_context_is_primary(void* context)
2114
2114
  {
2115
2115
  CUcontext ctx = static_cast<CUcontext>(context);
2116
2116
  ContextInfo* context_info = get_context_info(ctx);
@@ -2137,7 +2137,7 @@ int cuda_context_is_primary(void* context)
2137
2137
  return 0;
2138
2138
  }
2139
2139
 
2140
- void* cuda_context_get_stream(void* context)
2140
+ void* wp_cuda_context_get_stream(void* context)
2141
2141
  {
2142
2142
  ContextInfo* info = get_context_info(static_cast<CUcontext>(context));
2143
2143
  if (info)
@@ -2147,7 +2147,7 @@ void* cuda_context_get_stream(void* context)
2147
2147
  return NULL;
2148
2148
  }
2149
2149
 
2150
- void cuda_context_set_stream(void* context, void* stream, int sync)
2150
+ void wp_cuda_context_set_stream(void* context, void* stream, int sync)
2151
2151
  {
2152
2152
  ContextInfo* context_info = get_context_info(static_cast<CUcontext>(context));
2153
2153
  if (context_info)
@@ -2171,7 +2171,7 @@ void cuda_context_set_stream(void* context, void* stream, int sync)
2171
2171
  }
2172
2172
  }
2173
2173
 
2174
- int cuda_is_peer_access_supported(int target_ordinal, int peer_ordinal)
2174
+ int wp_cuda_is_peer_access_supported(int target_ordinal, int peer_ordinal)
2175
2175
  {
2176
2176
  int num_devices = int(g_devices.size());
2177
2177
 
@@ -2196,7 +2196,7 @@ int cuda_is_peer_access_supported(int target_ordinal, int peer_ordinal)
2196
2196
  return can_access;
2197
2197
  }
2198
2198
 
2199
- int cuda_is_peer_access_enabled(void* target_context, void* peer_context)
2199
+ int wp_cuda_is_peer_access_enabled(void* target_context, void* peer_context)
2200
2200
  {
2201
2201
  if (!target_context || !peer_context)
2202
2202
  {
@@ -2207,8 +2207,8 @@ int cuda_is_peer_access_enabled(void* target_context, void* peer_context)
2207
2207
  if (target_context == peer_context)
2208
2208
  return 1;
2209
2209
 
2210
- int target_ordinal = cuda_context_get_device_ordinal(target_context);
2211
- int peer_ordinal = cuda_context_get_device_ordinal(peer_context);
2210
+ int target_ordinal = wp_cuda_context_get_device_ordinal(target_context);
2211
+ int peer_ordinal = wp_cuda_context_get_device_ordinal(peer_context);
2212
2212
 
2213
2213
  // check if peer access is supported
2214
2214
  int can_access = 0;
@@ -2241,7 +2241,7 @@ int cuda_is_peer_access_enabled(void* target_context, void* peer_context)
2241
2241
  }
2242
2242
  }
2243
2243
 
2244
- int cuda_set_peer_access_enabled(void* target_context, void* peer_context, int enable)
2244
+ int wp_cuda_set_peer_access_enabled(void* target_context, void* peer_context, int enable)
2245
2245
  {
2246
2246
  if (!target_context || !peer_context)
2247
2247
  {
@@ -2252,8 +2252,8 @@ int cuda_set_peer_access_enabled(void* target_context, void* peer_context, int e
2252
2252
  if (target_context == peer_context)
2253
2253
  return 1; // no-op
2254
2254
 
2255
- int target_ordinal = cuda_context_get_device_ordinal(target_context);
2256
- int peer_ordinal = cuda_context_get_device_ordinal(peer_context);
2255
+ int target_ordinal = wp_cuda_context_get_device_ordinal(target_context);
2256
+ int peer_ordinal = wp_cuda_context_get_device_ordinal(peer_context);
2257
2257
 
2258
2258
  // check if peer access is supported
2259
2259
  int can_access = 0;
@@ -2298,7 +2298,7 @@ int cuda_set_peer_access_enabled(void* target_context, void* peer_context, int e
2298
2298
  return 1; // success
2299
2299
  }
2300
2300
 
2301
- int cuda_is_mempool_access_enabled(int target_ordinal, int peer_ordinal)
2301
+ int wp_cuda_is_mempool_access_enabled(int target_ordinal, int peer_ordinal)
2302
2302
  {
2303
2303
  int num_devices = int(g_devices.size());
2304
2304
 
@@ -2334,7 +2334,7 @@ int cuda_is_mempool_access_enabled(int target_ordinal, int peer_ordinal)
2334
2334
  return 0;
2335
2335
  }
2336
2336
 
2337
- int cuda_set_mempool_access_enabled(int target_ordinal, int peer_ordinal, int enable)
2337
+ int wp_cuda_set_mempool_access_enabled(int target_ordinal, int peer_ordinal, int enable)
2338
2338
  {
2339
2339
  int num_devices = int(g_devices.size());
2340
2340
 
@@ -2380,13 +2380,13 @@ int cuda_set_mempool_access_enabled(int target_ordinal, int peer_ordinal, int en
2380
2380
  return 1; // success
2381
2381
  }
2382
2382
 
2383
- void cuda_ipc_get_mem_handle(void* ptr, char* out_buffer) {
2383
+ void wp_cuda_ipc_get_mem_handle(void* ptr, char* out_buffer) {
2384
2384
  CUipcMemHandle memHandle;
2385
2385
  check_cu(cuIpcGetMemHandle_f(&memHandle, (CUdeviceptr)ptr));
2386
2386
  memcpy(out_buffer, memHandle.reserved, CU_IPC_HANDLE_SIZE);
2387
2387
  }
2388
2388
 
2389
- void* cuda_ipc_open_mem_handle(void* context, char* handle) {
2389
+ void* wp_cuda_ipc_open_mem_handle(void* context, char* handle) {
2390
2390
  ContextGuard guard(context);
2391
2391
 
2392
2392
  CUipcMemHandle memHandle;
@@ -2401,11 +2401,11 @@ void* cuda_ipc_open_mem_handle(void* context, char* handle) {
2401
2401
  return NULL;
2402
2402
  }
2403
2403
 
2404
- void cuda_ipc_close_mem_handle(void* ptr) {
2404
+ void wp_cuda_ipc_close_mem_handle(void* ptr) {
2405
2405
  check_cu(cuIpcCloseMemHandle_f((CUdeviceptr) ptr));
2406
2406
  }
2407
2407
 
2408
- void cuda_ipc_get_event_handle(void* context, void* event, char* out_buffer) {
2408
+ void wp_cuda_ipc_get_event_handle(void* context, void* event, char* out_buffer) {
2409
2409
  ContextGuard guard(context);
2410
2410
 
2411
2411
  CUipcEventHandle eventHandle;
@@ -2413,7 +2413,7 @@ void cuda_ipc_get_event_handle(void* context, void* event, char* out_buffer) {
2413
2413
  memcpy(out_buffer, eventHandle.reserved, CU_IPC_HANDLE_SIZE);
2414
2414
  }
2415
2415
 
2416
- void* cuda_ipc_open_event_handle(void* context, char* handle) {
2416
+ void* wp_cuda_ipc_open_event_handle(void* context, char* handle) {
2417
2417
  ContextGuard guard(context);
2418
2418
 
2419
2419
  CUipcEventHandle eventHandle;
@@ -2427,31 +2427,31 @@ void* cuda_ipc_open_event_handle(void* context, char* handle) {
2427
2427
  return NULL;
2428
2428
  }
2429
2429
 
2430
- void* cuda_stream_create(void* context, int priority)
2430
+ void* wp_cuda_stream_create(void* context, int priority)
2431
2431
  {
2432
2432
  ContextGuard guard(context, true);
2433
2433
 
2434
2434
  CUstream stream;
2435
2435
  if (check_cu(cuStreamCreateWithPriority_f(&stream, CU_STREAM_DEFAULT, priority)))
2436
2436
  {
2437
- cuda_stream_register(WP_CURRENT_CONTEXT, stream);
2437
+ wp_cuda_stream_register(WP_CURRENT_CONTEXT, stream);
2438
2438
  return stream;
2439
2439
  }
2440
2440
  else
2441
2441
  return NULL;
2442
2442
  }
2443
2443
 
2444
- void cuda_stream_destroy(void* context, void* stream)
2444
+ void wp_cuda_stream_destroy(void* context, void* stream)
2445
2445
  {
2446
2446
  if (!stream)
2447
2447
  return;
2448
2448
 
2449
- cuda_stream_unregister(context, stream);
2449
+ wp_cuda_stream_unregister(context, stream);
2450
2450
 
2451
2451
  check_cu(cuStreamDestroy_f(static_cast<CUstream>(stream)));
2452
2452
  }
2453
2453
 
2454
- int cuda_stream_query(void* stream)
2454
+ int wp_cuda_stream_query(void* stream)
2455
2455
  {
2456
2456
  CUresult res = cuStreamQuery_f(static_cast<CUstream>(stream));
2457
2457
 
@@ -2464,7 +2464,7 @@ int cuda_stream_query(void* stream)
2464
2464
  return res;
2465
2465
  }
2466
2466
 
2467
- void cuda_stream_register(void* context, void* stream)
2467
+ void wp_cuda_stream_register(void* context, void* stream)
2468
2468
  {
2469
2469
  if (!stream)
2470
2470
  return;
@@ -2476,7 +2476,7 @@ void cuda_stream_register(void* context, void* stream)
2476
2476
  check_cu(cuEventCreate_f(&stream_info.cached_event, CU_EVENT_DISABLE_TIMING));
2477
2477
  }
2478
2478
 
2479
- void cuda_stream_unregister(void* context, void* stream)
2479
+ void wp_cuda_stream_unregister(void* context, void* stream)
2480
2480
  {
2481
2481
  if (!stream)
2482
2482
  return;
@@ -2500,28 +2500,28 @@ void cuda_stream_unregister(void* context, void* stream)
2500
2500
  }
2501
2501
  }
2502
2502
 
2503
- void* cuda_stream_get_current()
2503
+ void* wp_cuda_stream_get_current()
2504
2504
  {
2505
2505
  return get_current_stream();
2506
2506
  }
2507
2507
 
2508
- void cuda_stream_synchronize(void* stream)
2508
+ void wp_cuda_stream_synchronize(void* stream)
2509
2509
  {
2510
2510
  check_cu(cuStreamSynchronize_f(static_cast<CUstream>(stream)));
2511
2511
  }
2512
2512
 
2513
- void cuda_stream_wait_event(void* stream, void* event)
2513
+ void wp_cuda_stream_wait_event(void* stream, void* event)
2514
2514
  {
2515
2515
  check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
2516
2516
  }
2517
2517
 
2518
- void cuda_stream_wait_stream(void* stream, void* other_stream, void* event)
2518
+ void wp_cuda_stream_wait_stream(void* stream, void* other_stream, void* event)
2519
2519
  {
2520
2520
  check_cu(cuEventRecord_f(static_cast<CUevent>(event), static_cast<CUstream>(other_stream)));
2521
2521
  check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
2522
2522
  }
2523
2523
 
2524
- int cuda_stream_is_capturing(void* stream)
2524
+ int wp_cuda_stream_is_capturing(void* stream)
2525
2525
  {
2526
2526
  cudaStreamCaptureStatus status = cudaStreamCaptureStatusNone;
2527
2527
  check_cuda(cudaStreamIsCapturing(static_cast<cudaStream_t>(stream), &status));
@@ -2529,12 +2529,12 @@ int cuda_stream_is_capturing(void* stream)
2529
2529
  return int(status != cudaStreamCaptureStatusNone);
2530
2530
  }
2531
2531
 
2532
- uint64_t cuda_stream_get_capture_id(void* stream)
2532
+ uint64_t wp_cuda_stream_get_capture_id(void* stream)
2533
2533
  {
2534
2534
  return get_capture_id(static_cast<CUstream>(stream));
2535
2535
  }
2536
2536
 
2537
- int cuda_stream_get_priority(void* stream)
2537
+ int wp_cuda_stream_get_priority(void* stream)
2538
2538
  {
2539
2539
  int priority = 0;
2540
2540
  check_cuda(cuStreamGetPriority_f(static_cast<CUstream>(stream), &priority));
@@ -2542,7 +2542,7 @@ int cuda_stream_get_priority(void* stream)
2542
2542
  return priority;
2543
2543
  }
2544
2544
 
2545
- void* cuda_event_create(void* context, unsigned flags)
2545
+ void* wp_cuda_event_create(void* context, unsigned flags)
2546
2546
  {
2547
2547
  ContextGuard guard(context, true);
2548
2548
 
@@ -2553,12 +2553,12 @@ void* cuda_event_create(void* context, unsigned flags)
2553
2553
  return NULL;
2554
2554
  }
2555
2555
 
2556
- void cuda_event_destroy(void* event)
2556
+ void wp_cuda_event_destroy(void* event)
2557
2557
  {
2558
2558
  check_cu(cuEventDestroy_f(static_cast<CUevent>(event)));
2559
2559
  }
2560
2560
 
2561
- int cuda_event_query(void* event)
2561
+ int wp_cuda_event_query(void* event)
2562
2562
  {
2563
2563
  CUresult res = cuEventQuery_f(static_cast<CUevent>(event));
2564
2564
 
@@ -2571,9 +2571,9 @@ int cuda_event_query(void* event)
2571
2571
  return res;
2572
2572
  }
2573
2573
 
2574
- void cuda_event_record(void* event, void* stream, bool timing)
2574
+ void wp_cuda_event_record(void* event, void* stream, bool timing)
2575
2575
  {
2576
- if (timing && !g_captures.empty() && cuda_stream_is_capturing(stream))
2576
+ if (timing && !g_captures.empty() && wp_cuda_stream_is_capturing(stream))
2577
2577
  {
2578
2578
  // record timing event during graph capture
2579
2579
  check_cu(cuEventRecordWithFlags_f(static_cast<CUevent>(event), static_cast<CUstream>(stream), CU_EVENT_RECORD_EXTERNAL));
@@ -2584,12 +2584,12 @@ void cuda_event_record(void* event, void* stream, bool timing)
2584
2584
  }
2585
2585
  }
2586
2586
 
2587
- void cuda_event_synchronize(void* event)
2587
+ void wp_cuda_event_synchronize(void* event)
2588
2588
  {
2589
2589
  check_cu(cuEventSynchronize_f(static_cast<CUevent>(event)));
2590
2590
  }
2591
2591
 
2592
- float cuda_event_elapsed_time(void* start_event, void* end_event)
2592
+ float wp_cuda_event_elapsed_time(void* start_event, void* end_event)
2593
2593
  {
2594
2594
  float elapsed = 0.0f;
2595
2595
  cudaEvent_t start = static_cast<cudaEvent_t>(start_event);
@@ -2598,7 +2598,7 @@ float cuda_event_elapsed_time(void* start_event, void* end_event)
2598
2598
  return elapsed;
2599
2599
  }
2600
2600
 
2601
- bool cuda_graph_begin_capture(void* context, void* stream, int external)
2601
+ bool wp_cuda_graph_begin_capture(void* context, void* stream, int external)
2602
2602
  {
2603
2603
  ContextGuard guard(context);
2604
2604
 
@@ -2645,7 +2645,7 @@ bool cuda_graph_begin_capture(void* context, void* stream, int external)
2645
2645
  return true;
2646
2646
  }
2647
2647
 
2648
- bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
2648
+ bool wp_cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
2649
2649
  {
2650
2650
  ContextGuard guard(context);
2651
2651
 
@@ -2780,14 +2780,14 @@ bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
2780
2780
  return true;
2781
2781
  }
2782
2782
 
2783
- bool capture_debug_dot_print(void* graph, const char *path, uint32_t flags)
2783
+ bool wp_capture_debug_dot_print(void* graph, const char *path, uint32_t flags)
2784
2784
  {
2785
2785
  if (!check_cuda(cudaGraphDebugDotPrint((cudaGraph_t)graph, path, flags)))
2786
2786
  return false;
2787
2787
  return true;
2788
2788
  }
2789
2789
 
2790
- bool cuda_graph_create_exec(void* context, void* stream, void* graph, void** graph_exec_ret)
2790
+ bool wp_cuda_graph_create_exec(void* context, void* stream, void* graph, void** graph_exec_ret)
2791
2791
  {
2792
2792
  ContextGuard guard(context);
2793
2793
 
@@ -2940,7 +2940,7 @@ static CUfunction get_conditional_kernel(void* context, const char* name)
2940
2940
  return kernel;
2941
2941
  }
2942
2942
 
2943
- bool cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
2943
+ bool wp_cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
2944
2944
  {
2945
2945
  ContextGuard guard(context);
2946
2946
 
@@ -2950,7 +2950,7 @@ bool cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
2950
2950
  return true;
2951
2951
  }
2952
2952
 
2953
- bool cuda_graph_resume_capture(void* context, void* stream, void* graph)
2953
+ bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
2954
2954
  {
2955
2955
  ContextGuard guard(context);
2956
2956
 
@@ -2976,7 +2976,7 @@ bool cuda_graph_resume_capture(void* context, void* stream, void* graph)
2976
2976
  // https://developer.nvidia.com/blog/dynamic-control-flow-in-cuda-graphs-with-conditional-nodes/
2977
2977
  // condition is a gpu pointer
2978
2978
  // if_graph_ret and else_graph_ret should be NULL if not needed
2979
- bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
2979
+ bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
2980
2980
  {
2981
2981
  bool has_if = if_graph_ret != NULL;
2982
2982
  bool has_else = else_graph_ret != NULL;
@@ -2991,21 +2991,21 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
2991
2991
  CUstream cuda_stream = static_cast<CUstream>(stream);
2992
2992
 
2993
2993
  // Get the current stream capturing graph
2994
- cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone;
2994
+ CUstreamCaptureStatus capture_status = CU_STREAM_CAPTURE_STATUS_NONE;
2995
2995
  cudaGraph_t cuda_graph = NULL;
2996
2996
  const cudaGraphNode_t* capture_deps = NULL;
2997
2997
  size_t dep_count = 0;
2998
- if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
2998
+ if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
2999
2999
  return false;
3000
3000
 
3001
3001
  // abort if not capturing
3002
- if (!cuda_graph || capture_status != cudaStreamCaptureStatusActive)
3002
+ if (!cuda_graph || capture_status != CU_STREAM_CAPTURE_STATUS_ACTIVE)
3003
3003
  {
3004
3004
  wp::set_error_string("Stream is not capturing");
3005
3005
  return false;
3006
3006
  }
3007
3007
 
3008
- //int driver_version = cuda_driver_version();
3008
+ //int driver_version = wp_cuda_driver_version();
3009
3009
 
3010
3010
  // IF-ELSE nodes are only supported with CUDA 12.8+
3011
3011
  // Somehow child graphs produce wrong results when an else branch is used
@@ -3013,7 +3013,7 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
3013
3013
  if (num_branches == 1 /*|| driver_version >= 12080*/)
3014
3014
  {
3015
3015
  cudaGraphConditionalHandle handle;
3016
- cudaGraphConditionalHandleCreate(&handle, cuda_graph);
3016
+ check_cuda(cudaGraphConditionalHandleCreate(&handle, cuda_graph));
3017
3017
 
3018
3018
  // run a kernel to set the condition handle from the condition pointer
3019
3019
  // (need to negate the condition if only the else branch is used)
@@ -3033,22 +3033,23 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
3033
3033
  kernel_args[0] = &handle;
3034
3034
  kernel_args[1] = &condition;
3035
3035
 
3036
- if (!check_cuda(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
3036
+ if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
3037
3037
  return false;
3038
3038
 
3039
- if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
3039
+ if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
3040
3040
  return false;
3041
3041
 
3042
3042
  // create conditional node
3043
- cudaGraphNode_t condition_node;
3044
- cudaGraphNodeParams condition_params = { cudaGraphNodeTypeConditional };
3043
+ CUgraphNode condition_node;
3044
+ CUgraphNodeParams condition_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
3045
3045
  condition_params.conditional.handle = handle;
3046
- condition_params.conditional.type = cudaGraphCondTypeIf;
3046
+ condition_params.conditional.type = CU_GRAPH_COND_TYPE_IF;
3047
3047
  condition_params.conditional.size = num_branches;
3048
- if (!check_cuda(cudaGraphAddNode(&condition_node, cuda_graph, capture_deps, dep_count, &condition_params)))
3048
+ condition_params.conditional.ctx = get_current_context();
3049
+ if (!check_cu(cuGraphAddNode_f(&condition_node, cuda_graph, capture_deps, NULL, dep_count, &condition_params)))
3049
3050
  return false;
3050
3051
 
3051
- if (!check_cuda(cudaStreamUpdateCaptureDependencies(cuda_stream, &condition_node, 1, cudaStreamSetCaptureDependencies)))
3052
+ if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &condition_node, 1, cudaStreamSetCaptureDependencies)))
3052
3053
  return false;
3053
3054
 
3054
3055
  if (num_branches == 1)
@@ -3068,8 +3069,8 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
3068
3069
  {
3069
3070
  // Create IF node followed by an additional IF node with negated condition
3070
3071
  cudaGraphConditionalHandle if_handle, else_handle;
3071
- cudaGraphConditionalHandleCreate(&if_handle, cuda_graph);
3072
- cudaGraphConditionalHandleCreate(&else_handle, cuda_graph);
3072
+ check_cuda(cudaGraphConditionalHandleCreate(&if_handle, cuda_graph));
3073
+ check_cuda(cudaGraphConditionalHandleCreate(&else_handle, cuda_graph));
3073
3074
 
3074
3075
  CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_else_handles_kernel");
3075
3076
  if (!kernel)
@@ -3086,26 +3087,28 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
3086
3087
  if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
3087
3088
  return false;
3088
3089
 
3089
- if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
3090
+ if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
3090
3091
  return false;
3091
3092
 
3092
- cudaGraphNode_t if_node;
3093
- cudaGraphNodeParams if_params = { cudaGraphNodeTypeConditional };
3093
+ CUgraphNode if_node;
3094
+ CUgraphNodeParams if_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
3094
3095
  if_params.conditional.handle = if_handle;
3095
- if_params.conditional.type = cudaGraphCondTypeIf;
3096
+ if_params.conditional.type = CU_GRAPH_COND_TYPE_IF;
3096
3097
  if_params.conditional.size = 1;
3097
- if (!check_cuda(cudaGraphAddNode(&if_node, cuda_graph, capture_deps, dep_count, &if_params)))
3098
+ if_params.conditional.ctx = get_current_context();
3099
+ if (!check_cu(cuGraphAddNode_f(&if_node, cuda_graph, capture_deps, NULL, dep_count, &if_params)))
3098
3100
  return false;
3099
3101
 
3100
- cudaGraphNode_t else_node;
3101
- cudaGraphNodeParams else_params = { cudaGraphNodeTypeConditional };
3102
+ CUgraphNode else_node;
3103
+ CUgraphNodeParams else_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
3102
3104
  else_params.conditional.handle = else_handle;
3103
- else_params.conditional.type = cudaGraphCondTypeIf;
3105
+ else_params.conditional.type = CU_GRAPH_COND_TYPE_IF;
3104
3106
  else_params.conditional.size = 1;
3105
- if (!check_cuda(cudaGraphAddNode(&else_node, cuda_graph, &if_node, 1, &else_params)))
3107
+ else_params.conditional.ctx = get_current_context();
3108
+ if (!check_cu(cuGraphAddNode_f(&else_node, cuda_graph, &if_node, NULL, 1, &else_params)))
3106
3109
  return false;
3107
3110
 
3108
- if (!check_cuda(cudaStreamUpdateCaptureDependencies(cuda_stream, &else_node, 1, cudaStreamSetCaptureDependencies)))
3111
+ if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &else_node, 1, cudaStreamSetCaptureDependencies)))
3109
3112
  return false;
3110
3113
 
3111
3114
  *if_graph_ret = if_params.conditional.phGraph_out[0];
@@ -3115,21 +3118,143 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
3115
3118
  return true;
3116
3119
  }
3117
3120
 
3118
- bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
3121
+ // graph node type names for intelligible error reporting
3122
+ static const char* get_graph_node_type_name(CUgraphNodeType type)
3123
+ {
3124
+ static const std::unordered_map<CUgraphNodeType, const char*> names
3125
+ {
3126
+ {CU_GRAPH_NODE_TYPE_KERNEL, "kernel launch"},
3127
+ {CU_GRAPH_NODE_TYPE_MEMCPY, "memcpy"},
3128
+ {CU_GRAPH_NODE_TYPE_MEMSET, "memset"},
3129
+ {CU_GRAPH_NODE_TYPE_HOST, "host execution"},
3130
+ {CU_GRAPH_NODE_TYPE_GRAPH, "graph launch"},
3131
+ {CU_GRAPH_NODE_TYPE_EMPTY, "empty node"},
3132
+ {CU_GRAPH_NODE_TYPE_WAIT_EVENT, "event wait"},
3133
+ {CU_GRAPH_NODE_TYPE_EVENT_RECORD, "event record"},
3134
+ {CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL, "semaphore signal"},
3135
+ {CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT, "semaphore wait"},
3136
+ {CU_GRAPH_NODE_TYPE_MEM_ALLOC, "memory allocation"},
3137
+ {CU_GRAPH_NODE_TYPE_MEM_FREE, "memory deallocation"},
3138
+ {CU_GRAPH_NODE_TYPE_BATCH_MEM_OP, "batched mem op"},
3139
+ {CU_GRAPH_NODE_TYPE_CONDITIONAL, "conditional node"},
3140
+ };
3141
+
3142
+ auto it = names.find(type);
3143
+ if (it != names.end())
3144
+ return it->second;
3145
+ else
3146
+ return "unknown node";
3147
+ }
3148
+
3149
+ // check if a graph can be launched as a child graph
3150
+ static bool is_valid_child_graph(void* child_graph)
3119
3151
  {
3152
+ // disallowed child graph nodes according to the documentation of cuGraphAddChildGraphNode()
3153
+ static const std::unordered_set<CUgraphNodeType> disallowed_nodes
3154
+ {
3155
+ CU_GRAPH_NODE_TYPE_MEM_ALLOC,
3156
+ CU_GRAPH_NODE_TYPE_MEM_FREE,
3157
+ CU_GRAPH_NODE_TYPE_CONDITIONAL,
3158
+ };
3159
+
3160
+ if (!child_graph)
3161
+ {
3162
+ wp::set_error_string("Child graph is null");
3163
+ return false;
3164
+ }
3165
+
3166
+ size_t num_nodes = 0;
3167
+ if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)child_graph, NULL, &num_nodes)))
3168
+ return false;
3169
+ std::vector<cudaGraphNode_t> nodes(num_nodes);
3170
+ if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)child_graph, nodes.data(), &num_nodes)))
3171
+ return false;
3172
+
3173
+ for (size_t i = 0; i < num_nodes; i++)
3174
+ {
3175
+ // note: we use the driver API to get the node type, otherwise some nodes are not recognized correctly
3176
+ CUgraphNodeType node_type;
3177
+ check_cu(cuGraphNodeGetType_f(nodes[i], &node_type));
3178
+ auto it = disallowed_nodes.find(node_type);
3179
+ if (it != disallowed_nodes.end())
3180
+ {
3181
+ wp::set_error_string("Child graph contains an unsupported operation (%s)", get_graph_node_type_name(node_type));
3182
+ return false;
3183
+ }
3184
+ }
3185
+
3186
+ return true;
3187
+ }
3188
+
3189
+ // check if a graph can be used as a conditional body graph
3190
+ // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#condtional-node-body-graph-requirements
3191
+ bool wp_cuda_graph_check_conditional_body(void* body_graph)
3192
+ {
3193
+ static const std::unordered_set<CUgraphNodeType> allowed_nodes
3194
+ {
3195
+ CU_GRAPH_NODE_TYPE_MEMCPY,
3196
+ CU_GRAPH_NODE_TYPE_MEMSET,
3197
+ CU_GRAPH_NODE_TYPE_KERNEL,
3198
+ CU_GRAPH_NODE_TYPE_GRAPH,
3199
+ CU_GRAPH_NODE_TYPE_EMPTY,
3200
+ CU_GRAPH_NODE_TYPE_CONDITIONAL,
3201
+ };
3202
+
3203
+ if (!body_graph)
3204
+ {
3205
+ wp::set_error_string("Conditional body graph is null");
3206
+ return false;
3207
+ }
3208
+
3209
+ size_t num_nodes = 0;
3210
+ if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)body_graph, NULL, &num_nodes)))
3211
+ return false;
3212
+ std::vector<cudaGraphNode_t> nodes(num_nodes);
3213
+ if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)body_graph, nodes.data(), &num_nodes)))
3214
+ return false;
3215
+
3216
+ for (size_t i = 0; i < num_nodes; i++)
3217
+ {
3218
+ // note: we use the driver API to get the node type, otherwise some nodes are not recognized correctly
3219
+ CUgraphNodeType node_type;
3220
+ check_cu(cuGraphNodeGetType_f(nodes[i], &node_type));
3221
+ if (allowed_nodes.find(node_type) == allowed_nodes.end())
3222
+ {
3223
+ wp::set_error_string("Conditional body graph contains an unsupported operation (%s)", get_graph_node_type_name(node_type));
3224
+ return false;
3225
+ }
3226
+ else if (node_type == CU_GRAPH_NODE_TYPE_GRAPH)
3227
+ {
3228
+ // check nested child graphs recursively
3229
+ cudaGraph_t child_graph = NULL;
3230
+ if (!check_cuda(cudaGraphChildGraphNodeGetGraph(nodes[i], &child_graph)))
3231
+ return false;
3232
+ if (!wp_cuda_graph_check_conditional_body(child_graph))
3233
+ return false;
3234
+ }
3235
+ }
3236
+
3237
+ return true;
3238
+ }
3239
+
3240
+ bool wp_cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
3241
+ {
3242
+ if (!is_valid_child_graph(child_graph))
3243
+ return false;
3244
+
3120
3245
  ContextGuard guard(context);
3121
3246
 
3122
3247
  CUstream cuda_stream = static_cast<CUstream>(stream);
3123
3248
 
3124
3249
  // Get the current stream capturing graph
3125
- cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone;
3250
+ CUstreamCaptureStatus capture_status = CU_STREAM_CAPTURE_STATUS_NONE;
3126
3251
  void* cuda_graph = NULL;
3127
- const cudaGraphNode_t* capture_deps = NULL;
3252
+ const CUgraphNode* capture_deps = NULL;
3128
3253
  size_t dep_count = 0;
3129
- if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, (cudaGraph_t*)&cuda_graph, &capture_deps, &dep_count)))
3254
+ if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, (cudaGraph_t*)&cuda_graph, &capture_deps, &dep_count)))
3130
3255
  return false;
3131
3256
 
3132
- if (!cuda_graph_pause_capture(context, cuda_stream, &cuda_graph))
3257
+ if (!wp_cuda_graph_pause_capture(context, cuda_stream, &cuda_graph))
3133
3258
  return false;
3134
3259
 
3135
3260
  cudaGraphNode_t body_node;
@@ -3139,16 +3264,16 @@ bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_grap
3139
3264
  static_cast<cudaGraph_t>(child_graph))))
3140
3265
  return false;
3141
3266
 
3142
- if (!cuda_graph_resume_capture(context, cuda_stream, cuda_graph))
3267
+ if (!wp_cuda_graph_resume_capture(context, cuda_stream, cuda_graph))
3143
3268
  return false;
3144
3269
 
3145
- if (!check_cuda(cudaStreamUpdateCaptureDependencies(cuda_stream, &body_node, 1, cudaStreamSetCaptureDependencies)))
3270
+ if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &body_node, 1, cudaStreamSetCaptureDependencies)))
3146
3271
  return false;
3147
3272
 
3148
3273
  return true;
3149
3274
  }
3150
3275
 
-bool cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
+bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
 {
     // if there's no body, it's a no-op
     if (!body_graph_ret)
@@ -3159,15 +3284,15 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
     CUstream cuda_stream = static_cast<CUstream>(stream);
 
     // Get the current stream capturing graph
-    cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone;
+    CUstreamCaptureStatus capture_status = CU_STREAM_CAPTURE_STATUS_NONE;
     cudaGraph_t cuda_graph = NULL;
     const cudaGraphNode_t* capture_deps = NULL;
     size_t dep_count = 0;
-    if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
+    if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
         return false;
 
     // abort if not capturing
-    if (!cuda_graph || capture_status != cudaStreamCaptureStatusActive)
+    if (!cuda_graph || capture_status != CU_STREAM_CAPTURE_STATUS_ACTIVE)
     {
         wp::set_error_string("Stream is not capturing");
         return false;
@@ -3192,19 +3317,20 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
     if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
         return false;
 
-    if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
+    if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
         return false;
 
     // insert conditional graph node
-    cudaGraphNode_t while_node;
-    cudaGraphNodeParams while_params = { cudaGraphNodeTypeConditional };
+    CUgraphNode while_node;
+    CUgraphNodeParams while_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
     while_params.conditional.handle = handle;
-    while_params.conditional.type = cudaGraphCondTypeWhile;
+    while_params.conditional.type = CU_GRAPH_COND_TYPE_WHILE;
     while_params.conditional.size = 1;
-    if (!check_cuda(cudaGraphAddNode(&while_node, cuda_graph, capture_deps, dep_count, &while_params)))
+    while_params.conditional.ctx = get_current_context();
+    if (!check_cu(cuGraphAddNode_f(&while_node, cuda_graph, capture_deps, NULL, dep_count, &while_params)))
         return false;
 
-    if (!check_cuda(cudaStreamUpdateCaptureDependencies(cuda_stream, &while_node, 1, cudaStreamSetCaptureDependencies)))
+    if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &while_node, 1, cudaStreamSetCaptureDependencies)))
         return false;
 
     *body_graph_ret = while_params.conditional.phGraph_out[0];
@@ -3213,7 +3339,7 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
     return true;
 }
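Before the WHILE node is added, the code captures a launch of a tiny kernel that presumably reads the `condition` int and forwards it to the conditional handle; the loop body can re-launch the same kernel so the condition is re-evaluated on every iteration. A plausible device-side sketch using the public API (the kernel name and signature are assumptions, not Warp's actual kernel):

```cpp
#include <cuda_runtime.h>

// Hypothetical device kernel: publish an int flag to a conditional handle.
// cudaGraphSetConditional is the public device-side API (CUDA 12.3+).
__global__ void wp_set_conditional_from_flag(cudaGraphConditionalHandle handle,
                                             const int* condition)
{
    // nonzero -> execute (or repeat) the conditional body graph
    cudaGraphSetConditional(handle, *condition ? 1u : 0u);
}
```

Under that reading, `wp_cuda_graph_set_condition` below only needs to (re)launch such a kernel on the capturing stream, passing the handle and the device pointer as kernel arguments.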
 
-bool cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
+bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
 {
     ContextGuard guard(context);
 
@@ -3240,37 +3366,43 @@ bool cuda_graph_set_condition(void* context, void* stream, int* condition, uint6
 #else
 // stubs for conditional graph node API if CUDA toolkit is too old.
 
-bool cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
+bool wp_cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
 }
 
-bool cuda_graph_resume_capture(void* context, void* stream, void* graph)
+bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
 }
 
-bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
+bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
 }
 
-bool cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
+bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
 }
 
-bool cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
+bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
 }
 
-bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
+bool wp_cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
+{
+    wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
+    return false;
+}
+
+bool wp_cuda_graph_check_conditional_body(void* body_graph)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
@@ -3279,7 +3411,7 @@ bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_grap
 #endif // support for conditional graph nodes
 
 
-bool cuda_graph_launch(void* graph_exec, void* stream)
+bool wp_cuda_graph_launch(void* graph_exec, void* stream)
 {
     // TODO: allow naming graphs?
     begin_cuda_range(WP_TIMING_GRAPH, stream, get_stream_context(stream), "graph");
@@ -3291,14 +3423,14 @@ bool cuda_graph_launch(void* graph_exec, void* stream)
     return result;
 }
 
-bool cuda_graph_destroy(void* context, void* graph)
+bool wp_cuda_graph_destroy(void* context, void* graph)
 {
     ContextGuard guard(context);
 
     return check_cuda(cudaGraphDestroy((cudaGraph_t)graph));
 }
 
-bool cuda_graph_exec_destroy(void* context, void* graph_exec)
+bool wp_cuda_graph_exec_destroy(void* context, void* graph_exec)
 {
     ContextGuard guard(context);
 
@@ -3350,7 +3482,7 @@ bool write_file(const char* data, size_t size, std::string filename, const char*
 }
 #endif
 
-size_t cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, bool lineinfo, bool compile_time_trace, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes, int* ltoir_input_types)
+size_t wp_cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, bool lineinfo, bool compile_time_trace, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes, int* ltoir_input_types)
 {
     // use file extension to determine whether to output PTX or CUBIN
     const char* output_ext = strrchr(output_path, '.');
@@ -3406,9 +3538,9 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
     {
         opts.push_back("--define-macro=_DEBUG");
         opts.push_back("--generate-line-info");
-
-        // disabling since it causes issues with `Unresolved extern function 'cudaGetParameterBufferV2'
-        //opts.push_back("--device-debug");
+#ifndef _WIN32
+        opts.push_back("--device-debug"); // -G
+#endif
     }
     else
     {
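The change above re-enables full device-side debug info (`--device-debug`, i.e. `-G`) for debug builds on non-Windows platforms, alongside line info. A minimal sketch of feeding such options to NVRTC (the program source and exact option set here are illustrative, not Warp's):

```cpp
#include <nvrtc.h>
#include <vector>
#include <cstdio>

// Compile a trivial kernel with debug options, mirroring the option handling
// in the diff above. Returns true on success.
static bool compile_debug_kernel()
{
    const char* src = "extern \"C\" __global__ void k(int* x) { x[0] = 42; }";

    nvrtcProgram prog;
    if (nvrtcCreateProgram(&prog, src, "k.cu", 0, nullptr, nullptr) != NVRTC_SUCCESS)
        return false;

    std::vector<const char*> opts;
    opts.push_back("--define-macro=_DEBUG");
    opts.push_back("--generate-line-info");
#ifndef _WIN32
    opts.push_back("--device-debug");  // -G: full device debug info
#endif

    nvrtcResult res = nvrtcCompileProgram(prog, (int)opts.size(), opts.data());

    // always dump the log; it explains failures and warnings
    size_t log_size = 0;
    nvrtcGetProgramLogSize(prog, &log_size);
    if (log_size > 1)
    {
        std::vector<char> log(log_size);
        nvrtcGetProgramLog(prog, log.data());
        fprintf(stderr, "%s\n", log.data());
    }

    nvrtcDestroyProgram(&prog);
    return res == NVRTC_SUCCESS;
}
```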
@@ -3678,7 +3810,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
     }
 }
 
-bool cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size)
+bool wp_cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size)
 {
 
     CHECK_ANY(ltoir_output_path != nullptr);
@@ -3724,7 +3856,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
     return res;
 }
 
-bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int arrangement_A, int arrangement_B, int arrangement_C, int num_threads)
+bool wp_cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int arrangement_A, int arrangement_B, int arrangement_C, int num_threads)
 {
 
     CHECK_ANY(ltoir_output_path != nullptr);
@@ -3769,7 +3901,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
     return res;
 }
 
-bool cuda_compile_solver(const char* fatbin_output_path, const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int NRHS, int function, int side, int diag, int precision, int arrangement_A, int arrangement_B, int fill_mode, int num_threads)
+bool wp_cuda_compile_solver(const char* fatbin_output_path, const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int NRHS, int function, int side, int diag, int precision, int arrangement_A, int arrangement_B, int fill_mode, int num_threads)
 {
 
     CHECK_ANY(ltoir_output_path != nullptr);
@@ -3832,7 +3964,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
 
 #endif
 
-void* cuda_load_module(void* context, const char* path)
+void* wp_cuda_load_module(void* context, const char* path)
 {
     ContextGuard guard(context);
 
@@ -3951,7 +4083,7 @@ void* cuda_load_module(void* context, const char* path)
     return module;
 }
 
-void cuda_unload_module(void* context, void* module)
+void wp_cuda_unload_module(void* context, void* module)
 {
     // ensure there are no graph captures in progress
     if (g_captures.empty())
@@ -3970,7 +4102,7 @@ void cuda_unload_module(void* context, void* module)
 }
 
 
-int cuda_get_max_shared_memory(void* context)
+int wp_cuda_get_max_shared_memory(void* context)
 {
     ContextInfo* info = get_context_info(context);
     if (!info)
@@ -3980,7 +4112,7 @@ int cuda_get_max_shared_memory(void* context)
     return max_smem_bytes;
 }
 
-bool cuda_configure_kernel_shared_memory(void* kernel, int size)
+bool wp_cuda_configure_kernel_shared_memory(void* kernel, int size)
 {
     int requested_smem_bytes = size;
 
@@ -3992,7 +4124,7 @@ bool cuda_configure_kernel_shared_memory(void* kernel, int size)
     return true;
 }
 
-void* cuda_get_kernel(void* context, void* module, const char* name)
+void* wp_cuda_get_kernel(void* context, void* module, const char* name)
 {
     ContextGuard guard(context);
 
@@ -4007,7 +4139,7 @@ void* cuda_get_kernel(void* context, void* module, const char* name)
     return kernel;
 }
 
-size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, int block_dim, int shared_memory_bytes, void** args, void* stream)
+size_t wp_cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, int block_dim, int shared_memory_bytes, void** args, void* stream)
 {
     ContextGuard guard(context);
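`wp_cuda_launch_kernel` takes a flat thread count `dim` plus a block size and an optional block cap, so a grid size must be derived before the driver-API launch. A hedged sketch of one common derivation (the clamping policy below is an assumption for illustration, not necessarily Warp's exact logic):

```cpp
#include <cuda.h>
#include <algorithm>

// Illustrative 1D launch helper: `dim` threads total, `block_dim` threads per
// block, grid optionally capped at `max_blocks` (0 means no cap).
static CUresult launch_1d(CUfunction kernel, size_t dim, int max_blocks,
                          int block_dim, int shared_bytes, void** args, CUstream stream)
{
    if (dim == 0)
        return CUDA_SUCCESS;  // nothing to do

    size_t grid = (dim + block_dim - 1) / block_dim;   // ceil(dim / block_dim)
    if (max_blocks > 0)
        grid = std::min(grid, (size_t)max_blocks);     // capped grid implies a strided kernel

    return cuLaunchKernel(kernel,
                          (unsigned)grid, 1, 1,        // grid dims
                          (unsigned)block_dim, 1, 1,   // block dims
                          (unsigned)shared_bytes,      // dynamic shared memory
                          stream, args, nullptr);
}
```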
 
@@ -4061,21 +4193,21 @@ size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_block
     return res;
 }
 
-void cuda_graphics_map(void* context, void* resource)
+void wp_cuda_graphics_map(void* context, void* resource)
 {
     ContextGuard guard(context);
 
     check_cu(cuGraphicsMapResources_f(1, (CUgraphicsResource*)resource, get_current_stream()));
 }
 
-void cuda_graphics_unmap(void* context, void* resource)
+void wp_cuda_graphics_unmap(void* context, void* resource)
 {
     ContextGuard guard(context);
 
     check_cu(cuGraphicsUnmapResources_f(1, (CUgraphicsResource*)resource, get_current_stream()));
 }
 
-void cuda_graphics_device_ptr_and_size(void* context, void* resource, uint64_t* ptr, size_t* size)
+void wp_cuda_graphics_device_ptr_and_size(void* context, void* resource, uint64_t* ptr, size_t* size)
 {
     ContextGuard guard(context);
 
@@ -4087,7 +4219,7 @@ void cuda_graphics_device_ptr_and_size(void* context, void* resource, uint64_t*
     *size = bytes;
 }
 
-void* cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsigned int flags)
+void* wp_cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsigned int flags)
 {
     ContextGuard guard(context);
 
@@ -4102,7 +4234,7 @@ void* cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsign
     return resource;
 }
 
-void cuda_graphics_unregister_resource(void* context, void* resource)
+void wp_cuda_graphics_unregister_resource(void* context, void* resource)
 {
     ContextGuard guard(context);
 
@@ -4111,25 +4243,25 @@ void cuda_graphics_unregister_resource(void* context, void* resource)
     delete res;
 }
 
-void cuda_timing_begin(int flags)
+void wp_cuda_timing_begin(int flags)
 {
     g_cuda_timing_state = new CudaTimingState(flags, g_cuda_timing_state);
 }
 
-int cuda_timing_get_result_count()
+int wp_cuda_timing_get_result_count()
 {
     if (g_cuda_timing_state)
         return int(g_cuda_timing_state->ranges.size());
     return 0;
 }
 
-void cuda_timing_end(timing_result_t* results, int size)
+void wp_cuda_timing_end(timing_result_t* results, int size)
 {
     if (!g_cuda_timing_state)
         return;
 
     // number of results to write to the user buffer
-    int count = std::min(cuda_timing_get_result_count(), size);
+    int count = std::min(wp_cuda_timing_get_result_count(), size);
 
     // compute timings and write results
     for (int i = 0; i < count; i++)
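`wp_cuda_timing_begin` pushes a fresh `CudaTimingState` that links to the previous one, so timing captures nest, and `wp_cuda_timing_end` clamps the result count to the caller's buffer before writing. A small standalone sketch of that linked-stack pattern (the type and its payload below are illustrative stand-ins, not Warp's actual structures):

```cpp
#include <vector>
#include <algorithm>

// Illustrative stand-in for a nestable timing scope: each begin() pushes a
// state that remembers its parent; end() pops back to the parent.
struct TimingState
{
    int flags;
    std::vector<float> ranges;   // collected timings (placeholder payload)
    TimingState* parent;

    TimingState(int flags, TimingState* parent) : flags(flags), parent(parent) {}
};

static TimingState* g_timing = nullptr;

static void timing_begin(int flags)
{
    g_timing = new TimingState(flags, g_timing);  // push
}

static int timing_end(float* results, int size)
{
    if (!g_timing)
        return 0;

    // clamp to the caller's buffer, exactly like std::min(count, size) above
    int count = std::min((int)g_timing->ranges.size(), size);
    for (int i = 0; i < count; i++)
        results[i] = g_timing->ranges[i];

    TimingState* parent = g_timing->parent;       // pop
    delete g_timing;
    g_timing = parent;
    return count;
}
```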
@@ -4163,7 +4295,6 @@ void cuda_timing_end(timing_result_t* results, int size)
 #include "reduce.cu"
 #include "runlength_encode.cu"
 #include "scan.cu"
-#include "marching.cu"
 #include "sparse.cu"
 #include "volume.cu"
 #include "volume_builder.cu"