warp-lang 1.8.1__py3-none-macosx_10_13_universal2.whl → 1.9.0__py3-none-macosx_10_13_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +282 -103
- warp/__init__.pyi +482 -110
- warp/bin/libwarp-clang.dylib +0 -0
- warp/bin/libwarp.dylib +0 -0
- warp/build.py +93 -30
- warp/build_dll.py +47 -67
- warp/builtins.py +955 -137
- warp/codegen.py +312 -206
- warp/config.py +1 -1
- warp/context.py +1249 -784
- warp/examples/core/example_marching_cubes.py +1 -0
- warp/examples/core/example_render_opengl.py +100 -3
- warp/examples/fem/example_apic_fluid.py +98 -52
- warp/examples/fem/example_convection_diffusion_dg.py +25 -4
- warp/examples/fem/example_diffusion_mgpu.py +8 -3
- warp/examples/fem/utils.py +68 -22
- warp/fabric.py +1 -1
- warp/fem/cache.py +27 -19
- warp/fem/domain.py +2 -2
- warp/fem/field/nodal_field.py +2 -2
- warp/fem/field/virtual.py +264 -166
- warp/fem/geometry/geometry.py +5 -5
- warp/fem/integrate.py +129 -51
- warp/fem/space/restriction.py +4 -0
- warp/fem/space/shape/tet_shape_function.py +3 -10
- warp/jax_experimental/custom_call.py +1 -1
- warp/jax_experimental/ffi.py +2 -1
- warp/marching_cubes.py +708 -0
- warp/native/array.h +99 -4
- warp/native/builtin.h +82 -5
- warp/native/bvh.cpp +64 -28
- warp/native/bvh.cu +58 -58
- warp/native/bvh.h +2 -2
- warp/native/clang/clang.cpp +7 -7
- warp/native/coloring.cpp +8 -2
- warp/native/crt.cpp +2 -2
- warp/native/crt.h +3 -5
- warp/native/cuda_util.cpp +41 -10
- warp/native/cuda_util.h +10 -4
- warp/native/exports.h +1842 -1908
- warp/native/fabric.h +2 -1
- warp/native/hashgrid.cpp +37 -37
- warp/native/hashgrid.cu +2 -2
- warp/native/initializer_array.h +1 -1
- warp/native/intersect.h +2 -2
- warp/native/mat.h +1910 -116
- warp/native/mathdx.cpp +43 -43
- warp/native/mesh.cpp +24 -24
- warp/native/mesh.cu +26 -26
- warp/native/mesh.h +4 -2
- warp/native/nanovdb/GridHandle.h +179 -12
- warp/native/nanovdb/HostBuffer.h +8 -7
- warp/native/nanovdb/NanoVDB.h +517 -895
- warp/native/nanovdb/NodeManager.h +323 -0
- warp/native/nanovdb/PNanoVDB.h +2 -2
- warp/native/quat.h +331 -14
- warp/native/range.h +7 -1
- warp/native/reduce.cpp +10 -10
- warp/native/reduce.cu +13 -14
- warp/native/runlength_encode.cpp +2 -2
- warp/native/runlength_encode.cu +5 -5
- warp/native/scan.cpp +3 -3
- warp/native/scan.cu +4 -4
- warp/native/sort.cpp +10 -10
- warp/native/sort.cu +22 -22
- warp/native/sparse.cpp +8 -8
- warp/native/sparse.cu +13 -13
- warp/native/spatial.h +366 -17
- warp/native/temp_buffer.h +2 -2
- warp/native/tile.h +283 -69
- warp/native/vec.h +381 -14
- warp/native/volume.cpp +54 -54
- warp/native/volume.cu +1 -1
- warp/native/volume.h +2 -1
- warp/native/volume_builder.cu +30 -37
- warp/native/warp.cpp +150 -149
- warp/native/warp.cu +323 -192
- warp/native/warp.h +227 -226
- warp/optim/linear.py +736 -271
- warp/render/imgui_manager.py +289 -0
- warp/render/render_opengl.py +85 -6
- warp/sim/graph_coloring.py +2 -2
- warp/sparse.py +558 -175
- warp/tests/aux_test_module_aot.py +7 -0
- warp/tests/cuda/test_async.py +3 -3
- warp/tests/cuda/test_conditional_captures.py +101 -0
- warp/tests/geometry/test_marching_cubes.py +233 -12
- warp/tests/sim/test_coloring.py +6 -6
- warp/tests/test_array.py +56 -5
- warp/tests/test_codegen.py +3 -2
- warp/tests/test_context.py +8 -15
- warp/tests/test_enum.py +136 -0
- warp/tests/test_examples.py +2 -2
- warp/tests/test_fem.py +45 -2
- warp/tests/test_fixedarray.py +229 -0
- warp/tests/test_func.py +18 -15
- warp/tests/test_future_annotations.py +7 -5
- warp/tests/test_linear_solvers.py +30 -0
- warp/tests/test_map.py +1 -1
- warp/tests/test_mat.py +1518 -378
- warp/tests/test_mat_assign_copy.py +178 -0
- warp/tests/test_mat_constructors.py +574 -0
- warp/tests/test_module_aot.py +287 -0
- warp/tests/test_print.py +69 -0
- warp/tests/test_quat.py +140 -34
- warp/tests/test_quat_assign_copy.py +145 -0
- warp/tests/test_reload.py +2 -1
- warp/tests/test_sparse.py +71 -0
- warp/tests/test_spatial.py +140 -34
- warp/tests/test_spatial_assign_copy.py +160 -0
- warp/tests/test_struct.py +43 -3
- warp/tests/test_types.py +0 -20
- warp/tests/test_vec.py +179 -34
- warp/tests/test_vec_assign_copy.py +143 -0
- warp/tests/tile/test_tile.py +184 -18
- warp/tests/tile/test_tile_cholesky.py +605 -0
- warp/tests/tile/test_tile_load.py +169 -0
- warp/tests/tile/test_tile_mathdx.py +2 -558
- warp/tests/tile/test_tile_matmul.py +1 -1
- warp/tests/tile/test_tile_mlp.py +1 -1
- warp/tests/tile/test_tile_shared_memory.py +5 -5
- warp/tests/unittest_suites.py +6 -0
- warp/tests/walkthrough_debug.py +1 -1
- warp/thirdparty/unittest_parallel.py +108 -9
- warp/types.py +554 -264
- warp/utils.py +68 -86
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/RECORD +131 -121
- warp/native/marching.cpp +0 -19
- warp/native/marching.cu +0 -514
- warp/native/marching.h +0 -19
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
warp/native/warp.cu
CHANGED
|
@@ -168,7 +168,7 @@ struct ContextInfo
|
|
|
168
168
|
{
|
|
169
169
|
DeviceInfo* device_info = NULL;
|
|
170
170
|
|
|
171
|
-
// the current stream, managed from Python (see
|
|
171
|
+
// the current stream, managed from Python (see wp_cuda_context_set_stream() and wp_cuda_context_get_stream())
|
|
172
172
|
CUstream stream = NULL;
|
|
173
173
|
|
|
174
174
|
// conditional graph node support, loaded on demand if the driver supports it (CUDA 12.4+)
|
|
@@ -237,11 +237,11 @@ static std::unordered_map<CUstream, StreamInfo> g_streams;
|
|
|
237
237
|
|
|
238
238
|
// Ongoing graph captures registered using wp.capture_begin().
|
|
239
239
|
// This maps the capture id to the stream where capture was started.
|
|
240
|
-
// See
|
|
240
|
+
// See wp_cuda_graph_begin_capture(), wp_cuda_graph_end_capture(), and wp_free_device_async().
|
|
241
241
|
static std::unordered_map<uint64_t, CaptureInfo*> g_captures;
|
|
242
242
|
|
|
243
243
|
// Memory allocated during graph capture requires special handling.
|
|
244
|
-
// See
|
|
244
|
+
// See wp_alloc_device_async() and wp_free_device_async().
|
|
245
245
|
static std::unordered_map<void*, GraphAllocInfo> g_graph_allocs;
|
|
246
246
|
|
|
247
247
|
// Memory that cannot be freed immediately gets queued here.
|
|
@@ -252,12 +252,12 @@ static std::vector<FreeInfo> g_deferred_free_list;
|
|
|
252
252
|
// Call unload_deferred_modules() to release.
|
|
253
253
|
static std::vector<ModuleInfo> g_deferred_module_list;
|
|
254
254
|
|
|
255
|
-
void
|
|
255
|
+
void wp_cuda_set_context_restore_policy(bool always_restore)
|
|
256
256
|
{
|
|
257
257
|
ContextGuard::always_restore = always_restore;
|
|
258
258
|
}
|
|
259
259
|
|
|
260
|
-
int
|
|
260
|
+
int wp_cuda_get_context_restore_policy()
|
|
261
261
|
{
|
|
262
262
|
return int(ContextGuard::always_restore);
|
|
263
263
|
}
|
|
@@ -348,7 +348,7 @@ static inline CUcontext get_current_context()
|
|
|
348
348
|
|
|
349
349
|
static inline CUstream get_current_stream(void* context=NULL)
|
|
350
350
|
{
|
|
351
|
-
return static_cast<CUstream>(
|
|
351
|
+
return static_cast<CUstream>(wp_cuda_context_get_stream(context));
|
|
352
352
|
}
|
|
353
353
|
|
|
354
354
|
static ContextInfo* get_context_info(CUcontext ctx)
|
|
@@ -481,7 +481,7 @@ static int unload_deferred_modules(void* context = NULL)
|
|
|
481
481
|
const ModuleInfo& module_info = *it;
|
|
482
482
|
if (module_info.context == context || !context)
|
|
483
483
|
{
|
|
484
|
-
|
|
484
|
+
wp_cuda_unload_module(module_info.context, module_info.module);
|
|
485
485
|
++num_unloaded_modules;
|
|
486
486
|
it = g_deferred_module_list.erase(it);
|
|
487
487
|
}
|
|
@@ -535,41 +535,41 @@ static inline const char* get_cuda_kernel_name(void* kernel)
|
|
|
535
535
|
}
|
|
536
536
|
|
|
537
537
|
|
|
538
|
-
void*
|
|
538
|
+
void* wp_alloc_pinned(size_t s)
|
|
539
539
|
{
|
|
540
540
|
void* ptr = NULL;
|
|
541
541
|
check_cuda(cudaMallocHost(&ptr, s));
|
|
542
542
|
return ptr;
|
|
543
543
|
}
|
|
544
544
|
|
|
545
|
-
void
|
|
545
|
+
void wp_free_pinned(void* ptr)
|
|
546
546
|
{
|
|
547
547
|
cudaFreeHost(ptr);
|
|
548
548
|
}
|
|
549
549
|
|
|
550
|
-
void*
|
|
550
|
+
void* wp_alloc_device(void* context, size_t s)
|
|
551
551
|
{
|
|
552
|
-
int ordinal =
|
|
552
|
+
int ordinal = wp_cuda_context_get_device_ordinal(context);
|
|
553
553
|
|
|
554
554
|
// use stream-ordered allocator if available
|
|
555
|
-
if (
|
|
556
|
-
return
|
|
555
|
+
if (wp_cuda_device_is_mempool_supported(ordinal))
|
|
556
|
+
return wp_alloc_device_async(context, s);
|
|
557
557
|
else
|
|
558
|
-
return
|
|
558
|
+
return wp_alloc_device_default(context, s);
|
|
559
559
|
}
|
|
560
560
|
|
|
561
|
-
void
|
|
561
|
+
void wp_free_device(void* context, void* ptr)
|
|
562
562
|
{
|
|
563
|
-
int ordinal =
|
|
563
|
+
int ordinal = wp_cuda_context_get_device_ordinal(context);
|
|
564
564
|
|
|
565
565
|
// use stream-ordered allocator if available
|
|
566
|
-
if (
|
|
567
|
-
|
|
566
|
+
if (wp_cuda_device_is_mempool_supported(ordinal))
|
|
567
|
+
wp_free_device_async(context, ptr);
|
|
568
568
|
else
|
|
569
|
-
|
|
569
|
+
wp_free_device_default(context, ptr);
|
|
570
570
|
}
|
|
571
571
|
|
|
572
|
-
void*
|
|
572
|
+
void* wp_alloc_device_default(void* context, size_t s)
|
|
573
573
|
{
|
|
574
574
|
ContextGuard guard(context);
|
|
575
575
|
|
|
@@ -579,7 +579,7 @@ void* alloc_device_default(void* context, size_t s)
|
|
|
579
579
|
return ptr;
|
|
580
580
|
}
|
|
581
581
|
|
|
582
|
-
void
|
|
582
|
+
void wp_free_device_default(void* context, void* ptr)
|
|
583
583
|
{
|
|
584
584
|
ContextGuard guard(context);
|
|
585
585
|
|
|
@@ -595,7 +595,7 @@ void free_device_default(void* context, void* ptr)
|
|
|
595
595
|
}
|
|
596
596
|
}
|
|
597
597
|
|
|
598
|
-
void*
|
|
598
|
+
void* wp_alloc_device_async(void* context, size_t s)
|
|
599
599
|
{
|
|
600
600
|
// stream-ordered allocations don't rely on the current context,
|
|
601
601
|
// but we set the context here for consistent behaviour
|
|
@@ -613,7 +613,7 @@ void* alloc_device_async(void* context, size_t s)
|
|
|
613
613
|
if (ptr)
|
|
614
614
|
{
|
|
615
615
|
// if the stream is capturing, the allocation requires special handling
|
|
616
|
-
if (
|
|
616
|
+
if (wp_cuda_stream_is_capturing(stream))
|
|
617
617
|
{
|
|
618
618
|
// check if this is a known capture
|
|
619
619
|
uint64_t capture_id = get_capture_id(stream);
|
|
@@ -634,7 +634,7 @@ void* alloc_device_async(void* context, size_t s)
|
|
|
634
634
|
return ptr;
|
|
635
635
|
}
|
|
636
636
|
|
|
637
|
-
void
|
|
637
|
+
void wp_free_device_async(void* context, void* ptr)
|
|
638
638
|
{
|
|
639
639
|
// stream-ordered allocators generally don't rely on the current context,
|
|
640
640
|
// but we set the context here for consistent behaviour
|
|
@@ -732,7 +732,7 @@ void free_device_async(void* context, void* ptr)
|
|
|
732
732
|
}
|
|
733
733
|
}
|
|
734
734
|
|
|
735
|
-
bool
|
|
735
|
+
bool wp_memcpy_h2d(void* context, void* dest, void* src, size_t n, void* stream)
|
|
736
736
|
{
|
|
737
737
|
ContextGuard guard(context);
|
|
738
738
|
|
|
@@ -751,7 +751,7 @@ bool memcpy_h2d(void* context, void* dest, void* src, size_t n, void* stream)
|
|
|
751
751
|
return result;
|
|
752
752
|
}
|
|
753
753
|
|
|
754
|
-
bool
|
|
754
|
+
bool wp_memcpy_d2h(void* context, void* dest, void* src, size_t n, void* stream)
|
|
755
755
|
{
|
|
756
756
|
ContextGuard guard(context);
|
|
757
757
|
|
|
@@ -770,7 +770,7 @@ bool memcpy_d2h(void* context, void* dest, void* src, size_t n, void* stream)
|
|
|
770
770
|
return result;
|
|
771
771
|
}
|
|
772
772
|
|
|
773
|
-
bool
|
|
773
|
+
bool wp_memcpy_d2d(void* context, void* dest, void* src, size_t n, void* stream)
|
|
774
774
|
{
|
|
775
775
|
ContextGuard guard(context);
|
|
776
776
|
|
|
@@ -789,7 +789,7 @@ bool memcpy_d2d(void* context, void* dest, void* src, size_t n, void* stream)
|
|
|
789
789
|
return result;
|
|
790
790
|
}
|
|
791
791
|
|
|
792
|
-
bool
|
|
792
|
+
bool wp_memcpy_p2p(void* dst_context, void* dst, void* src_context, void* src, size_t n, void* stream)
|
|
793
793
|
{
|
|
794
794
|
// ContextGuard guard(context);
|
|
795
795
|
|
|
@@ -809,7 +809,7 @@ bool memcpy_p2p(void* dst_context, void* dst, void* src_context, void* src, size
|
|
|
809
809
|
// because cudaMemPoolGetAccess() cannot be called during graph capture.
|
|
810
810
|
// - CUDA will report error 1 (invalid argument) if cudaMemcpyAsync() is called but mempool access is not enabled.
|
|
811
811
|
|
|
812
|
-
if (!
|
|
812
|
+
if (!wp_cuda_stream_is_capturing(stream))
|
|
813
813
|
{
|
|
814
814
|
begin_cuda_range(WP_TIMING_MEMCPY, cuda_stream, get_stream_context(stream), "memcpy PtoP");
|
|
815
815
|
|
|
@@ -896,7 +896,7 @@ __global__ void memset_kernel(int* dest, int value, size_t n)
|
|
|
896
896
|
}
|
|
897
897
|
}
|
|
898
898
|
|
|
899
|
-
void
|
|
899
|
+
void wp_memset_device(void* context, void* dest, int value, size_t n)
|
|
900
900
|
{
|
|
901
901
|
ContextGuard guard(context);
|
|
902
902
|
|
|
@@ -940,7 +940,7 @@ __global__ void memtile_value_kernel(T* dst, T value, size_t n)
|
|
|
940
940
|
}
|
|
941
941
|
}
|
|
942
942
|
|
|
943
|
-
void
|
|
943
|
+
void wp_memtile_device(void* context, void* dst, const void* src, size_t srcsize, size_t n)
|
|
944
944
|
{
|
|
945
945
|
ContextGuard guard(context);
|
|
946
946
|
|
|
@@ -976,12 +976,12 @@ void memtile_device(void* context, void* dst, const void* src, size_t srcsize, s
|
|
|
976
976
|
|
|
977
977
|
// copy value to device memory
|
|
978
978
|
// TODO: use a persistent stream-local staging buffer to avoid allocs?
|
|
979
|
-
void* src_devptr =
|
|
979
|
+
void* src_devptr = wp_alloc_device(WP_CURRENT_CONTEXT, srcsize);
|
|
980
980
|
check_cuda(cudaMemcpyAsync(src_devptr, src, srcsize, cudaMemcpyHostToDevice, get_current_stream()));
|
|
981
981
|
|
|
982
982
|
wp_launch_device(WP_CURRENT_CONTEXT, memtile_kernel, n, (dst, src_devptr, srcsize, n));
|
|
983
983
|
|
|
984
|
-
|
|
984
|
+
wp_free_device(WP_CURRENT_CONTEXT, src_devptr);
|
|
985
985
|
|
|
986
986
|
}
|
|
987
987
|
}
|
|
@@ -1208,7 +1208,7 @@ static __global__ void array_copy_fabric_indexed_to_fabric_indexed_kernel(wp::in
|
|
|
1208
1208
|
}
|
|
1209
1209
|
|
|
1210
1210
|
|
|
1211
|
-
WP_API bool
|
|
1211
|
+
WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_type, int src_type, int elem_size)
|
|
1212
1212
|
{
|
|
1213
1213
|
if (!src || !dst)
|
|
1214
1214
|
return false;
|
|
@@ -1600,7 +1600,7 @@ static __global__ void array_fill_fabric_indexed_kernel(wp::indexedfabricarray_t
|
|
|
1600
1600
|
}
|
|
1601
1601
|
|
|
1602
1602
|
|
|
1603
|
-
WP_API void
|
|
1603
|
+
WP_API void wp_array_fill_device(void* context, void* arr_ptr, int arr_type, const void* value_ptr, int value_size)
|
|
1604
1604
|
{
|
|
1605
1605
|
if (!arr_ptr || !value_ptr)
|
|
1606
1606
|
return;
|
|
@@ -1656,7 +1656,7 @@ WP_API void array_fill_device(void* context, void* arr_ptr, int arr_type, const
|
|
|
1656
1656
|
|
|
1657
1657
|
// copy value to device memory
|
|
1658
1658
|
// TODO: use a persistent stream-local staging buffer to avoid allocs?
|
|
1659
|
-
void* value_devptr =
|
|
1659
|
+
void* value_devptr = wp_alloc_device(WP_CURRENT_CONTEXT, value_size);
|
|
1660
1660
|
check_cuda(cudaMemcpyAsync(value_devptr, value_ptr, value_size, cudaMemcpyHostToDevice, get_current_stream()));
|
|
1661
1661
|
|
|
1662
1662
|
// handle fabric arrays
|
|
@@ -1714,20 +1714,20 @@ WP_API void array_fill_device(void* context, void* arr_ptr, int arr_type, const
|
|
|
1714
1714
|
return;
|
|
1715
1715
|
}
|
|
1716
1716
|
|
|
1717
|
-
|
|
1717
|
+
wp_free_device(WP_CURRENT_CONTEXT, value_devptr);
|
|
1718
1718
|
}
|
|
1719
1719
|
|
|
1720
|
-
void
|
|
1720
|
+
void wp_array_scan_int_device(uint64_t in, uint64_t out, int len, bool inclusive)
|
|
1721
1721
|
{
|
|
1722
1722
|
scan_device((const int*)in, (int*)out, len, inclusive);
|
|
1723
1723
|
}
|
|
1724
1724
|
|
|
1725
|
-
void
|
|
1725
|
+
void wp_array_scan_float_device(uint64_t in, uint64_t out, int len, bool inclusive)
|
|
1726
1726
|
{
|
|
1727
1727
|
scan_device((const float*)in, (float*)out, len, inclusive);
|
|
1728
1728
|
}
|
|
1729
1729
|
|
|
1730
|
-
int
|
|
1730
|
+
int wp_cuda_driver_version()
|
|
1731
1731
|
{
|
|
1732
1732
|
int version;
|
|
1733
1733
|
if (check_cu(cuDriverGetVersion_f(&version)))
|
|
@@ -1736,17 +1736,17 @@ int cuda_driver_version()
|
|
|
1736
1736
|
return 0;
|
|
1737
1737
|
}
|
|
1738
1738
|
|
|
1739
|
-
int
|
|
1739
|
+
int wp_cuda_toolkit_version()
|
|
1740
1740
|
{
|
|
1741
1741
|
return CUDA_VERSION;
|
|
1742
1742
|
}
|
|
1743
1743
|
|
|
1744
|
-
bool
|
|
1744
|
+
bool wp_cuda_driver_is_initialized()
|
|
1745
1745
|
{
|
|
1746
1746
|
return is_cuda_driver_initialized();
|
|
1747
1747
|
}
|
|
1748
1748
|
|
|
1749
|
-
int
|
|
1749
|
+
int wp_nvrtc_supported_arch_count()
|
|
1750
1750
|
{
|
|
1751
1751
|
int count;
|
|
1752
1752
|
if (check_nvrtc(nvrtcGetNumSupportedArchs(&count)))
|
|
@@ -1755,7 +1755,7 @@ int nvrtc_supported_arch_count()
|
|
|
1755
1755
|
return 0;
|
|
1756
1756
|
}
|
|
1757
1757
|
|
|
1758
|
-
void
|
|
1758
|
+
void wp_nvrtc_supported_archs(int* archs)
|
|
1759
1759
|
{
|
|
1760
1760
|
if (archs)
|
|
1761
1761
|
{
|
|
@@ -1763,14 +1763,14 @@ void nvrtc_supported_archs(int* archs)
|
|
|
1763
1763
|
}
|
|
1764
1764
|
}
|
|
1765
1765
|
|
|
1766
|
-
int
|
|
1766
|
+
int wp_cuda_device_get_count()
|
|
1767
1767
|
{
|
|
1768
1768
|
int count = 0;
|
|
1769
1769
|
check_cu(cuDeviceGetCount_f(&count));
|
|
1770
1770
|
return count;
|
|
1771
1771
|
}
|
|
1772
1772
|
|
|
1773
|
-
void*
|
|
1773
|
+
void* wp_cuda_device_get_primary_context(int ordinal)
|
|
1774
1774
|
{
|
|
1775
1775
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1776
1776
|
{
|
|
@@ -1786,75 +1786,75 @@ void* cuda_device_get_primary_context(int ordinal)
|
|
|
1786
1786
|
return NULL;
|
|
1787
1787
|
}
|
|
1788
1788
|
|
|
1789
|
-
const char*
|
|
1789
|
+
const char* wp_cuda_device_get_name(int ordinal)
|
|
1790
1790
|
{
|
|
1791
1791
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1792
1792
|
return g_devices[ordinal].name;
|
|
1793
1793
|
return NULL;
|
|
1794
1794
|
}
|
|
1795
1795
|
|
|
1796
|
-
int
|
|
1796
|
+
int wp_cuda_device_get_arch(int ordinal)
|
|
1797
1797
|
{
|
|
1798
1798
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1799
1799
|
return g_devices[ordinal].arch;
|
|
1800
1800
|
return 0;
|
|
1801
1801
|
}
|
|
1802
1802
|
|
|
1803
|
-
int
|
|
1803
|
+
int wp_cuda_device_get_sm_count(int ordinal)
|
|
1804
1804
|
{
|
|
1805
1805
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1806
1806
|
return g_devices[ordinal].sm_count;
|
|
1807
1807
|
return 0;
|
|
1808
1808
|
}
|
|
1809
1809
|
|
|
1810
|
-
void
|
|
1810
|
+
void wp_cuda_device_get_uuid(int ordinal, char uuid[16])
|
|
1811
1811
|
{
|
|
1812
1812
|
memcpy(uuid, g_devices[ordinal].uuid.bytes, sizeof(char)*16);
|
|
1813
1813
|
}
|
|
1814
1814
|
|
|
1815
|
-
int
|
|
1815
|
+
int wp_cuda_device_get_pci_domain_id(int ordinal)
|
|
1816
1816
|
{
|
|
1817
1817
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1818
1818
|
return g_devices[ordinal].pci_domain_id;
|
|
1819
1819
|
return -1;
|
|
1820
1820
|
}
|
|
1821
1821
|
|
|
1822
|
-
int
|
|
1822
|
+
int wp_cuda_device_get_pci_bus_id(int ordinal)
|
|
1823
1823
|
{
|
|
1824
1824
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1825
1825
|
return g_devices[ordinal].pci_bus_id;
|
|
1826
1826
|
return -1;
|
|
1827
1827
|
}
|
|
1828
1828
|
|
|
1829
|
-
int
|
|
1829
|
+
int wp_cuda_device_get_pci_device_id(int ordinal)
|
|
1830
1830
|
{
|
|
1831
1831
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1832
1832
|
return g_devices[ordinal].pci_device_id;
|
|
1833
1833
|
return -1;
|
|
1834
1834
|
}
|
|
1835
1835
|
|
|
1836
|
-
int
|
|
1836
|
+
int wp_cuda_device_is_uva(int ordinal)
|
|
1837
1837
|
{
|
|
1838
1838
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1839
1839
|
return g_devices[ordinal].is_uva;
|
|
1840
1840
|
return 0;
|
|
1841
1841
|
}
|
|
1842
1842
|
|
|
1843
|
-
int
|
|
1843
|
+
int wp_cuda_device_is_mempool_supported(int ordinal)
|
|
1844
1844
|
{
|
|
1845
1845
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1846
1846
|
return g_devices[ordinal].is_mempool_supported;
|
|
1847
1847
|
return 0;
|
|
1848
1848
|
}
|
|
1849
1849
|
|
|
1850
|
-
int
|
|
1850
|
+
int wp_cuda_device_is_ipc_supported(int ordinal)
|
|
1851
1851
|
{
|
|
1852
1852
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1853
1853
|
return g_devices[ordinal].is_ipc_supported;
|
|
1854
1854
|
return 0;
|
|
1855
1855
|
}
|
|
1856
1856
|
|
|
1857
|
-
int
|
|
1857
|
+
int wp_cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold)
|
|
1858
1858
|
{
|
|
1859
1859
|
if (ordinal < 0 || ordinal > int(g_devices.size()))
|
|
1860
1860
|
{
|
|
@@ -1881,7 +1881,7 @@ int cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold)
|
|
|
1881
1881
|
return 1; // success
|
|
1882
1882
|
}
|
|
1883
1883
|
|
|
1884
|
-
uint64_t
|
|
1884
|
+
uint64_t wp_cuda_device_get_mempool_release_threshold(int ordinal)
|
|
1885
1885
|
{
|
|
1886
1886
|
if (ordinal < 0 || ordinal > int(g_devices.size()))
|
|
1887
1887
|
{
|
|
@@ -1909,7 +1909,7 @@ uint64_t cuda_device_get_mempool_release_threshold(int ordinal)
|
|
|
1909
1909
|
return threshold;
|
|
1910
1910
|
}
|
|
1911
1911
|
|
|
1912
|
-
uint64_t
|
|
1912
|
+
uint64_t wp_cuda_device_get_mempool_used_mem_current(int ordinal)
|
|
1913
1913
|
{
|
|
1914
1914
|
if (ordinal < 0 || ordinal > int(g_devices.size()))
|
|
1915
1915
|
{
|
|
@@ -1937,7 +1937,7 @@ uint64_t cuda_device_get_mempool_used_mem_current(int ordinal)
|
|
|
1937
1937
|
return mem_used;
|
|
1938
1938
|
}
|
|
1939
1939
|
|
|
1940
|
-
uint64_t
|
|
1940
|
+
uint64_t wp_cuda_device_get_mempool_used_mem_high(int ordinal)
|
|
1941
1941
|
{
|
|
1942
1942
|
if (ordinal < 0 || ordinal > int(g_devices.size()))
|
|
1943
1943
|
{
|
|
@@ -1965,7 +1965,7 @@ uint64_t cuda_device_get_mempool_used_mem_high(int ordinal)
|
|
|
1965
1965
|
return mem_high_water_mark;
|
|
1966
1966
|
}
|
|
1967
1967
|
|
|
1968
|
-
void
|
|
1968
|
+
void wp_cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_mem)
|
|
1969
1969
|
{
|
|
1970
1970
|
// use temporary storage if user didn't specify pointers
|
|
1971
1971
|
size_t tmp_free_mem, tmp_total_mem;
|
|
@@ -2002,12 +2002,12 @@ void cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_me
|
|
|
2002
2002
|
}
|
|
2003
2003
|
|
|
2004
2004
|
|
|
2005
|
-
void*
|
|
2005
|
+
void* wp_cuda_context_get_current()
|
|
2006
2006
|
{
|
|
2007
2007
|
return get_current_context();
|
|
2008
2008
|
}
|
|
2009
2009
|
|
|
2010
|
-
void
|
|
2010
|
+
void wp_cuda_context_set_current(void* context)
|
|
2011
2011
|
{
|
|
2012
2012
|
CUcontext ctx = static_cast<CUcontext>(context);
|
|
2013
2013
|
CUcontext prev_ctx = NULL;
|
|
@@ -2018,18 +2018,18 @@ void cuda_context_set_current(void* context)
|
|
|
2018
2018
|
}
|
|
2019
2019
|
}
|
|
2020
2020
|
|
|
2021
|
-
void
|
|
2021
|
+
void wp_cuda_context_push_current(void* context)
|
|
2022
2022
|
{
|
|
2023
2023
|
check_cu(cuCtxPushCurrent_f(static_cast<CUcontext>(context)));
|
|
2024
2024
|
}
|
|
2025
2025
|
|
|
2026
|
-
void
|
|
2026
|
+
void wp_cuda_context_pop_current()
|
|
2027
2027
|
{
|
|
2028
2028
|
CUcontext context;
|
|
2029
2029
|
check_cu(cuCtxPopCurrent_f(&context));
|
|
2030
2030
|
}
|
|
2031
2031
|
|
|
2032
|
-
void*
|
|
2032
|
+
void* wp_cuda_context_create(int device_ordinal)
|
|
2033
2033
|
{
|
|
2034
2034
|
CUcontext ctx = NULL;
|
|
2035
2035
|
CUdevice device;
|
|
@@ -2038,15 +2038,15 @@ void* cuda_context_create(int device_ordinal)
|
|
|
2038
2038
|
return ctx;
|
|
2039
2039
|
}
|
|
2040
2040
|
|
|
2041
|
-
void
|
|
2041
|
+
void wp_cuda_context_destroy(void* context)
|
|
2042
2042
|
{
|
|
2043
2043
|
if (context)
|
|
2044
2044
|
{
|
|
2045
2045
|
CUcontext ctx = static_cast<CUcontext>(context);
|
|
2046
2046
|
|
|
2047
2047
|
// ensure this is not the current context
|
|
2048
|
-
if (ctx ==
|
|
2049
|
-
|
|
2048
|
+
if (ctx == wp_cuda_context_get_current())
|
|
2049
|
+
wp_cuda_context_set_current(NULL);
|
|
2050
2050
|
|
|
2051
2051
|
// release the cached info about this context
|
|
2052
2052
|
ContextInfo* info = get_context_info(ctx);
|
|
@@ -2065,7 +2065,7 @@ void cuda_context_destroy(void* context)
|
|
|
2065
2065
|
}
|
|
2066
2066
|
}
|
|
2067
2067
|
|
|
2068
|
-
void
|
|
2068
|
+
void wp_cuda_context_synchronize(void* context)
|
|
2069
2069
|
{
|
|
2070
2070
|
ContextGuard guard(context);
|
|
2071
2071
|
|
|
@@ -2079,10 +2079,10 @@ void cuda_context_synchronize(void* context)
|
|
|
2079
2079
|
|
|
2080
2080
|
unload_deferred_modules(context);
|
|
2081
2081
|
|
|
2082
|
-
// check_cuda(cudaDeviceGraphMemTrim(
|
|
2082
|
+
// check_cuda(cudaDeviceGraphMemTrim(wp_cuda_context_get_device_ordinal(context)));
|
|
2083
2083
|
}
|
|
2084
2084
|
|
|
2085
|
-
uint64_t
|
|
2085
|
+
uint64_t wp_cuda_context_check(void* context)
|
|
2086
2086
|
{
|
|
2087
2087
|
ContextGuard guard(context);
|
|
2088
2088
|
|
|
@@ -2104,13 +2104,13 @@ uint64_t cuda_context_check(void* context)
|
|
|
2104
2104
|
}
|
|
2105
2105
|
|
|
2106
2106
|
|
|
2107
|
-
int
|
|
2107
|
+
int wp_cuda_context_get_device_ordinal(void* context)
|
|
2108
2108
|
{
|
|
2109
2109
|
ContextInfo* info = get_context_info(static_cast<CUcontext>(context));
|
|
2110
2110
|
return info && info->device_info ? info->device_info->ordinal : -1;
|
|
2111
2111
|
}
|
|
2112
2112
|
|
|
2113
|
-
int
|
|
2113
|
+
int wp_cuda_context_is_primary(void* context)
|
|
2114
2114
|
{
|
|
2115
2115
|
CUcontext ctx = static_cast<CUcontext>(context);
|
|
2116
2116
|
ContextInfo* context_info = get_context_info(ctx);
|
|
@@ -2137,7 +2137,7 @@ int cuda_context_is_primary(void* context)
|
|
|
2137
2137
|
return 0;
|
|
2138
2138
|
}
|
|
2139
2139
|
|
|
2140
|
-
void*
|
|
2140
|
+
void* wp_cuda_context_get_stream(void* context)
|
|
2141
2141
|
{
|
|
2142
2142
|
ContextInfo* info = get_context_info(static_cast<CUcontext>(context));
|
|
2143
2143
|
if (info)
|
|
@@ -2147,7 +2147,7 @@ void* cuda_context_get_stream(void* context)
|
|
|
2147
2147
|
return NULL;
|
|
2148
2148
|
}
|
|
2149
2149
|
|
|
2150
|
-
void
|
|
2150
|
+
void wp_cuda_context_set_stream(void* context, void* stream, int sync)
|
|
2151
2151
|
{
|
|
2152
2152
|
ContextInfo* context_info = get_context_info(static_cast<CUcontext>(context));
|
|
2153
2153
|
if (context_info)
|
|
@@ -2171,7 +2171,7 @@ void cuda_context_set_stream(void* context, void* stream, int sync)
|
|
|
2171
2171
|
}
|
|
2172
2172
|
}
|
|
2173
2173
|
|
|
2174
|
-
int
|
|
2174
|
+
int wp_cuda_is_peer_access_supported(int target_ordinal, int peer_ordinal)
|
|
2175
2175
|
{
|
|
2176
2176
|
int num_devices = int(g_devices.size());
|
|
2177
2177
|
|
|
@@ -2196,7 +2196,7 @@ int cuda_is_peer_access_supported(int target_ordinal, int peer_ordinal)
|
|
|
2196
2196
|
return can_access;
|
|
2197
2197
|
}
|
|
2198
2198
|
|
|
2199
|
-
int
|
|
2199
|
+
int wp_cuda_is_peer_access_enabled(void* target_context, void* peer_context)
|
|
2200
2200
|
{
|
|
2201
2201
|
if (!target_context || !peer_context)
|
|
2202
2202
|
{
|
|
@@ -2207,8 +2207,8 @@ int cuda_is_peer_access_enabled(void* target_context, void* peer_context)
|
|
|
2207
2207
|
if (target_context == peer_context)
|
|
2208
2208
|
return 1;
|
|
2209
2209
|
|
|
2210
|
-
int target_ordinal =
|
|
2211
|
-
int peer_ordinal =
|
|
2210
|
+
int target_ordinal = wp_cuda_context_get_device_ordinal(target_context);
|
|
2211
|
+
int peer_ordinal = wp_cuda_context_get_device_ordinal(peer_context);
|
|
2212
2212
|
|
|
2213
2213
|
// check if peer access is supported
|
|
2214
2214
|
int can_access = 0;
|
|
@@ -2241,7 +2241,7 @@ int cuda_is_peer_access_enabled(void* target_context, void* peer_context)
|
|
|
2241
2241
|
}
|
|
2242
2242
|
}
|
|
2243
2243
|
|
|
2244
|
-
int
|
|
2244
|
+
int wp_cuda_set_peer_access_enabled(void* target_context, void* peer_context, int enable)
|
|
2245
2245
|
{
|
|
2246
2246
|
if (!target_context || !peer_context)
|
|
2247
2247
|
{
|
|
@@ -2252,8 +2252,8 @@ int cuda_set_peer_access_enabled(void* target_context, void* peer_context, int e
|
|
|
2252
2252
|
if (target_context == peer_context)
|
|
2253
2253
|
return 1; // no-op
|
|
2254
2254
|
|
|
2255
|
-
int target_ordinal =
|
|
2256
|
-
int peer_ordinal =
|
|
2255
|
+
int target_ordinal = wp_cuda_context_get_device_ordinal(target_context);
|
|
2256
|
+
int peer_ordinal = wp_cuda_context_get_device_ordinal(peer_context);
|
|
2257
2257
|
|
|
2258
2258
|
// check if peer access is supported
|
|
2259
2259
|
int can_access = 0;
|
|
@@ -2298,7 +2298,7 @@ int cuda_set_peer_access_enabled(void* target_context, void* peer_context, int e
|
|
|
2298
2298
|
return 1; // success
|
|
2299
2299
|
}
|
|
2300
2300
|
|
|
2301
|
-
int
|
|
2301
|
+
int wp_cuda_is_mempool_access_enabled(int target_ordinal, int peer_ordinal)
|
|
2302
2302
|
{
|
|
2303
2303
|
int num_devices = int(g_devices.size());
|
|
2304
2304
|
|
|
@@ -2334,7 +2334,7 @@ int cuda_is_mempool_access_enabled(int target_ordinal, int peer_ordinal)
|
|
|
2334
2334
|
return 0;
|
|
2335
2335
|
}
|
|
2336
2336
|
|
|
2337
|
-
int
|
|
2337
|
+
int wp_cuda_set_mempool_access_enabled(int target_ordinal, int peer_ordinal, int enable)
|
|
2338
2338
|
{
|
|
2339
2339
|
int num_devices = int(g_devices.size());
|
|
2340
2340
|
|
|
@@ -2380,13 +2380,13 @@ int cuda_set_mempool_access_enabled(int target_ordinal, int peer_ordinal, int en
|
|
|
2380
2380
|
return 1; // success
|
|
2381
2381
|
}
|
|
2382
2382
|
|
|
2383
|
-
void
|
|
2383
|
+
void wp_cuda_ipc_get_mem_handle(void* ptr, char* out_buffer) {
|
|
2384
2384
|
CUipcMemHandle memHandle;
|
|
2385
2385
|
check_cu(cuIpcGetMemHandle_f(&memHandle, (CUdeviceptr)ptr));
|
|
2386
2386
|
memcpy(out_buffer, memHandle.reserved, CU_IPC_HANDLE_SIZE);
|
|
2387
2387
|
}
|
|
2388
2388
|
|
|
2389
|
-
void*
|
|
2389
|
+
void* wp_cuda_ipc_open_mem_handle(void* context, char* handle) {
|
|
2390
2390
|
ContextGuard guard(context);
|
|
2391
2391
|
|
|
2392
2392
|
CUipcMemHandle memHandle;
|
|
@@ -2401,11 +2401,11 @@ void* cuda_ipc_open_mem_handle(void* context, char* handle) {
|
|
|
2401
2401
|
return NULL;
|
|
2402
2402
|
}
|
|
2403
2403
|
|
|
2404
|
-
void
|
|
2404
|
+
void wp_cuda_ipc_close_mem_handle(void* ptr) {
|
|
2405
2405
|
check_cu(cuIpcCloseMemHandle_f((CUdeviceptr) ptr));
|
|
2406
2406
|
}
|
|
2407
2407
|
|
|
2408
|
-
void
|
|
2408
|
+
void wp_cuda_ipc_get_event_handle(void* context, void* event, char* out_buffer) {
|
|
2409
2409
|
ContextGuard guard(context);
|
|
2410
2410
|
|
|
2411
2411
|
CUipcEventHandle eventHandle;
|
|
@@ -2413,7 +2413,7 @@ void cuda_ipc_get_event_handle(void* context, void* event, char* out_buffer) {
|
|
|
2413
2413
|
memcpy(out_buffer, eventHandle.reserved, CU_IPC_HANDLE_SIZE);
|
|
2414
2414
|
}
|
|
2415
2415
|
|
|
2416
|
-
void*
|
|
2416
|
+
void* wp_cuda_ipc_open_event_handle(void* context, char* handle) {
|
|
2417
2417
|
ContextGuard guard(context);
|
|
2418
2418
|
|
|
2419
2419
|
CUipcEventHandle eventHandle;
|
|
@@ -2427,31 +2427,31 @@ void* cuda_ipc_open_event_handle(void* context, char* handle) {
|
|
|
2427
2427
|
return NULL;
|
|
2428
2428
|
}
|
|
2429
2429
|
|
|
2430
|
-
void*
|
|
2430
|
+
void* wp_cuda_stream_create(void* context, int priority)
|
|
2431
2431
|
{
|
|
2432
2432
|
ContextGuard guard(context, true);
|
|
2433
2433
|
|
|
2434
2434
|
CUstream stream;
|
|
2435
2435
|
if (check_cu(cuStreamCreateWithPriority_f(&stream, CU_STREAM_DEFAULT, priority)))
|
|
2436
2436
|
{
|
|
2437
|
-
|
|
2437
|
+
wp_cuda_stream_register(WP_CURRENT_CONTEXT, stream);
|
|
2438
2438
|
return stream;
|
|
2439
2439
|
}
|
|
2440
2440
|
else
|
|
2441
2441
|
return NULL;
|
|
2442
2442
|
}
|
|
2443
2443
|
|
|
2444
|
-
void
|
|
2444
|
+
void wp_cuda_stream_destroy(void* context, void* stream)
|
|
2445
2445
|
{
|
|
2446
2446
|
if (!stream)
|
|
2447
2447
|
return;
|
|
2448
2448
|
|
|
2449
|
-
|
|
2449
|
+
wp_cuda_stream_unregister(context, stream);
|
|
2450
2450
|
|
|
2451
2451
|
check_cu(cuStreamDestroy_f(static_cast<CUstream>(stream)));
|
|
2452
2452
|
}
|
|
2453
2453
|
|
|
2454
|
-
int
|
|
2454
|
+
int wp_cuda_stream_query(void* stream)
|
|
2455
2455
|
{
|
|
2456
2456
|
CUresult res = cuStreamQuery_f(static_cast<CUstream>(stream));
|
|
2457
2457
|
|
|
@@ -2464,7 +2464,7 @@ int cuda_stream_query(void* stream)
|
|
|
2464
2464
|
return res;
|
|
2465
2465
|
}
|
|
2466
2466
|
|
|
2467
|
-
void
|
|
2467
|
+
void wp_cuda_stream_register(void* context, void* stream)
|
|
2468
2468
|
{
|
|
2469
2469
|
if (!stream)
|
|
2470
2470
|
return;
|
|
@@ -2476,7 +2476,7 @@ void cuda_stream_register(void* context, void* stream)
|
|
|
2476
2476
|
check_cu(cuEventCreate_f(&stream_info.cached_event, CU_EVENT_DISABLE_TIMING));
|
|
2477
2477
|
}
|
|
2478
2478
|
|
|
2479
|
-
void
|
|
2479
|
+
void wp_cuda_stream_unregister(void* context, void* stream)
|
|
2480
2480
|
{
|
|
2481
2481
|
if (!stream)
|
|
2482
2482
|
return;
|
|
@@ -2500,28 +2500,28 @@ void cuda_stream_unregister(void* context, void* stream)
|
|
|
2500
2500
|
}
|
|
2501
2501
|
}
|
|
2502
2502
|
|
|
2503
|
-
void*
|
|
2503
|
+
void* wp_cuda_stream_get_current()
|
|
2504
2504
|
{
|
|
2505
2505
|
return get_current_stream();
|
|
2506
2506
|
}
|
|
2507
2507
|
|
|
2508
|
-
void
|
|
2508
|
+
void wp_cuda_stream_synchronize(void* stream)
|
|
2509
2509
|
{
|
|
2510
2510
|
check_cu(cuStreamSynchronize_f(static_cast<CUstream>(stream)));
|
|
2511
2511
|
}
|
|
2512
2512
|
|
|
2513
|
-
void
|
|
2513
|
+
void wp_cuda_stream_wait_event(void* stream, void* event)
|
|
2514
2514
|
{
|
|
2515
2515
|
check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
|
|
2516
2516
|
}
|
|
2517
2517
|
|
|
2518
|
-
void
|
|
2518
|
+
void wp_cuda_stream_wait_stream(void* stream, void* other_stream, void* event)
|
|
2519
2519
|
{
|
|
2520
2520
|
check_cu(cuEventRecord_f(static_cast<CUevent>(event), static_cast<CUstream>(other_stream)));
|
|
2521
2521
|
check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
|
|
2522
2522
|
}
|
|
2523
2523
|
|
|
2524
|
-
int
|
|
2524
|
+
int wp_cuda_stream_is_capturing(void* stream)
|
|
2525
2525
|
{
|
|
2526
2526
|
cudaStreamCaptureStatus status = cudaStreamCaptureStatusNone;
|
|
2527
2527
|
check_cuda(cudaStreamIsCapturing(static_cast<cudaStream_t>(stream), &status));
|
|
@@ -2529,12 +2529,12 @@ int cuda_stream_is_capturing(void* stream)
|
|
|
2529
2529
|
return int(status != cudaStreamCaptureStatusNone);
|
|
2530
2530
|
}
|
|
2531
2531
|
|
|
2532
|
-
uint64_t
|
|
2532
|
+
uint64_t wp_cuda_stream_get_capture_id(void* stream)
|
|
2533
2533
|
{
|
|
2534
2534
|
return get_capture_id(static_cast<CUstream>(stream));
|
|
2535
2535
|
}
|
|
2536
2536
|
|
|
2537
|
-
int
|
|
2537
|
+
int wp_cuda_stream_get_priority(void* stream)
|
|
2538
2538
|
{
|
|
2539
2539
|
int priority = 0;
|
|
2540
2540
|
check_cuda(cuStreamGetPriority_f(static_cast<CUstream>(stream), &priority));
|
|
@@ -2542,7 +2542,7 @@ int cuda_stream_get_priority(void* stream)
|
|
|
2542
2542
|
return priority;
|
|
2543
2543
|
}
|
|
2544
2544
|
|
|
2545
|
-
void*
|
|
2545
|
+
void* wp_cuda_event_create(void* context, unsigned flags)
|
|
2546
2546
|
{
|
|
2547
2547
|
ContextGuard guard(context, true);
|
|
2548
2548
|
|
|
@@ -2553,12 +2553,12 @@ void* cuda_event_create(void* context, unsigned flags)
|
|
|
2553
2553
|
return NULL;
|
|
2554
2554
|
}
|
|
2555
2555
|
|
|
2556
|
-
void
|
|
2556
|
+
void wp_cuda_event_destroy(void* event)
|
|
2557
2557
|
{
|
|
2558
2558
|
check_cu(cuEventDestroy_f(static_cast<CUevent>(event)));
|
|
2559
2559
|
}
|
|
2560
2560
|
|
|
2561
|
-
int
|
|
2561
|
+
int wp_cuda_event_query(void* event)
|
|
2562
2562
|
{
|
|
2563
2563
|
CUresult res = cuEventQuery_f(static_cast<CUevent>(event));
|
|
2564
2564
|
|
|
@@ -2571,9 +2571,9 @@ int cuda_event_query(void* event)
|
|
|
2571
2571
|
return res;
|
|
2572
2572
|
}
|
|
2573
2573
|
|
|
2574
|
-
void
|
|
2574
|
+
void wp_cuda_event_record(void* event, void* stream, bool timing)
|
|
2575
2575
|
{
|
|
2576
|
-
if (timing && !g_captures.empty() &&
|
|
2576
|
+
if (timing && !g_captures.empty() && wp_cuda_stream_is_capturing(stream))
|
|
2577
2577
|
{
|
|
2578
2578
|
// record timing event during graph capture
|
|
2579
2579
|
check_cu(cuEventRecordWithFlags_f(static_cast<CUevent>(event), static_cast<CUstream>(stream), CU_EVENT_RECORD_EXTERNAL));
|
|
@@ -2584,12 +2584,12 @@ void cuda_event_record(void* event, void* stream, bool timing)
|
|
|
2584
2584
|
}
|
|
2585
2585
|
}
|
|
2586
2586
|
|
|
2587
|
-
void
|
|
2587
|
+
void wp_cuda_event_synchronize(void* event)
|
|
2588
2588
|
{
|
|
2589
2589
|
check_cu(cuEventSynchronize_f(static_cast<CUevent>(event)));
|
|
2590
2590
|
}
|
|
2591
2591
|
|
|
2592
|
-
float
|
|
2592
|
+
float wp_cuda_event_elapsed_time(void* start_event, void* end_event)
|
|
2593
2593
|
{
|
|
2594
2594
|
float elapsed = 0.0f;
|
|
2595
2595
|
cudaEvent_t start = static_cast<cudaEvent_t>(start_event);
|
|
@@ -2598,7 +2598,7 @@ float cuda_event_elapsed_time(void* start_event, void* end_event)
|
|
|
2598
2598
|
return elapsed;
|
|
2599
2599
|
}
|
|
2600
2600
|
|
|
2601
|
-
bool
|
|
2601
|
+
bool wp_cuda_graph_begin_capture(void* context, void* stream, int external)
|
|
2602
2602
|
{
|
|
2603
2603
|
ContextGuard guard(context);
|
|
2604
2604
|
|
|
@@ -2645,7 +2645,7 @@ bool cuda_graph_begin_capture(void* context, void* stream, int external)
|
|
|
2645
2645
|
return true;
|
|
2646
2646
|
}
|
|
2647
2647
|
|
|
2648
|
-
bool
|
|
2648
|
+
bool wp_cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
|
|
2649
2649
|
{
|
|
2650
2650
|
ContextGuard guard(context);
|
|
2651
2651
|
|
|
@@ -2780,14 +2780,14 @@ bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
|
|
|
2780
2780
|
return true;
|
|
2781
2781
|
}
|
|
2782
2782
|
|
|
2783
|
-
bool
|
|
2783
|
+
bool wp_capture_debug_dot_print(void* graph, const char *path, uint32_t flags)
|
|
2784
2784
|
{
|
|
2785
2785
|
if (!check_cuda(cudaGraphDebugDotPrint((cudaGraph_t)graph, path, flags)))
|
|
2786
2786
|
return false;
|
|
2787
2787
|
return true;
|
|
2788
2788
|
}
|
|
2789
2789
|
|
|
2790
|
-
bool
|
|
2790
|
+
bool wp_cuda_graph_create_exec(void* context, void* stream, void* graph, void** graph_exec_ret)
|
|
2791
2791
|
{
|
|
2792
2792
|
ContextGuard guard(context);
|
|
2793
2793
|
|
|
@@ -2940,7 +2940,7 @@ static CUfunction get_conditional_kernel(void* context, const char* name)
|
|
|
2940
2940
|
return kernel;
|
|
2941
2941
|
}
|
|
2942
2942
|
|
|
2943
|
-
bool
|
|
2943
|
+
bool wp_cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
|
|
2944
2944
|
{
|
|
2945
2945
|
ContextGuard guard(context);
|
|
2946
2946
|
|
|
@@ -2950,7 +2950,7 @@ bool cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
|
|
|
2950
2950
|
return true;
|
|
2951
2951
|
}
|
|
2952
2952
|
|
|
2953
|
-
bool
|
|
2953
|
+
bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
|
|
2954
2954
|
{
|
|
2955
2955
|
ContextGuard guard(context);
|
|
2956
2956
|
|
|
@@ -2976,7 +2976,7 @@ bool cuda_graph_resume_capture(void* context, void* stream, void* graph)
|
|
|
2976
2976
|
// https://developer.nvidia.com/blog/dynamic-control-flow-in-cuda-graphs-with-conditional-nodes/
|
|
2977
2977
|
// condition is a gpu pointer
|
|
2978
2978
|
// if_graph_ret and else_graph_ret should be NULL if not needed
|
|
2979
|
-
bool
|
|
2979
|
+
bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
|
|
2980
2980
|
{
|
|
2981
2981
|
bool has_if = if_graph_ret != NULL;
|
|
2982
2982
|
bool has_else = else_graph_ret != NULL;
|
|
@@ -2991,21 +2991,21 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
2991
2991
|
CUstream cuda_stream = static_cast<CUstream>(stream);
|
|
2992
2992
|
|
|
2993
2993
|
// Get the current stream capturing graph
|
|
2994
|
-
|
|
2994
|
+
CUstreamCaptureStatus capture_status = CU_STREAM_CAPTURE_STATUS_NONE;
|
|
2995
2995
|
cudaGraph_t cuda_graph = NULL;
|
|
2996
2996
|
const cudaGraphNode_t* capture_deps = NULL;
|
|
2997
2997
|
size_t dep_count = 0;
|
|
2998
|
-
if (!
|
|
2998
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
|
|
2999
2999
|
return false;
|
|
3000
3000
|
|
|
3001
3001
|
// abort if not capturing
|
|
3002
|
-
if (!cuda_graph || capture_status !=
|
|
3002
|
+
if (!cuda_graph || capture_status != CU_STREAM_CAPTURE_STATUS_ACTIVE)
|
|
3003
3003
|
{
|
|
3004
3004
|
wp::set_error_string("Stream is not capturing");
|
|
3005
3005
|
return false;
|
|
3006
3006
|
}
|
|
3007
3007
|
|
|
3008
|
-
//int driver_version =
|
|
3008
|
+
//int driver_version = wp_cuda_driver_version();
|
|
3009
3009
|
|
|
3010
3010
|
// IF-ELSE nodes are only supported with CUDA 12.8+
|
|
3011
3011
|
// Somehow child graphs produce wrong results when an else branch is used
|
|
@@ -3013,7 +3013,7 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
3013
3013
|
if (num_branches == 1 /*|| driver_version >= 12080*/)
|
|
3014
3014
|
{
|
|
3015
3015
|
cudaGraphConditionalHandle handle;
|
|
3016
|
-
cudaGraphConditionalHandleCreate(&handle, cuda_graph);
|
|
3016
|
+
check_cuda(cudaGraphConditionalHandleCreate(&handle, cuda_graph));
|
|
3017
3017
|
|
|
3018
3018
|
// run a kernel to set the condition handle from the condition pointer
|
|
3019
3019
|
// (need to negate the condition if only the else branch is used)
|
|
@@ -3033,22 +3033,23 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
3033
3033
|
kernel_args[0] = &handle;
|
|
3034
3034
|
kernel_args[1] = &condition;
|
|
3035
3035
|
|
|
3036
|
-
if (!
|
|
3036
|
+
if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
|
|
3037
3037
|
return false;
|
|
3038
3038
|
|
|
3039
|
-
if (!
|
|
3039
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
|
|
3040
3040
|
return false;
|
|
3041
3041
|
|
|
3042
3042
|
// create conditional node
|
|
3043
|
-
|
|
3044
|
-
|
|
3043
|
+
CUgraphNode condition_node;
|
|
3044
|
+
CUgraphNodeParams condition_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
|
|
3045
3045
|
condition_params.conditional.handle = handle;
|
|
3046
|
-
condition_params.conditional.type =
|
|
3046
|
+
condition_params.conditional.type = CU_GRAPH_COND_TYPE_IF;
|
|
3047
3047
|
condition_params.conditional.size = num_branches;
|
|
3048
|
-
|
|
3048
|
+
condition_params.conditional.ctx = get_current_context();
|
|
3049
|
+
if (!check_cu(cuGraphAddNode_f(&condition_node, cuda_graph, capture_deps, NULL, dep_count, &condition_params)))
|
|
3049
3050
|
return false;
|
|
3050
3051
|
|
|
3051
|
-
if (!
|
|
3052
|
+
if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &condition_node, 1, cudaStreamSetCaptureDependencies)))
|
|
3052
3053
|
return false;
|
|
3053
3054
|
|
|
3054
3055
|
if (num_branches == 1)
|
|
@@ -3068,8 +3069,8 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
3068
3069
|
{
|
|
3069
3070
|
// Create IF node followed by an additional IF node with negated condition
|
|
3070
3071
|
cudaGraphConditionalHandle if_handle, else_handle;
|
|
3071
|
-
cudaGraphConditionalHandleCreate(&if_handle, cuda_graph);
|
|
3072
|
-
cudaGraphConditionalHandleCreate(&else_handle, cuda_graph);
|
|
3072
|
+
check_cuda(cudaGraphConditionalHandleCreate(&if_handle, cuda_graph));
|
|
3073
|
+
check_cuda(cudaGraphConditionalHandleCreate(&else_handle, cuda_graph));
|
|
3073
3074
|
|
|
3074
3075
|
CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_else_handles_kernel");
|
|
3075
3076
|
if (!kernel)
|
|
@@ -3086,26 +3087,28 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
3086
3087
|
if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
|
|
3087
3088
|
return false;
|
|
3088
3089
|
|
|
3089
|
-
if (!
|
|
3090
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
|
|
3090
3091
|
return false;
|
|
3091
3092
|
|
|
3092
|
-
|
|
3093
|
-
|
|
3093
|
+
CUgraphNode if_node;
|
|
3094
|
+
CUgraphNodeParams if_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
|
|
3094
3095
|
if_params.conditional.handle = if_handle;
|
|
3095
|
-
if_params.conditional.type =
|
|
3096
|
+
if_params.conditional.type = CU_GRAPH_COND_TYPE_IF;
|
|
3096
3097
|
if_params.conditional.size = 1;
|
|
3097
|
-
|
|
3098
|
+
if_params.conditional.ctx = get_current_context();
|
|
3099
|
+
if (!check_cu(cuGraphAddNode_f(&if_node, cuda_graph, capture_deps, NULL, dep_count, &if_params)))
|
|
3098
3100
|
return false;
|
|
3099
3101
|
|
|
3100
|
-
|
|
3101
|
-
|
|
3102
|
+
CUgraphNode else_node;
|
|
3103
|
+
CUgraphNodeParams else_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
|
|
3102
3104
|
else_params.conditional.handle = else_handle;
|
|
3103
|
-
else_params.conditional.type =
|
|
3105
|
+
else_params.conditional.type = CU_GRAPH_COND_TYPE_IF;
|
|
3104
3106
|
else_params.conditional.size = 1;
|
|
3105
|
-
|
|
3107
|
+
else_params.conditional.ctx = get_current_context();
|
|
3108
|
+
if (!check_cu(cuGraphAddNode_f(&else_node, cuda_graph, &if_node, NULL, 1, &else_params)))
|
|
3106
3109
|
return false;
|
|
3107
3110
|
|
|
3108
|
-
if (!
|
|
3111
|
+
if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &else_node, 1, cudaStreamSetCaptureDependencies)))
|
|
3109
3112
|
return false;
|
|
3110
3113
|
|
|
3111
3114
|
*if_graph_ret = if_params.conditional.phGraph_out[0];
|
|
@@ -3115,21 +3118,143 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
3115
3118
|
return true;
|
|
3116
3119
|
}
|
|
3117
3120
|
|
|
3118
|
-
|
|
3121
|
+
// graph node type names for intelligible error reporting
|
|
3122
|
+
static const char* get_graph_node_type_name(CUgraphNodeType type)
|
|
3123
|
+
{
|
|
3124
|
+
static const std::unordered_map<CUgraphNodeType, const char*> names
|
|
3125
|
+
{
|
|
3126
|
+
{CU_GRAPH_NODE_TYPE_KERNEL, "kernel launch"},
|
|
3127
|
+
{CU_GRAPH_NODE_TYPE_MEMCPY, "memcpy"},
|
|
3128
|
+
{CU_GRAPH_NODE_TYPE_MEMSET, "memset"},
|
|
3129
|
+
{CU_GRAPH_NODE_TYPE_HOST, "host execution"},
|
|
3130
|
+
{CU_GRAPH_NODE_TYPE_GRAPH, "graph launch"},
|
|
3131
|
+
{CU_GRAPH_NODE_TYPE_EMPTY, "empty node"},
|
|
3132
|
+
{CU_GRAPH_NODE_TYPE_WAIT_EVENT, "event wait"},
|
|
3133
|
+
{CU_GRAPH_NODE_TYPE_EVENT_RECORD, "event record"},
|
|
3134
|
+
{CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL, "semaphore signal"},
|
|
3135
|
+
{CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT, "semaphore wait"},
|
|
3136
|
+
{CU_GRAPH_NODE_TYPE_MEM_ALLOC, "memory allocation"},
|
|
3137
|
+
{CU_GRAPH_NODE_TYPE_MEM_FREE, "memory deallocation"},
|
|
3138
|
+
{CU_GRAPH_NODE_TYPE_BATCH_MEM_OP, "batched mem op"},
|
|
3139
|
+
{CU_GRAPH_NODE_TYPE_CONDITIONAL, "conditional node"},
|
|
3140
|
+
};
|
|
3141
|
+
|
|
3142
|
+
auto it = names.find(type);
|
|
3143
|
+
if (it != names.end())
|
|
3144
|
+
return it->second;
|
|
3145
|
+
else
|
|
3146
|
+
return "unknown node";
|
|
3147
|
+
}
|
|
3148
|
+
|
|
3149
|
+
// check if a graph can be launched as a child graph
|
|
3150
|
+
static bool is_valid_child_graph(void* child_graph)
|
|
3119
3151
|
{
|
|
3152
|
+
// disallowed child graph nodes according to the documentation of cuGraphAddChildGraphNode()
|
|
3153
|
+
static const std::unordered_set<CUgraphNodeType> disallowed_nodes
|
|
3154
|
+
{
|
|
3155
|
+
CU_GRAPH_NODE_TYPE_MEM_ALLOC,
|
|
3156
|
+
CU_GRAPH_NODE_TYPE_MEM_FREE,
|
|
3157
|
+
CU_GRAPH_NODE_TYPE_CONDITIONAL,
|
|
3158
|
+
};
|
|
3159
|
+
|
|
3160
|
+
if (!child_graph)
|
|
3161
|
+
{
|
|
3162
|
+
wp::set_error_string("Child graph is null");
|
|
3163
|
+
return false;
|
|
3164
|
+
}
|
|
3165
|
+
|
|
3166
|
+
size_t num_nodes = 0;
|
|
3167
|
+
if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)child_graph, NULL, &num_nodes)))
|
|
3168
|
+
return false;
|
|
3169
|
+
std::vector<cudaGraphNode_t> nodes(num_nodes);
|
|
3170
|
+
if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)child_graph, nodes.data(), &num_nodes)))
|
|
3171
|
+
return false;
|
|
3172
|
+
|
|
3173
|
+
for (size_t i = 0; i < num_nodes; i++)
|
|
3174
|
+
{
|
|
3175
|
+
// note: we use the driver API to get the node type, otherwise some nodes are not recognized correctly
|
|
3176
|
+
CUgraphNodeType node_type;
|
|
3177
|
+
check_cu(cuGraphNodeGetType_f(nodes[i], &node_type));
|
|
3178
|
+
auto it = disallowed_nodes.find(node_type);
|
|
3179
|
+
if (it != disallowed_nodes.end())
|
|
3180
|
+
{
|
|
3181
|
+
wp::set_error_string("Child graph contains an unsupported operation (%s)", get_graph_node_type_name(node_type));
|
|
3182
|
+
return false;
|
|
3183
|
+
}
|
|
3184
|
+
}
|
|
3185
|
+
|
|
3186
|
+
return true;
|
|
3187
|
+
}
|
|
3188
|
+
|
|
3189
|
+
// check if a graph can be used as a conditional body graph
|
|
3190
|
+
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#condtional-node-body-graph-requirements
|
|
3191
|
+
bool wp_cuda_graph_check_conditional_body(void* body_graph)
|
|
3192
|
+
{
|
|
3193
|
+
static const std::unordered_set<CUgraphNodeType> allowed_nodes
|
|
3194
|
+
{
|
|
3195
|
+
CU_GRAPH_NODE_TYPE_MEMCPY,
|
|
3196
|
+
CU_GRAPH_NODE_TYPE_MEMSET,
|
|
3197
|
+
CU_GRAPH_NODE_TYPE_KERNEL,
|
|
3198
|
+
CU_GRAPH_NODE_TYPE_GRAPH,
|
|
3199
|
+
CU_GRAPH_NODE_TYPE_EMPTY,
|
|
3200
|
+
CU_GRAPH_NODE_TYPE_CONDITIONAL,
|
|
3201
|
+
};
|
|
3202
|
+
|
|
3203
|
+
if (!body_graph)
|
|
3204
|
+
{
|
|
3205
|
+
wp::set_error_string("Conditional body graph is null");
|
|
3206
|
+
return false;
|
|
3207
|
+
}
|
|
3208
|
+
|
|
3209
|
+
size_t num_nodes = 0;
|
|
3210
|
+
if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)body_graph, NULL, &num_nodes)))
|
|
3211
|
+
return false;
|
|
3212
|
+
std::vector<cudaGraphNode_t> nodes(num_nodes);
|
|
3213
|
+
if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)body_graph, nodes.data(), &num_nodes)))
|
|
3214
|
+
return false;
|
|
3215
|
+
|
|
3216
|
+
for (size_t i = 0; i < num_nodes; i++)
|
|
3217
|
+
{
|
|
3218
|
+
// note: we use the driver API to get the node type, otherwise some nodes are not recognized correctly
|
|
3219
|
+
CUgraphNodeType node_type;
|
|
3220
|
+
check_cu(cuGraphNodeGetType_f(nodes[i], &node_type));
|
|
3221
|
+
if (allowed_nodes.find(node_type) == allowed_nodes.end())
|
|
3222
|
+
{
|
|
3223
|
+
wp::set_error_string("Conditional body graph contains an unsupported operation (%s)", get_graph_node_type_name(node_type));
|
|
3224
|
+
return false;
|
|
3225
|
+
}
|
|
3226
|
+
else if (node_type == CU_GRAPH_NODE_TYPE_GRAPH)
|
|
3227
|
+
{
|
|
3228
|
+
// check nested child graphs recursively
|
|
3229
|
+
cudaGraph_t child_graph = NULL;
|
|
3230
|
+
if (!check_cuda(cudaGraphChildGraphNodeGetGraph(nodes[i], &child_graph)))
|
|
3231
|
+
return false;
|
|
3232
|
+
if (!wp_cuda_graph_check_conditional_body(child_graph))
|
|
3233
|
+
return false;
|
|
3234
|
+
}
|
|
3235
|
+
}
|
|
3236
|
+
|
|
3237
|
+
return true;
|
|
3238
|
+
}
|
|
3239
|
+
|
|
3240
|
+
bool wp_cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
|
|
3241
|
+
{
|
|
3242
|
+
if (!is_valid_child_graph(child_graph))
|
|
3243
|
+
return false;
|
|
3244
|
+
|
|
3120
3245
|
ContextGuard guard(context);
|
|
3121
3246
|
|
|
3122
3247
|
CUstream cuda_stream = static_cast<CUstream>(stream);
|
|
3123
3248
|
|
|
3124
3249
|
// Get the current stream capturing graph
|
|
3125
|
-
|
|
3250
|
+
CUstreamCaptureStatus capture_status = CU_STREAM_CAPTURE_STATUS_NONE;
|
|
3126
3251
|
void* cuda_graph = NULL;
|
|
3127
|
-
const
|
|
3252
|
+
const CUgraphNode* capture_deps = NULL;
|
|
3128
3253
|
size_t dep_count = 0;
|
|
3129
|
-
if (!
|
|
3254
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, (cudaGraph_t*)&cuda_graph, &capture_deps, &dep_count)))
|
|
3130
3255
|
return false;
|
|
3131
3256
|
|
|
3132
|
-
if (!
|
|
3257
|
+
if (!wp_cuda_graph_pause_capture(context, cuda_stream, &cuda_graph))
|
|
3133
3258
|
return false;
|
|
3134
3259
|
|
|
3135
3260
|
cudaGraphNode_t body_node;
|
|
@@ -3139,16 +3264,16 @@ bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_grap
|
|
|
3139
3264
|
static_cast<cudaGraph_t>(child_graph))))
|
|
3140
3265
|
return false;
|
|
3141
3266
|
|
|
3142
|
-
if (!
|
|
3267
|
+
if (!wp_cuda_graph_resume_capture(context, cuda_stream, cuda_graph))
|
|
3143
3268
|
return false;
|
|
3144
3269
|
|
|
3145
|
-
if (!
|
|
3270
|
+
if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &body_node, 1, cudaStreamSetCaptureDependencies)))
|
|
3146
3271
|
return false;
|
|
3147
3272
|
|
|
3148
3273
|
return true;
|
|
3149
3274
|
}
|
|
3150
3275
|
|
|
3151
|
-
bool
|
|
3276
|
+
bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
|
|
3152
3277
|
{
|
|
3153
3278
|
// if there's no body, it's a no-op
|
|
3154
3279
|
if (!body_graph_ret)
|
|
@@ -3159,15 +3284,15 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
|
|
|
3159
3284
|
CUstream cuda_stream = static_cast<CUstream>(stream);
|
|
3160
3285
|
|
|
3161
3286
|
// Get the current stream capturing graph
|
|
3162
|
-
|
|
3287
|
+
CUstreamCaptureStatus capture_status = CU_STREAM_CAPTURE_STATUS_NONE;
|
|
3163
3288
|
cudaGraph_t cuda_graph = NULL;
|
|
3164
3289
|
const cudaGraphNode_t* capture_deps = NULL;
|
|
3165
3290
|
size_t dep_count = 0;
|
|
3166
|
-
if (!
|
|
3291
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
|
|
3167
3292
|
return false;
|
|
3168
3293
|
|
|
3169
3294
|
// abort if not capturing
|
|
3170
|
-
if (!cuda_graph || capture_status !=
|
|
3295
|
+
if (!cuda_graph || capture_status != CU_STREAM_CAPTURE_STATUS_ACTIVE)
|
|
3171
3296
|
{
|
|
3172
3297
|
wp::set_error_string("Stream is not capturing");
|
|
3173
3298
|
return false;
|
|
@@ -3192,19 +3317,20 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
|
|
|
3192
3317
|
if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
|
|
3193
3318
|
return false;
|
|
3194
3319
|
|
|
3195
|
-
if (!
|
|
3320
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
|
|
3196
3321
|
return false;
|
|
3197
3322
|
|
|
3198
3323
|
// insert conditional graph node
|
|
3199
|
-
|
|
3200
|
-
|
|
3324
|
+
CUgraphNode while_node;
|
|
3325
|
+
CUgraphNodeParams while_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
|
|
3201
3326
|
while_params.conditional.handle = handle;
|
|
3202
|
-
while_params.conditional.type =
|
|
3327
|
+
while_params.conditional.type = CU_GRAPH_COND_TYPE_WHILE;
|
|
3203
3328
|
while_params.conditional.size = 1;
|
|
3204
|
-
|
|
3329
|
+
while_params.conditional.ctx = get_current_context();
|
|
3330
|
+
if (!check_cu(cuGraphAddNode_f(&while_node, cuda_graph, capture_deps, NULL, dep_count, &while_params)))
|
|
3205
3331
|
return false;
|
|
3206
3332
|
|
|
3207
|
-
if (!
|
|
3333
|
+
if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &while_node, 1, cudaStreamSetCaptureDependencies)))
|
|
3208
3334
|
return false;
|
|
3209
3335
|
|
|
3210
3336
|
*body_graph_ret = while_params.conditional.phGraph_out[0];
|
|
@@ -3213,7 +3339,7 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
|
|
|
3213
3339
|
return true;
|
|
3214
3340
|
}
|
|
3215
3341
|
|
|
3216
|
-
bool
|
|
3342
|
+
bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
|
|
3217
3343
|
{
|
|
3218
3344
|
ContextGuard guard(context);
|
|
3219
3345
|
|
|
@@ -3240,37 +3366,43 @@ bool cuda_graph_set_condition(void* context, void* stream, int* condition, uint6
|
|
|
3240
3366
|
#else
|
|
3241
3367
|
// stubs for conditional graph node API if CUDA toolkit is too old.
|
|
3242
3368
|
|
|
3243
|
-
bool
|
|
3369
|
+
bool wp_cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
|
|
3244
3370
|
{
|
|
3245
3371
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3246
3372
|
return false;
|
|
3247
3373
|
}
|
|
3248
3374
|
|
|
3249
|
-
bool
|
|
3375
|
+
bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
|
|
3250
3376
|
{
|
|
3251
3377
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3252
3378
|
return false;
|
|
3253
3379
|
}
|
|
3254
3380
|
|
|
3255
|
-
bool
|
|
3381
|
+
bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
|
|
3256
3382
|
{
|
|
3257
3383
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3258
3384
|
return false;
|
|
3259
3385
|
}
|
|
3260
3386
|
|
|
3261
|
-
bool
|
|
3387
|
+
bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
|
|
3262
3388
|
{
|
|
3263
3389
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3264
3390
|
return false;
|
|
3265
3391
|
}
|
|
3266
3392
|
|
|
3267
|
-
bool
|
|
3393
|
+
bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
|
|
3268
3394
|
{
|
|
3269
3395
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3270
3396
|
return false;
|
|
3271
3397
|
}
|
|
3272
3398
|
|
|
3273
|
-
bool
|
|
3399
|
+
bool wp_cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
|
|
3400
|
+
{
|
|
3401
|
+
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3402
|
+
return false;
|
|
3403
|
+
}
|
|
3404
|
+
|
|
3405
|
+
bool wp_cuda_graph_check_conditional_body(void* body_graph)
|
|
3274
3406
|
{
|
|
3275
3407
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3276
3408
|
return false;
|
|
@@ -3279,7 +3411,7 @@ bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_grap
|
|
|
3279
3411
|
#endif // support for conditional graph nodes
|
|
3280
3412
|
|
|
3281
3413
|
|
|
3282
|
-
bool
|
|
3414
|
+
bool wp_cuda_graph_launch(void* graph_exec, void* stream)
|
|
3283
3415
|
{
|
|
3284
3416
|
// TODO: allow naming graphs?
|
|
3285
3417
|
begin_cuda_range(WP_TIMING_GRAPH, stream, get_stream_context(stream), "graph");
|
|
@@ -3291,14 +3423,14 @@ bool cuda_graph_launch(void* graph_exec, void* stream)
|
|
|
3291
3423
|
return result;
|
|
3292
3424
|
}
|
|
3293
3425
|
|
|
3294
|
-
bool
|
|
3426
|
+
bool wp_cuda_graph_destroy(void* context, void* graph)
|
|
3295
3427
|
{
|
|
3296
3428
|
ContextGuard guard(context);
|
|
3297
3429
|
|
|
3298
3430
|
return check_cuda(cudaGraphDestroy((cudaGraph_t)graph));
|
|
3299
3431
|
}
|
|
3300
3432
|
|
|
3301
|
-
bool
|
|
3433
|
+
bool wp_cuda_graph_exec_destroy(void* context, void* graph_exec)
|
|
3302
3434
|
{
|
|
3303
3435
|
ContextGuard guard(context);
|
|
3304
3436
|
|
|
@@ -3350,7 +3482,7 @@ bool write_file(const char* data, size_t size, std::string filename, const char*
|
|
|
3350
3482
|
}
|
|
3351
3483
|
#endif
|
|
3352
3484
|
|
|
3353
|
-
size_t
|
|
3485
|
+
size_t wp_cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, bool lineinfo, bool compile_time_trace, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes, int* ltoir_input_types)
|
|
3354
3486
|
{
|
|
3355
3487
|
// use file extension to determine whether to output PTX or CUBIN
|
|
3356
3488
|
const char* output_ext = strrchr(output_path, '.');
|
|
@@ -3406,9 +3538,9 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
|
|
|
3406
3538
|
{
|
|
3407
3539
|
opts.push_back("--define-macro=_DEBUG");
|
|
3408
3540
|
opts.push_back("--generate-line-info");
|
|
3409
|
-
|
|
3410
|
-
//
|
|
3411
|
-
|
|
3541
|
+
#ifndef _WIN32
|
|
3542
|
+
opts.push_back("--device-debug"); // -G
|
|
3543
|
+
#endif
|
|
3412
3544
|
}
|
|
3413
3545
|
else
|
|
3414
3546
|
{
|
|
@@ -3678,7 +3810,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
|
|
|
3678
3810
|
}
|
|
3679
3811
|
}
|
|
3680
3812
|
|
|
3681
|
-
bool
|
|
3813
|
+
bool wp_cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size)
|
|
3682
3814
|
{
|
|
3683
3815
|
|
|
3684
3816
|
CHECK_ANY(ltoir_output_path != nullptr);
|
|
@@ -3724,7 +3856,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
|
|
|
3724
3856
|
return res;
|
|
3725
3857
|
}
|
|
3726
3858
|
|
|
3727
|
-
bool
|
|
3859
|
+
bool wp_cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int arrangement_A, int arrangement_B, int arrangement_C, int num_threads)
|
|
3728
3860
|
{
|
|
3729
3861
|
|
|
3730
3862
|
CHECK_ANY(ltoir_output_path != nullptr);
|
|
@@ -3769,7 +3901,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
|
|
|
3769
3901
|
return res;
|
|
3770
3902
|
}
|
|
3771
3903
|
|
|
3772
|
-
bool
|
|
3904
|
+
bool wp_cuda_compile_solver(const char* fatbin_output_path, const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int NRHS, int function, int side, int diag, int precision, int arrangement_A, int arrangement_B, int fill_mode, int num_threads)
|
|
3773
3905
|
{
|
|
3774
3906
|
|
|
3775
3907
|
CHECK_ANY(ltoir_output_path != nullptr);
|
|
@@ -3832,7 +3964,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
|
|
|
3832
3964
|
|
|
3833
3965
|
#endif
|
|
3834
3966
|
|
|
3835
|
-
void*
|
|
3967
|
+
void* wp_cuda_load_module(void* context, const char* path)
|
|
3836
3968
|
{
|
|
3837
3969
|
ContextGuard guard(context);
|
|
3838
3970
|
|
|
@@ -3951,7 +4083,7 @@ void* cuda_load_module(void* context, const char* path)
|
|
|
3951
4083
|
return module;
|
|
3952
4084
|
}
|
|
3953
4085
|
|
|
3954
|
-
void
|
|
4086
|
+
void wp_cuda_unload_module(void* context, void* module)
|
|
3955
4087
|
{
|
|
3956
4088
|
// ensure there are no graph captures in progress
|
|
3957
4089
|
if (g_captures.empty())
|
|
@@ -3970,7 +4102,7 @@ void cuda_unload_module(void* context, void* module)
|
|
|
3970
4102
|
}
|
|
3971
4103
|
|
|
3972
4104
|
|
|
3973
|
-
int
|
|
4105
|
+
int wp_cuda_get_max_shared_memory(void* context)
|
|
3974
4106
|
{
|
|
3975
4107
|
ContextInfo* info = get_context_info(context);
|
|
3976
4108
|
if (!info)
|
|
@@ -3980,7 +4112,7 @@ int cuda_get_max_shared_memory(void* context)
|
|
|
3980
4112
|
return max_smem_bytes;
|
|
3981
4113
|
}
|
|
3982
4114
|
|
|
3983
|
-
bool
|
|
4115
|
+
bool wp_cuda_configure_kernel_shared_memory(void* kernel, int size)
|
|
3984
4116
|
{
|
|
3985
4117
|
int requested_smem_bytes = size;
|
|
3986
4118
|
|
|
@@ -3992,7 +4124,7 @@ bool cuda_configure_kernel_shared_memory(void* kernel, int size)
|
|
|
3992
4124
|
return true;
|
|
3993
4125
|
}
|
|
3994
4126
|
|
|
3995
|
-
void*
|
|
4127
|
+
void* wp_cuda_get_kernel(void* context, void* module, const char* name)
|
|
3996
4128
|
{
|
|
3997
4129
|
ContextGuard guard(context);
|
|
3998
4130
|
|
|
@@ -4007,7 +4139,7 @@ void* cuda_get_kernel(void* context, void* module, const char* name)
|
|
|
4007
4139
|
return kernel;
|
|
4008
4140
|
}
|
|
4009
4141
|
|
|
4010
|
-
size_t
|
|
4142
|
+
size_t wp_cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, int block_dim, int shared_memory_bytes, void** args, void* stream)
|
|
4011
4143
|
{
|
|
4012
4144
|
ContextGuard guard(context);
|
|
4013
4145
|
|
|
@@ -4061,21 +4193,21 @@ size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_block
|
|
|
4061
4193
|
return res;
|
|
4062
4194
|
}
|
|
4063
4195
|
|
|
4064
|
-
void
|
|
4196
|
+
void wp_cuda_graphics_map(void* context, void* resource)
|
|
4065
4197
|
{
|
|
4066
4198
|
ContextGuard guard(context);
|
|
4067
4199
|
|
|
4068
4200
|
check_cu(cuGraphicsMapResources_f(1, (CUgraphicsResource*)resource, get_current_stream()));
|
|
4069
4201
|
}
|
|
4070
4202
|
|
|
4071
|
-
void
|
|
4203
|
+
void wp_cuda_graphics_unmap(void* context, void* resource)
|
|
4072
4204
|
{
|
|
4073
4205
|
ContextGuard guard(context);
|
|
4074
4206
|
|
|
4075
4207
|
check_cu(cuGraphicsUnmapResources_f(1, (CUgraphicsResource*)resource, get_current_stream()));
|
|
4076
4208
|
}
|
|
4077
4209
|
|
|
4078
|
-
void
|
|
4210
|
+
void wp_cuda_graphics_device_ptr_and_size(void* context, void* resource, uint64_t* ptr, size_t* size)
|
|
4079
4211
|
{
|
|
4080
4212
|
ContextGuard guard(context);
|
|
4081
4213
|
|
|
@@ -4087,7 +4219,7 @@ void cuda_graphics_device_ptr_and_size(void* context, void* resource, uint64_t*
|
|
|
4087
4219
|
*size = bytes;
|
|
4088
4220
|
}
|
|
4089
4221
|
|
|
4090
|
-
void*
|
|
4222
|
+
void* wp_cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsigned int flags)
|
|
4091
4223
|
{
|
|
4092
4224
|
ContextGuard guard(context);
|
|
4093
4225
|
|
|
@@ -4102,7 +4234,7 @@ void* cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsign
|
|
|
4102
4234
|
return resource;
|
|
4103
4235
|
}
|
|
4104
4236
|
|
|
4105
|
-
void
|
|
4237
|
+
void wp_cuda_graphics_unregister_resource(void* context, void* resource)
|
|
4106
4238
|
{
|
|
4107
4239
|
ContextGuard guard(context);
|
|
4108
4240
|
|
|
@@ -4111,25 +4243,25 @@ void cuda_graphics_unregister_resource(void* context, void* resource)
|
|
|
4111
4243
|
delete res;
|
|
4112
4244
|
}
|
|
4113
4245
|
|
|
4114
|
-
void
|
|
4246
|
+
void wp_cuda_timing_begin(int flags)
|
|
4115
4247
|
{
|
|
4116
4248
|
g_cuda_timing_state = new CudaTimingState(flags, g_cuda_timing_state);
|
|
4117
4249
|
}
|
|
4118
4250
|
|
|
4119
|
-
int
|
|
4251
|
+
int wp_cuda_timing_get_result_count()
|
|
4120
4252
|
{
|
|
4121
4253
|
if (g_cuda_timing_state)
|
|
4122
4254
|
return int(g_cuda_timing_state->ranges.size());
|
|
4123
4255
|
return 0;
|
|
4124
4256
|
}
|
|
4125
4257
|
|
|
4126
|
-
void
|
|
4258
|
+
void wp_cuda_timing_end(timing_result_t* results, int size)
|
|
4127
4259
|
{
|
|
4128
4260
|
if (!g_cuda_timing_state)
|
|
4129
4261
|
return;
|
|
4130
4262
|
|
|
4131
4263
|
// number of results to write to the user buffer
|
|
4132
|
-
int count = std::min(
|
|
4264
|
+
int count = std::min(wp_cuda_timing_get_result_count(), size);
|
|
4133
4265
|
|
|
4134
4266
|
// compute timings and write results
|
|
4135
4267
|
for (int i = 0; i < count; i++)
|
|
@@ -4163,7 +4295,6 @@ void cuda_timing_end(timing_result_t* results, int size)
|
|
|
4163
4295
|
#include "reduce.cu"
|
|
4164
4296
|
#include "runlength_encode.cu"
|
|
4165
4297
|
#include "scan.cu"
|
|
4166
|
-
#include "marching.cu"
|
|
4167
4298
|
#include "sparse.cu"
|
|
4168
4299
|
#include "volume.cu"
|
|
4169
4300
|
#include "volume_builder.cu"
|