warp-lang 1.8.0__py3-none-manylinux_2_34_aarch64.whl → 1.9.0__py3-none-manylinux_2_34_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +282 -103
- warp/__init__.pyi +482 -110
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +93 -30
- warp/build_dll.py +48 -63
- warp/builtins.py +955 -137
- warp/codegen.py +327 -209
- warp/config.py +1 -1
- warp/context.py +1363 -800
- warp/examples/core/example_marching_cubes.py +1 -0
- warp/examples/core/example_render_opengl.py +100 -3
- warp/examples/fem/example_apic_fluid.py +98 -52
- warp/examples/fem/example_convection_diffusion_dg.py +25 -4
- warp/examples/fem/example_diffusion_mgpu.py +8 -3
- warp/examples/fem/utils.py +68 -22
- warp/examples/interop/example_jax_callable.py +34 -4
- warp/examples/interop/example_jax_kernel.py +27 -1
- warp/fabric.py +1 -1
- warp/fem/cache.py +27 -19
- warp/fem/domain.py +2 -2
- warp/fem/field/nodal_field.py +2 -2
- warp/fem/field/virtual.py +266 -166
- warp/fem/geometry/geometry.py +5 -5
- warp/fem/integrate.py +200 -91
- warp/fem/space/restriction.py +4 -0
- warp/fem/space/shape/tet_shape_function.py +3 -10
- warp/jax_experimental/custom_call.py +1 -1
- warp/jax_experimental/ffi.py +203 -54
- warp/marching_cubes.py +708 -0
- warp/native/array.h +103 -8
- warp/native/builtin.h +90 -9
- warp/native/bvh.cpp +64 -28
- warp/native/bvh.cu +58 -58
- warp/native/bvh.h +2 -2
- warp/native/clang/clang.cpp +7 -7
- warp/native/coloring.cpp +13 -3
- warp/native/crt.cpp +2 -2
- warp/native/crt.h +3 -5
- warp/native/cuda_util.cpp +42 -11
- warp/native/cuda_util.h +10 -4
- warp/native/exports.h +1842 -1908
- warp/native/fabric.h +2 -1
- warp/native/hashgrid.cpp +37 -37
- warp/native/hashgrid.cu +2 -2
- warp/native/initializer_array.h +1 -1
- warp/native/intersect.h +4 -4
- warp/native/mat.h +1913 -119
- warp/native/mathdx.cpp +43 -43
- warp/native/mesh.cpp +24 -24
- warp/native/mesh.cu +26 -26
- warp/native/mesh.h +5 -3
- warp/native/nanovdb/GridHandle.h +179 -12
- warp/native/nanovdb/HostBuffer.h +8 -7
- warp/native/nanovdb/NanoVDB.h +517 -895
- warp/native/nanovdb/NodeManager.h +323 -0
- warp/native/nanovdb/PNanoVDB.h +2 -2
- warp/native/quat.h +337 -16
- warp/native/rand.h +7 -7
- warp/native/range.h +7 -1
- warp/native/reduce.cpp +10 -10
- warp/native/reduce.cu +13 -14
- warp/native/runlength_encode.cpp +2 -2
- warp/native/runlength_encode.cu +5 -5
- warp/native/scan.cpp +3 -3
- warp/native/scan.cu +4 -4
- warp/native/sort.cpp +10 -10
- warp/native/sort.cu +22 -22
- warp/native/sparse.cpp +8 -8
- warp/native/sparse.cu +14 -14
- warp/native/spatial.h +366 -17
- warp/native/svd.h +23 -8
- warp/native/temp_buffer.h +2 -2
- warp/native/tile.h +303 -70
- warp/native/tile_radix_sort.h +5 -1
- warp/native/tile_reduce.h +16 -25
- warp/native/tuple.h +2 -2
- warp/native/vec.h +385 -18
- warp/native/volume.cpp +54 -54
- warp/native/volume.cu +1 -1
- warp/native/volume.h +2 -1
- warp/native/volume_builder.cu +30 -37
- warp/native/warp.cpp +150 -149
- warp/native/warp.cu +337 -193
- warp/native/warp.h +227 -226
- warp/optim/linear.py +736 -271
- warp/render/imgui_manager.py +289 -0
- warp/render/render_opengl.py +137 -57
- warp/render/render_usd.py +0 -1
- warp/sim/collide.py +1 -2
- warp/sim/graph_coloring.py +2 -2
- warp/sim/integrator_vbd.py +10 -2
- warp/sparse.py +559 -176
- warp/tape.py +2 -0
- warp/tests/aux_test_module_aot.py +7 -0
- warp/tests/cuda/test_async.py +3 -3
- warp/tests/cuda/test_conditional_captures.py +101 -0
- warp/tests/geometry/test_marching_cubes.py +233 -12
- warp/tests/sim/test_cloth.py +89 -6
- warp/tests/sim/test_coloring.py +82 -7
- warp/tests/test_array.py +56 -5
- warp/tests/test_assert.py +53 -0
- warp/tests/test_atomic_cas.py +127 -114
- warp/tests/test_codegen.py +3 -2
- warp/tests/test_context.py +8 -15
- warp/tests/test_enum.py +136 -0
- warp/tests/test_examples.py +2 -2
- warp/tests/test_fem.py +45 -2
- warp/tests/test_fixedarray.py +229 -0
- warp/tests/test_func.py +18 -15
- warp/tests/test_future_annotations.py +7 -5
- warp/tests/test_linear_solvers.py +30 -0
- warp/tests/test_map.py +1 -1
- warp/tests/test_mat.py +1540 -378
- warp/tests/test_mat_assign_copy.py +178 -0
- warp/tests/test_mat_constructors.py +574 -0
- warp/tests/test_module_aot.py +287 -0
- warp/tests/test_print.py +69 -0
- warp/tests/test_quat.py +162 -34
- warp/tests/test_quat_assign_copy.py +145 -0
- warp/tests/test_reload.py +2 -1
- warp/tests/test_sparse.py +103 -0
- warp/tests/test_spatial.py +140 -34
- warp/tests/test_spatial_assign_copy.py +160 -0
- warp/tests/test_static.py +48 -0
- warp/tests/test_struct.py +43 -3
- warp/tests/test_tape.py +38 -0
- warp/tests/test_types.py +0 -20
- warp/tests/test_vec.py +216 -441
- warp/tests/test_vec_assign_copy.py +143 -0
- warp/tests/test_vec_constructors.py +325 -0
- warp/tests/tile/test_tile.py +206 -152
- warp/tests/tile/test_tile_cholesky.py +605 -0
- warp/tests/tile/test_tile_load.py +169 -0
- warp/tests/tile/test_tile_mathdx.py +2 -558
- warp/tests/tile/test_tile_matmul.py +179 -0
- warp/tests/tile/test_tile_mlp.py +1 -1
- warp/tests/tile/test_tile_reduce.py +100 -11
- warp/tests/tile/test_tile_shared_memory.py +16 -16
- warp/tests/tile/test_tile_sort.py +59 -55
- warp/tests/unittest_suites.py +16 -0
- warp/tests/walkthrough_debug.py +1 -1
- warp/thirdparty/unittest_parallel.py +108 -9
- warp/types.py +554 -264
- warp/utils.py +68 -86
- {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
- {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/RECORD +150 -138
- warp/native/marching.cpp +0 -19
- warp/native/marching.cu +0 -514
- warp/native/marching.h +0 -19
- {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
warp/native/warp.cu
CHANGED
|
@@ -168,7 +168,7 @@ struct ContextInfo
|
|
|
168
168
|
{
|
|
169
169
|
DeviceInfo* device_info = NULL;
|
|
170
170
|
|
|
171
|
-
// the current stream, managed from Python (see cuda_context_set_stream() and cuda_context_get_stream())
|
|
171
|
+
// the current stream, managed from Python (see wp_cuda_context_set_stream() and wp_cuda_context_get_stream())
|
|
172
172
|
CUstream stream = NULL;
|
|
173
173
|
|
|
174
174
|
// conditional graph node support, loaded on demand if the driver supports it (CUDA 12.4+)
|
|
@@ -237,11 +237,11 @@ static std::unordered_map<CUstream, StreamInfo> g_streams;
|
|
|
237
237
|
|
|
238
238
|
// Ongoing graph captures registered using wp.capture_begin().
|
|
239
239
|
// This maps the capture id to the stream where capture was started.
|
|
240
|
-
// See cuda_graph_begin_capture(), cuda_graph_end_capture(), and free_device_async().
|
|
240
|
+
// See wp_cuda_graph_begin_capture(), wp_cuda_graph_end_capture(), and wp_free_device_async().
|
|
241
241
|
static std::unordered_map<uint64_t, CaptureInfo*> g_captures;
|
|
242
242
|
|
|
243
243
|
// Memory allocated during graph capture requires special handling.
|
|
244
|
-
// See alloc_device_async() and free_device_async().
|
|
244
|
+
// See wp_alloc_device_async() and wp_free_device_async().
|
|
245
245
|
static std::unordered_map<void*, GraphAllocInfo> g_graph_allocs;
|
|
246
246
|
|
|
247
247
|
// Memory that cannot be freed immediately gets queued here.
|
|
@@ -252,12 +252,12 @@ static std::vector<FreeInfo> g_deferred_free_list;
|
|
|
252
252
|
// Call unload_deferred_modules() to release.
|
|
253
253
|
static std::vector<ModuleInfo> g_deferred_module_list;
|
|
254
254
|
|
|
255
|
-
void
|
|
255
|
+
void wp_cuda_set_context_restore_policy(bool always_restore)
|
|
256
256
|
{
|
|
257
257
|
ContextGuard::always_restore = always_restore;
|
|
258
258
|
}
|
|
259
259
|
|
|
260
|
-
int
|
|
260
|
+
int wp_cuda_get_context_restore_policy()
|
|
261
261
|
{
|
|
262
262
|
return int(ContextGuard::always_restore);
|
|
263
263
|
}
|
|
@@ -309,7 +309,13 @@ int cuda_init()
|
|
|
309
309
|
check_cu(cuDeviceGetAttribute_f(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
|
|
310
310
|
check_cu(cuDeviceGetAttribute_f(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
|
|
311
311
|
g_devices[i].arch = 10 * major + minor;
|
|
312
|
-
|
|
312
|
+
#ifdef CUDA_VERSION
|
|
313
|
+
#if CUDA_VERSION < 13000
|
|
314
|
+
if (g_devices[i].arch == 110) {
|
|
315
|
+
g_devices[i].arch = 101; // Thor SM change
|
|
316
|
+
}
|
|
317
|
+
#endif
|
|
318
|
+
#endif
|
|
313
319
|
g_device_map[device] = &g_devices[i];
|
|
314
320
|
}
|
|
315
321
|
else
|
|
@@ -342,7 +348,7 @@ static inline CUcontext get_current_context()
|
|
|
342
348
|
|
|
343
349
|
static inline CUstream get_current_stream(void* context=NULL)
|
|
344
350
|
{
|
|
345
|
-
return static_cast<CUstream>(
|
|
351
|
+
return static_cast<CUstream>(wp_cuda_context_get_stream(context));
|
|
346
352
|
}
|
|
347
353
|
|
|
348
354
|
static ContextInfo* get_context_info(CUcontext ctx)
|
|
@@ -475,7 +481,7 @@ static int unload_deferred_modules(void* context = NULL)
|
|
|
475
481
|
const ModuleInfo& module_info = *it;
|
|
476
482
|
if (module_info.context == context || !context)
|
|
477
483
|
{
|
|
478
|
-
|
|
484
|
+
wp_cuda_unload_module(module_info.context, module_info.module);
|
|
479
485
|
++num_unloaded_modules;
|
|
480
486
|
it = g_deferred_module_list.erase(it);
|
|
481
487
|
}
|
|
@@ -529,41 +535,41 @@ static inline const char* get_cuda_kernel_name(void* kernel)
|
|
|
529
535
|
}
|
|
530
536
|
|
|
531
537
|
|
|
532
|
-
void*
|
|
538
|
+
void* wp_alloc_pinned(size_t s)
|
|
533
539
|
{
|
|
534
540
|
void* ptr = NULL;
|
|
535
541
|
check_cuda(cudaMallocHost(&ptr, s));
|
|
536
542
|
return ptr;
|
|
537
543
|
}
|
|
538
544
|
|
|
539
|
-
void
|
|
545
|
+
void wp_free_pinned(void* ptr)
|
|
540
546
|
{
|
|
541
547
|
cudaFreeHost(ptr);
|
|
542
548
|
}
|
|
543
549
|
|
|
544
|
-
void*
|
|
550
|
+
void* wp_alloc_device(void* context, size_t s)
|
|
545
551
|
{
|
|
546
|
-
int ordinal =
|
|
552
|
+
int ordinal = wp_cuda_context_get_device_ordinal(context);
|
|
547
553
|
|
|
548
554
|
// use stream-ordered allocator if available
|
|
549
|
-
if (
|
|
550
|
-
return
|
|
555
|
+
if (wp_cuda_device_is_mempool_supported(ordinal))
|
|
556
|
+
return wp_alloc_device_async(context, s);
|
|
551
557
|
else
|
|
552
|
-
return
|
|
558
|
+
return wp_alloc_device_default(context, s);
|
|
553
559
|
}
|
|
554
560
|
|
|
555
|
-
void
|
|
561
|
+
void wp_free_device(void* context, void* ptr)
|
|
556
562
|
{
|
|
557
|
-
int ordinal =
|
|
563
|
+
int ordinal = wp_cuda_context_get_device_ordinal(context);
|
|
558
564
|
|
|
559
565
|
// use stream-ordered allocator if available
|
|
560
|
-
if (
|
|
561
|
-
|
|
566
|
+
if (wp_cuda_device_is_mempool_supported(ordinal))
|
|
567
|
+
wp_free_device_async(context, ptr);
|
|
562
568
|
else
|
|
563
|
-
|
|
569
|
+
wp_free_device_default(context, ptr);
|
|
564
570
|
}
|
|
565
571
|
|
|
566
|
-
void*
|
|
572
|
+
void* wp_alloc_device_default(void* context, size_t s)
|
|
567
573
|
{
|
|
568
574
|
ContextGuard guard(context);
|
|
569
575
|
|
|
@@ -573,7 +579,7 @@ void* alloc_device_default(void* context, size_t s)
|
|
|
573
579
|
return ptr;
|
|
574
580
|
}
|
|
575
581
|
|
|
576
|
-
void
|
|
582
|
+
void wp_free_device_default(void* context, void* ptr)
|
|
577
583
|
{
|
|
578
584
|
ContextGuard guard(context);
|
|
579
585
|
|
|
@@ -589,7 +595,7 @@ void free_device_default(void* context, void* ptr)
|
|
|
589
595
|
}
|
|
590
596
|
}
|
|
591
597
|
|
|
592
|
-
void*
|
|
598
|
+
void* wp_alloc_device_async(void* context, size_t s)
|
|
593
599
|
{
|
|
594
600
|
// stream-ordered allocations don't rely on the current context,
|
|
595
601
|
// but we set the context here for consistent behaviour
|
|
@@ -607,7 +613,7 @@ void* alloc_device_async(void* context, size_t s)
|
|
|
607
613
|
if (ptr)
|
|
608
614
|
{
|
|
609
615
|
// if the stream is capturing, the allocation requires special handling
|
|
610
|
-
if (
|
|
616
|
+
if (wp_cuda_stream_is_capturing(stream))
|
|
611
617
|
{
|
|
612
618
|
// check if this is a known capture
|
|
613
619
|
uint64_t capture_id = get_capture_id(stream);
|
|
@@ -628,7 +634,7 @@ void* alloc_device_async(void* context, size_t s)
|
|
|
628
634
|
return ptr;
|
|
629
635
|
}
|
|
630
636
|
|
|
631
|
-
void
|
|
637
|
+
void wp_free_device_async(void* context, void* ptr)
|
|
632
638
|
{
|
|
633
639
|
// stream-ordered allocators generally don't rely on the current context,
|
|
634
640
|
// but we set the context here for consistent behaviour
|
|
@@ -726,7 +732,7 @@ void free_device_async(void* context, void* ptr)
|
|
|
726
732
|
}
|
|
727
733
|
}
|
|
728
734
|
|
|
729
|
-
bool
|
|
735
|
+
bool wp_memcpy_h2d(void* context, void* dest, void* src, size_t n, void* stream)
|
|
730
736
|
{
|
|
731
737
|
ContextGuard guard(context);
|
|
732
738
|
|
|
@@ -745,7 +751,7 @@ bool memcpy_h2d(void* context, void* dest, void* src, size_t n, void* stream)
|
|
|
745
751
|
return result;
|
|
746
752
|
}
|
|
747
753
|
|
|
748
|
-
bool
|
|
754
|
+
bool wp_memcpy_d2h(void* context, void* dest, void* src, size_t n, void* stream)
|
|
749
755
|
{
|
|
750
756
|
ContextGuard guard(context);
|
|
751
757
|
|
|
@@ -764,7 +770,7 @@ bool memcpy_d2h(void* context, void* dest, void* src, size_t n, void* stream)
|
|
|
764
770
|
return result;
|
|
765
771
|
}
|
|
766
772
|
|
|
767
|
-
bool
|
|
773
|
+
bool wp_memcpy_d2d(void* context, void* dest, void* src, size_t n, void* stream)
|
|
768
774
|
{
|
|
769
775
|
ContextGuard guard(context);
|
|
770
776
|
|
|
@@ -783,7 +789,7 @@ bool memcpy_d2d(void* context, void* dest, void* src, size_t n, void* stream)
|
|
|
783
789
|
return result;
|
|
784
790
|
}
|
|
785
791
|
|
|
786
|
-
bool
|
|
792
|
+
bool wp_memcpy_p2p(void* dst_context, void* dst, void* src_context, void* src, size_t n, void* stream)
|
|
787
793
|
{
|
|
788
794
|
// ContextGuard guard(context);
|
|
789
795
|
|
|
@@ -803,7 +809,7 @@ bool memcpy_p2p(void* dst_context, void* dst, void* src_context, void* src, size
|
|
|
803
809
|
// because cudaMemPoolGetAccess() cannot be called during graph capture.
|
|
804
810
|
// - CUDA will report error 1 (invalid argument) if cudaMemcpyAsync() is called but mempool access is not enabled.
|
|
805
811
|
|
|
806
|
-
if (!
|
|
812
|
+
if (!wp_cuda_stream_is_capturing(stream))
|
|
807
813
|
{
|
|
808
814
|
begin_cuda_range(WP_TIMING_MEMCPY, cuda_stream, get_stream_context(stream), "memcpy PtoP");
|
|
809
815
|
|
|
@@ -890,7 +896,7 @@ __global__ void memset_kernel(int* dest, int value, size_t n)
|
|
|
890
896
|
}
|
|
891
897
|
}
|
|
892
898
|
|
|
893
|
-
void
|
|
899
|
+
void wp_memset_device(void* context, void* dest, int value, size_t n)
|
|
894
900
|
{
|
|
895
901
|
ContextGuard guard(context);
|
|
896
902
|
|
|
@@ -934,7 +940,7 @@ __global__ void memtile_value_kernel(T* dst, T value, size_t n)
|
|
|
934
940
|
}
|
|
935
941
|
}
|
|
936
942
|
|
|
937
|
-
void
|
|
943
|
+
void wp_memtile_device(void* context, void* dst, const void* src, size_t srcsize, size_t n)
|
|
938
944
|
{
|
|
939
945
|
ContextGuard guard(context);
|
|
940
946
|
|
|
@@ -970,12 +976,12 @@ void memtile_device(void* context, void* dst, const void* src, size_t srcsize, s
|
|
|
970
976
|
|
|
971
977
|
// copy value to device memory
|
|
972
978
|
// TODO: use a persistent stream-local staging buffer to avoid allocs?
|
|
973
|
-
void* src_devptr =
|
|
979
|
+
void* src_devptr = wp_alloc_device(WP_CURRENT_CONTEXT, srcsize);
|
|
974
980
|
check_cuda(cudaMemcpyAsync(src_devptr, src, srcsize, cudaMemcpyHostToDevice, get_current_stream()));
|
|
975
981
|
|
|
976
982
|
wp_launch_device(WP_CURRENT_CONTEXT, memtile_kernel, n, (dst, src_devptr, srcsize, n));
|
|
977
983
|
|
|
978
|
-
|
|
984
|
+
wp_free_device(WP_CURRENT_CONTEXT, src_devptr);
|
|
979
985
|
|
|
980
986
|
}
|
|
981
987
|
}
|
|
@@ -1202,7 +1208,7 @@ static __global__ void array_copy_fabric_indexed_to_fabric_indexed_kernel(wp::in
|
|
|
1202
1208
|
}
|
|
1203
1209
|
|
|
1204
1210
|
|
|
1205
|
-
WP_API bool
|
|
1211
|
+
WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_type, int src_type, int elem_size)
|
|
1206
1212
|
{
|
|
1207
1213
|
if (!src || !dst)
|
|
1208
1214
|
return false;
|
|
@@ -1594,7 +1600,7 @@ static __global__ void array_fill_fabric_indexed_kernel(wp::indexedfabricarray_t
|
|
|
1594
1600
|
}
|
|
1595
1601
|
|
|
1596
1602
|
|
|
1597
|
-
WP_API void
|
|
1603
|
+
WP_API void wp_array_fill_device(void* context, void* arr_ptr, int arr_type, const void* value_ptr, int value_size)
|
|
1598
1604
|
{
|
|
1599
1605
|
if (!arr_ptr || !value_ptr)
|
|
1600
1606
|
return;
|
|
@@ -1650,7 +1656,7 @@ WP_API void array_fill_device(void* context, void* arr_ptr, int arr_type, const
|
|
|
1650
1656
|
|
|
1651
1657
|
// copy value to device memory
|
|
1652
1658
|
// TODO: use a persistent stream-local staging buffer to avoid allocs?
|
|
1653
|
-
void* value_devptr =
|
|
1659
|
+
void* value_devptr = wp_alloc_device(WP_CURRENT_CONTEXT, value_size);
|
|
1654
1660
|
check_cuda(cudaMemcpyAsync(value_devptr, value_ptr, value_size, cudaMemcpyHostToDevice, get_current_stream()));
|
|
1655
1661
|
|
|
1656
1662
|
// handle fabric arrays
|
|
@@ -1708,20 +1714,20 @@ WP_API void array_fill_device(void* context, void* arr_ptr, int arr_type, const
|
|
|
1708
1714
|
return;
|
|
1709
1715
|
}
|
|
1710
1716
|
|
|
1711
|
-
|
|
1717
|
+
wp_free_device(WP_CURRENT_CONTEXT, value_devptr);
|
|
1712
1718
|
}
|
|
1713
1719
|
|
|
1714
|
-
void
|
|
1720
|
+
void wp_array_scan_int_device(uint64_t in, uint64_t out, int len, bool inclusive)
|
|
1715
1721
|
{
|
|
1716
1722
|
scan_device((const int*)in, (int*)out, len, inclusive);
|
|
1717
1723
|
}
|
|
1718
1724
|
|
|
1719
|
-
void
|
|
1725
|
+
void wp_array_scan_float_device(uint64_t in, uint64_t out, int len, bool inclusive)
|
|
1720
1726
|
{
|
|
1721
1727
|
scan_device((const float*)in, (float*)out, len, inclusive);
|
|
1722
1728
|
}
|
|
1723
1729
|
|
|
1724
|
-
int
|
|
1730
|
+
int wp_cuda_driver_version()
|
|
1725
1731
|
{
|
|
1726
1732
|
int version;
|
|
1727
1733
|
if (check_cu(cuDriverGetVersion_f(&version)))
|
|
@@ -1730,17 +1736,17 @@ int cuda_driver_version()
|
|
|
1730
1736
|
return 0;
|
|
1731
1737
|
}
|
|
1732
1738
|
|
|
1733
|
-
int
|
|
1739
|
+
int wp_cuda_toolkit_version()
|
|
1734
1740
|
{
|
|
1735
1741
|
return CUDA_VERSION;
|
|
1736
1742
|
}
|
|
1737
1743
|
|
|
1738
|
-
bool
|
|
1744
|
+
bool wp_cuda_driver_is_initialized()
|
|
1739
1745
|
{
|
|
1740
1746
|
return is_cuda_driver_initialized();
|
|
1741
1747
|
}
|
|
1742
1748
|
|
|
1743
|
-
int
|
|
1749
|
+
int wp_nvrtc_supported_arch_count()
|
|
1744
1750
|
{
|
|
1745
1751
|
int count;
|
|
1746
1752
|
if (check_nvrtc(nvrtcGetNumSupportedArchs(&count)))
|
|
@@ -1749,7 +1755,7 @@ int nvrtc_supported_arch_count()
|
|
|
1749
1755
|
return 0;
|
|
1750
1756
|
}
|
|
1751
1757
|
|
|
1752
|
-
void
|
|
1758
|
+
void wp_nvrtc_supported_archs(int* archs)
|
|
1753
1759
|
{
|
|
1754
1760
|
if (archs)
|
|
1755
1761
|
{
|
|
@@ -1757,14 +1763,14 @@ void nvrtc_supported_archs(int* archs)
|
|
|
1757
1763
|
}
|
|
1758
1764
|
}
|
|
1759
1765
|
|
|
1760
|
-
int
|
|
1766
|
+
int wp_cuda_device_get_count()
|
|
1761
1767
|
{
|
|
1762
1768
|
int count = 0;
|
|
1763
1769
|
check_cu(cuDeviceGetCount_f(&count));
|
|
1764
1770
|
return count;
|
|
1765
1771
|
}
|
|
1766
1772
|
|
|
1767
|
-
void*
|
|
1773
|
+
void* wp_cuda_device_get_primary_context(int ordinal)
|
|
1768
1774
|
{
|
|
1769
1775
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1770
1776
|
{
|
|
@@ -1780,75 +1786,75 @@ void* cuda_device_get_primary_context(int ordinal)
|
|
|
1780
1786
|
return NULL;
|
|
1781
1787
|
}
|
|
1782
1788
|
|
|
1783
|
-
const char*
|
|
1789
|
+
const char* wp_cuda_device_get_name(int ordinal)
|
|
1784
1790
|
{
|
|
1785
1791
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1786
1792
|
return g_devices[ordinal].name;
|
|
1787
1793
|
return NULL;
|
|
1788
1794
|
}
|
|
1789
1795
|
|
|
1790
|
-
int
|
|
1796
|
+
int wp_cuda_device_get_arch(int ordinal)
|
|
1791
1797
|
{
|
|
1792
1798
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1793
1799
|
return g_devices[ordinal].arch;
|
|
1794
1800
|
return 0;
|
|
1795
1801
|
}
|
|
1796
1802
|
|
|
1797
|
-
int
|
|
1803
|
+
int wp_cuda_device_get_sm_count(int ordinal)
|
|
1798
1804
|
{
|
|
1799
1805
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1800
1806
|
return g_devices[ordinal].sm_count;
|
|
1801
1807
|
return 0;
|
|
1802
1808
|
}
|
|
1803
1809
|
|
|
1804
|
-
void
|
|
1810
|
+
void wp_cuda_device_get_uuid(int ordinal, char uuid[16])
|
|
1805
1811
|
{
|
|
1806
1812
|
memcpy(uuid, g_devices[ordinal].uuid.bytes, sizeof(char)*16);
|
|
1807
1813
|
}
|
|
1808
1814
|
|
|
1809
|
-
int
|
|
1815
|
+
int wp_cuda_device_get_pci_domain_id(int ordinal)
|
|
1810
1816
|
{
|
|
1811
1817
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1812
1818
|
return g_devices[ordinal].pci_domain_id;
|
|
1813
1819
|
return -1;
|
|
1814
1820
|
}
|
|
1815
1821
|
|
|
1816
|
-
int
|
|
1822
|
+
int wp_cuda_device_get_pci_bus_id(int ordinal)
|
|
1817
1823
|
{
|
|
1818
1824
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1819
1825
|
return g_devices[ordinal].pci_bus_id;
|
|
1820
1826
|
return -1;
|
|
1821
1827
|
}
|
|
1822
1828
|
|
|
1823
|
-
int
|
|
1829
|
+
int wp_cuda_device_get_pci_device_id(int ordinal)
|
|
1824
1830
|
{
|
|
1825
1831
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1826
1832
|
return g_devices[ordinal].pci_device_id;
|
|
1827
1833
|
return -1;
|
|
1828
1834
|
}
|
|
1829
1835
|
|
|
1830
|
-
int
|
|
1836
|
+
int wp_cuda_device_is_uva(int ordinal)
|
|
1831
1837
|
{
|
|
1832
1838
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1833
1839
|
return g_devices[ordinal].is_uva;
|
|
1834
1840
|
return 0;
|
|
1835
1841
|
}
|
|
1836
1842
|
|
|
1837
|
-
int
|
|
1843
|
+
int wp_cuda_device_is_mempool_supported(int ordinal)
|
|
1838
1844
|
{
|
|
1839
1845
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1840
1846
|
return g_devices[ordinal].is_mempool_supported;
|
|
1841
1847
|
return 0;
|
|
1842
1848
|
}
|
|
1843
1849
|
|
|
1844
|
-
int
|
|
1850
|
+
int wp_cuda_device_is_ipc_supported(int ordinal)
|
|
1845
1851
|
{
|
|
1846
1852
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1847
1853
|
return g_devices[ordinal].is_ipc_supported;
|
|
1848
1854
|
return 0;
|
|
1849
1855
|
}
|
|
1850
1856
|
|
|
1851
|
-
int
|
|
1857
|
+
int wp_cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold)
|
|
1852
1858
|
{
|
|
1853
1859
|
if (ordinal < 0 || ordinal > int(g_devices.size()))
|
|
1854
1860
|
{
|
|
@@ -1875,7 +1881,7 @@ int cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold)
|
|
|
1875
1881
|
return 1; // success
|
|
1876
1882
|
}
|
|
1877
1883
|
|
|
1878
|
-
uint64_t
|
|
1884
|
+
uint64_t wp_cuda_device_get_mempool_release_threshold(int ordinal)
|
|
1879
1885
|
{
|
|
1880
1886
|
if (ordinal < 0 || ordinal > int(g_devices.size()))
|
|
1881
1887
|
{
|
|
@@ -1903,7 +1909,7 @@ uint64_t cuda_device_get_mempool_release_threshold(int ordinal)
|
|
|
1903
1909
|
return threshold;
|
|
1904
1910
|
}
|
|
1905
1911
|
|
|
1906
|
-
uint64_t
|
|
1912
|
+
uint64_t wp_cuda_device_get_mempool_used_mem_current(int ordinal)
|
|
1907
1913
|
{
|
|
1908
1914
|
if (ordinal < 0 || ordinal > int(g_devices.size()))
|
|
1909
1915
|
{
|
|
@@ -1931,7 +1937,7 @@ uint64_t cuda_device_get_mempool_used_mem_current(int ordinal)
|
|
|
1931
1937
|
return mem_used;
|
|
1932
1938
|
}
|
|
1933
1939
|
|
|
1934
|
-
uint64_t
|
|
1940
|
+
uint64_t wp_cuda_device_get_mempool_used_mem_high(int ordinal)
|
|
1935
1941
|
{
|
|
1936
1942
|
if (ordinal < 0 || ordinal > int(g_devices.size()))
|
|
1937
1943
|
{
|
|
@@ -1959,7 +1965,7 @@ uint64_t cuda_device_get_mempool_used_mem_high(int ordinal)
|
|
|
1959
1965
|
return mem_high_water_mark;
|
|
1960
1966
|
}
|
|
1961
1967
|
|
|
1962
|
-
void
|
|
1968
|
+
void wp_cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_mem)
|
|
1963
1969
|
{
|
|
1964
1970
|
// use temporary storage if user didn't specify pointers
|
|
1965
1971
|
size_t tmp_free_mem, tmp_total_mem;
|
|
@@ -1996,12 +2002,12 @@ void cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_me
|
|
|
1996
2002
|
}
|
|
1997
2003
|
|
|
1998
2004
|
|
|
1999
|
-
void*
|
|
2005
|
+
void* wp_cuda_context_get_current()
|
|
2000
2006
|
{
|
|
2001
2007
|
return get_current_context();
|
|
2002
2008
|
}
|
|
2003
2009
|
|
|
2004
|
-
void
|
|
2010
|
+
void wp_cuda_context_set_current(void* context)
|
|
2005
2011
|
{
|
|
2006
2012
|
CUcontext ctx = static_cast<CUcontext>(context);
|
|
2007
2013
|
CUcontext prev_ctx = NULL;
|
|
@@ -2012,18 +2018,18 @@ void cuda_context_set_current(void* context)
|
|
|
2012
2018
|
}
|
|
2013
2019
|
}
|
|
2014
2020
|
|
|
2015
|
-
void
|
|
2021
|
+
void wp_cuda_context_push_current(void* context)
|
|
2016
2022
|
{
|
|
2017
2023
|
check_cu(cuCtxPushCurrent_f(static_cast<CUcontext>(context)));
|
|
2018
2024
|
}
|
|
2019
2025
|
|
|
2020
|
-
void
|
|
2026
|
+
void wp_cuda_context_pop_current()
|
|
2021
2027
|
{
|
|
2022
2028
|
CUcontext context;
|
|
2023
2029
|
check_cu(cuCtxPopCurrent_f(&context));
|
|
2024
2030
|
}
|
|
2025
2031
|
|
|
2026
|
-
void*
|
|
2032
|
+
void* wp_cuda_context_create(int device_ordinal)
|
|
2027
2033
|
{
|
|
2028
2034
|
CUcontext ctx = NULL;
|
|
2029
2035
|
CUdevice device;
|
|
@@ -2032,15 +2038,15 @@ void* cuda_context_create(int device_ordinal)
|
|
|
2032
2038
|
return ctx;
|
|
2033
2039
|
}
|
|
2034
2040
|
|
|
2035
|
-
void
|
|
2041
|
+
void wp_cuda_context_destroy(void* context)
|
|
2036
2042
|
{
|
|
2037
2043
|
if (context)
|
|
2038
2044
|
{
|
|
2039
2045
|
CUcontext ctx = static_cast<CUcontext>(context);
|
|
2040
2046
|
|
|
2041
2047
|
// ensure this is not the current context
|
|
2042
|
-
if (ctx ==
|
|
2043
|
-
|
|
2048
|
+
if (ctx == wp_cuda_context_get_current())
|
|
2049
|
+
wp_cuda_context_set_current(NULL);
|
|
2044
2050
|
|
|
2045
2051
|
// release the cached info about this context
|
|
2046
2052
|
ContextInfo* info = get_context_info(ctx);
|
|
@@ -2059,7 +2065,7 @@ void cuda_context_destroy(void* context)
|
|
|
2059
2065
|
}
|
|
2060
2066
|
}
|
|
2061
2067
|
|
|
2062
|
-
void
|
|
2068
|
+
void wp_cuda_context_synchronize(void* context)
|
|
2063
2069
|
{
|
|
2064
2070
|
ContextGuard guard(context);
|
|
2065
2071
|
|
|
@@ -2073,10 +2079,10 @@ void cuda_context_synchronize(void* context)
|
|
|
2073
2079
|
|
|
2074
2080
|
unload_deferred_modules(context);
|
|
2075
2081
|
|
|
2076
|
-
// check_cuda(cudaDeviceGraphMemTrim(cuda_context_get_device_ordinal(context)));
|
|
2082
|
+
// check_cuda(cudaDeviceGraphMemTrim(wp_cuda_context_get_device_ordinal(context)));
|
|
2077
2083
|
}
|
|
2078
2084
|
|
|
2079
|
-
uint64_t
|
|
2085
|
+
uint64_t wp_cuda_context_check(void* context)
|
|
2080
2086
|
{
|
|
2081
2087
|
ContextGuard guard(context);
|
|
2082
2088
|
|
|
@@ -2098,13 +2104,13 @@ uint64_t cuda_context_check(void* context)
|
|
|
2098
2104
|
}
|
|
2099
2105
|
|
|
2100
2106
|
|
|
2101
|
-
int
|
|
2107
|
+
int wp_cuda_context_get_device_ordinal(void* context)
|
|
2102
2108
|
{
|
|
2103
2109
|
ContextInfo* info = get_context_info(static_cast<CUcontext>(context));
|
|
2104
2110
|
return info && info->device_info ? info->device_info->ordinal : -1;
|
|
2105
2111
|
}
|
|
2106
2112
|
|
|
2107
|
-
int
|
|
2113
|
+
int wp_cuda_context_is_primary(void* context)
|
|
2108
2114
|
{
|
|
2109
2115
|
CUcontext ctx = static_cast<CUcontext>(context);
|
|
2110
2116
|
ContextInfo* context_info = get_context_info(ctx);
|
|
@@ -2131,7 +2137,7 @@ int cuda_context_is_primary(void* context)
|
|
|
2131
2137
|
return 0;
|
|
2132
2138
|
}
|
|
2133
2139
|
|
|
2134
|
-
void*
|
|
2140
|
+
void* wp_cuda_context_get_stream(void* context)
|
|
2135
2141
|
{
|
|
2136
2142
|
ContextInfo* info = get_context_info(static_cast<CUcontext>(context));
|
|
2137
2143
|
if (info)
|
|
@@ -2141,7 +2147,7 @@ void* cuda_context_get_stream(void* context)
|
|
|
2141
2147
|
return NULL;
|
|
2142
2148
|
}
|
|
2143
2149
|
|
|
2144
|
-
void
|
|
2150
|
+
void wp_cuda_context_set_stream(void* context, void* stream, int sync)
|
|
2145
2151
|
{
|
|
2146
2152
|
ContextInfo* context_info = get_context_info(static_cast<CUcontext>(context));
|
|
2147
2153
|
if (context_info)
|
|
@@ -2165,7 +2171,7 @@ void cuda_context_set_stream(void* context, void* stream, int sync)
|
|
|
2165
2171
|
}
|
|
2166
2172
|
}
|
|
2167
2173
|
|
|
2168
|
-
int
|
|
2174
|
+
int wp_cuda_is_peer_access_supported(int target_ordinal, int peer_ordinal)
|
|
2169
2175
|
{
|
|
2170
2176
|
int num_devices = int(g_devices.size());
|
|
2171
2177
|
|
|
@@ -2190,7 +2196,7 @@ int cuda_is_peer_access_supported(int target_ordinal, int peer_ordinal)
|
|
|
2190
2196
|
return can_access;
|
|
2191
2197
|
}
|
|
2192
2198
|
|
|
2193
|
-
int
|
|
2199
|
+
int wp_cuda_is_peer_access_enabled(void* target_context, void* peer_context)
|
|
2194
2200
|
{
|
|
2195
2201
|
if (!target_context || !peer_context)
|
|
2196
2202
|
{
|
|
@@ -2201,8 +2207,8 @@ int cuda_is_peer_access_enabled(void* target_context, void* peer_context)
|
|
|
2201
2207
|
if (target_context == peer_context)
|
|
2202
2208
|
return 1;
|
|
2203
2209
|
|
|
2204
|
-
int target_ordinal =
|
|
2205
|
-
int peer_ordinal =
|
|
2210
|
+
int target_ordinal = wp_cuda_context_get_device_ordinal(target_context);
|
|
2211
|
+
int peer_ordinal = wp_cuda_context_get_device_ordinal(peer_context);
|
|
2206
2212
|
|
|
2207
2213
|
// check if peer access is supported
|
|
2208
2214
|
int can_access = 0;
|
|
@@ -2235,7 +2241,7 @@ int cuda_is_peer_access_enabled(void* target_context, void* peer_context)
|
|
|
2235
2241
|
}
|
|
2236
2242
|
}
|
|
2237
2243
|
|
|
2238
|
-
int
|
|
2244
|
+
int wp_cuda_set_peer_access_enabled(void* target_context, void* peer_context, int enable)
|
|
2239
2245
|
{
|
|
2240
2246
|
if (!target_context || !peer_context)
|
|
2241
2247
|
{
|
|
@@ -2246,8 +2252,8 @@ int cuda_set_peer_access_enabled(void* target_context, void* peer_context, int e
|
|
|
2246
2252
|
if (target_context == peer_context)
|
|
2247
2253
|
return 1; // no-op
|
|
2248
2254
|
|
|
2249
|
-
int target_ordinal =
|
|
2250
|
-
int peer_ordinal =
|
|
2255
|
+
int target_ordinal = wp_cuda_context_get_device_ordinal(target_context);
|
|
2256
|
+
int peer_ordinal = wp_cuda_context_get_device_ordinal(peer_context);
|
|
2251
2257
|
|
|
2252
2258
|
// check if peer access is supported
|
|
2253
2259
|
int can_access = 0;
|
|
@@ -2292,7 +2298,7 @@ int cuda_set_peer_access_enabled(void* target_context, void* peer_context, int e
|
|
|
2292
2298
|
return 1; // success
|
|
2293
2299
|
}
|
|
2294
2300
|
|
|
2295
|
-
int
|
|
2301
|
+
int wp_cuda_is_mempool_access_enabled(int target_ordinal, int peer_ordinal)
|
|
2296
2302
|
{
|
|
2297
2303
|
int num_devices = int(g_devices.size());
|
|
2298
2304
|
|
|
@@ -2328,7 +2334,7 @@ int cuda_is_mempool_access_enabled(int target_ordinal, int peer_ordinal)
|
|
|
2328
2334
|
return 0;
|
|
2329
2335
|
}
|
|
2330
2336
|
|
|
2331
|
-
int
|
|
2337
|
+
int wp_cuda_set_mempool_access_enabled(int target_ordinal, int peer_ordinal, int enable)
|
|
2332
2338
|
{
|
|
2333
2339
|
int num_devices = int(g_devices.size());
|
|
2334
2340
|
|
|
@@ -2374,13 +2380,13 @@ int cuda_set_mempool_access_enabled(int target_ordinal, int peer_ordinal, int en
|
|
|
2374
2380
|
return 1; // success
|
|
2375
2381
|
}
|
|
2376
2382
|
|
|
2377
|
-
void
|
|
2383
|
+
void wp_cuda_ipc_get_mem_handle(void* ptr, char* out_buffer) {
|
|
2378
2384
|
CUipcMemHandle memHandle;
|
|
2379
2385
|
check_cu(cuIpcGetMemHandle_f(&memHandle, (CUdeviceptr)ptr));
|
|
2380
2386
|
memcpy(out_buffer, memHandle.reserved, CU_IPC_HANDLE_SIZE);
|
|
2381
2387
|
}
|
|
2382
2388
|
|
|
2383
|
-
void*
|
|
2389
|
+
void* wp_cuda_ipc_open_mem_handle(void* context, char* handle) {
|
|
2384
2390
|
ContextGuard guard(context);
|
|
2385
2391
|
|
|
2386
2392
|
CUipcMemHandle memHandle;
|
|
@@ -2395,11 +2401,11 @@ void* cuda_ipc_open_mem_handle(void* context, char* handle) {
|
|
|
2395
2401
|
return NULL;
|
|
2396
2402
|
}
|
|
2397
2403
|
|
|
2398
|
-
void
|
|
2404
|
+
void wp_cuda_ipc_close_mem_handle(void* ptr) {
|
|
2399
2405
|
check_cu(cuIpcCloseMemHandle_f((CUdeviceptr) ptr));
|
|
2400
2406
|
}
|
|
2401
2407
|
|
|
2402
|
-
void
|
|
2408
|
+
void wp_cuda_ipc_get_event_handle(void* context, void* event, char* out_buffer) {
|
|
2403
2409
|
ContextGuard guard(context);
|
|
2404
2410
|
|
|
2405
2411
|
CUipcEventHandle eventHandle;
|
|
@@ -2407,7 +2413,7 @@ void cuda_ipc_get_event_handle(void* context, void* event, char* out_buffer) {
|
|
|
2407
2413
|
memcpy(out_buffer, eventHandle.reserved, CU_IPC_HANDLE_SIZE);
|
|
2408
2414
|
}
|
|
2409
2415
|
|
|
2410
|
-
void*
|
|
2416
|
+
void* wp_cuda_ipc_open_event_handle(void* context, char* handle) {
|
|
2411
2417
|
ContextGuard guard(context);
|
|
2412
2418
|
|
|
2413
2419
|
CUipcEventHandle eventHandle;
|
|
@@ -2421,31 +2427,31 @@ void* cuda_ipc_open_event_handle(void* context, char* handle) {
|
|
|
2421
2427
|
return NULL;
|
|
2422
2428
|
}
|
|
2423
2429
|
|
|
2424
|
-
void*
|
|
2430
|
+
void* wp_cuda_stream_create(void* context, int priority)
|
|
2425
2431
|
{
|
|
2426
2432
|
ContextGuard guard(context, true);
|
|
2427
2433
|
|
|
2428
2434
|
CUstream stream;
|
|
2429
2435
|
if (check_cu(cuStreamCreateWithPriority_f(&stream, CU_STREAM_DEFAULT, priority)))
|
|
2430
2436
|
{
|
|
2431
|
-
|
|
2437
|
+
wp_cuda_stream_register(WP_CURRENT_CONTEXT, stream);
|
|
2432
2438
|
return stream;
|
|
2433
2439
|
}
|
|
2434
2440
|
else
|
|
2435
2441
|
return NULL;
|
|
2436
2442
|
}
|
|
2437
2443
|
|
|
2438
|
-
void
|
|
2444
|
+
void wp_cuda_stream_destroy(void* context, void* stream)
|
|
2439
2445
|
{
|
|
2440
2446
|
if (!stream)
|
|
2441
2447
|
return;
|
|
2442
2448
|
|
|
2443
|
-
|
|
2449
|
+
wp_cuda_stream_unregister(context, stream);
|
|
2444
2450
|
|
|
2445
2451
|
check_cu(cuStreamDestroy_f(static_cast<CUstream>(stream)));
|
|
2446
2452
|
}
|
|
2447
2453
|
|
|
2448
|
-
int
|
|
2454
|
+
int wp_cuda_stream_query(void* stream)
|
|
2449
2455
|
{
|
|
2450
2456
|
CUresult res = cuStreamQuery_f(static_cast<CUstream>(stream));
|
|
2451
2457
|
|
|
@@ -2458,7 +2464,7 @@ int cuda_stream_query(void* stream)
|
|
|
2458
2464
|
return res;
|
|
2459
2465
|
}
|
|
2460
2466
|
|
|
2461
|
-
void
|
|
2467
|
+
void wp_cuda_stream_register(void* context, void* stream)
|
|
2462
2468
|
{
|
|
2463
2469
|
if (!stream)
|
|
2464
2470
|
return;
|
|
@@ -2470,7 +2476,7 @@ void cuda_stream_register(void* context, void* stream)
|
|
|
2470
2476
|
check_cu(cuEventCreate_f(&stream_info.cached_event, CU_EVENT_DISABLE_TIMING));
|
|
2471
2477
|
}
|
|
2472
2478
|
|
|
2473
|
-
void
|
|
2479
|
+
void wp_cuda_stream_unregister(void* context, void* stream)
|
|
2474
2480
|
{
|
|
2475
2481
|
if (!stream)
|
|
2476
2482
|
return;
|
|
@@ -2494,28 +2500,28 @@ void cuda_stream_unregister(void* context, void* stream)
|
|
|
2494
2500
|
}
|
|
2495
2501
|
}
|
|
2496
2502
|
|
|
2497
|
-
void*
|
|
2503
|
+
void* wp_cuda_stream_get_current()
|
|
2498
2504
|
{
|
|
2499
2505
|
return get_current_stream();
|
|
2500
2506
|
}
|
|
2501
2507
|
|
|
2502
|
-
void
|
|
2508
|
+
void wp_cuda_stream_synchronize(void* stream)
|
|
2503
2509
|
{
|
|
2504
2510
|
check_cu(cuStreamSynchronize_f(static_cast<CUstream>(stream)));
|
|
2505
2511
|
}
|
|
2506
2512
|
|
|
2507
|
-
void
|
|
2513
|
+
void wp_cuda_stream_wait_event(void* stream, void* event)
|
|
2508
2514
|
{
|
|
2509
2515
|
check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
|
|
2510
2516
|
}
|
|
2511
2517
|
|
|
2512
|
-
void
|
|
2518
|
+
void wp_cuda_stream_wait_stream(void* stream, void* other_stream, void* event)
|
|
2513
2519
|
{
|
|
2514
2520
|
check_cu(cuEventRecord_f(static_cast<CUevent>(event), static_cast<CUstream>(other_stream)));
|
|
2515
2521
|
check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
|
|
2516
2522
|
}
|
|
2517
2523
|
|
|
2518
|
-
int
|
|
2524
|
+
int wp_cuda_stream_is_capturing(void* stream)
|
|
2519
2525
|
{
|
|
2520
2526
|
cudaStreamCaptureStatus status = cudaStreamCaptureStatusNone;
|
|
2521
2527
|
check_cuda(cudaStreamIsCapturing(static_cast<cudaStream_t>(stream), &status));
|
|
@@ -2523,12 +2529,12 @@ int cuda_stream_is_capturing(void* stream)
|
|
|
2523
2529
|
return int(status != cudaStreamCaptureStatusNone);
|
|
2524
2530
|
}
|
|
2525
2531
|
|
|
2526
|
-
uint64_t
|
|
2532
|
+
uint64_t wp_cuda_stream_get_capture_id(void* stream)
|
|
2527
2533
|
{
|
|
2528
2534
|
return get_capture_id(static_cast<CUstream>(stream));
|
|
2529
2535
|
}
|
|
2530
2536
|
|
|
2531
|
-
int
|
|
2537
|
+
int wp_cuda_stream_get_priority(void* stream)
|
|
2532
2538
|
{
|
|
2533
2539
|
int priority = 0;
|
|
2534
2540
|
check_cuda(cuStreamGetPriority_f(static_cast<CUstream>(stream), &priority));
|
|
@@ -2536,7 +2542,7 @@ int cuda_stream_get_priority(void* stream)
|
|
|
2536
2542
|
return priority;
|
|
2537
2543
|
}
|
|
2538
2544
|
|
|
2539
|
-
void*
|
|
2545
|
+
void* wp_cuda_event_create(void* context, unsigned flags)
|
|
2540
2546
|
{
|
|
2541
2547
|
ContextGuard guard(context, true);
|
|
2542
2548
|
|
|
@@ -2547,12 +2553,12 @@ void* cuda_event_create(void* context, unsigned flags)
|
|
|
2547
2553
|
return NULL;
|
|
2548
2554
|
}
|
|
2549
2555
|
|
|
2550
|
-
void
|
|
2556
|
+
void wp_cuda_event_destroy(void* event)
|
|
2551
2557
|
{
|
|
2552
2558
|
check_cu(cuEventDestroy_f(static_cast<CUevent>(event)));
|
|
2553
2559
|
}
|
|
2554
2560
|
|
|
2555
|
-
int
|
|
2561
|
+
int wp_cuda_event_query(void* event)
|
|
2556
2562
|
{
|
|
2557
2563
|
CUresult res = cuEventQuery_f(static_cast<CUevent>(event));
|
|
2558
2564
|
|
|
@@ -2565,9 +2571,9 @@ int cuda_event_query(void* event)
|
|
|
2565
2571
|
return res;
|
|
2566
2572
|
}
|
|
2567
2573
|
|
|
2568
|
-
void
|
|
2574
|
+
void wp_cuda_event_record(void* event, void* stream, bool timing)
|
|
2569
2575
|
{
|
|
2570
|
-
if (timing && !g_captures.empty() &&
|
|
2576
|
+
if (timing && !g_captures.empty() && wp_cuda_stream_is_capturing(stream))
|
|
2571
2577
|
{
|
|
2572
2578
|
// record timing event during graph capture
|
|
2573
2579
|
check_cu(cuEventRecordWithFlags_f(static_cast<CUevent>(event), static_cast<CUstream>(stream), CU_EVENT_RECORD_EXTERNAL));
|
|
@@ -2578,12 +2584,12 @@ void cuda_event_record(void* event, void* stream, bool timing)
|
|
|
2578
2584
|
}
|
|
2579
2585
|
}
|
|
2580
2586
|
|
|
2581
|
-
void
|
|
2587
|
+
void wp_cuda_event_synchronize(void* event)
|
|
2582
2588
|
{
|
|
2583
2589
|
check_cu(cuEventSynchronize_f(static_cast<CUevent>(event)));
|
|
2584
2590
|
}
|
|
2585
2591
|
|
|
2586
|
-
float
|
|
2592
|
+
float wp_cuda_event_elapsed_time(void* start_event, void* end_event)
|
|
2587
2593
|
{
|
|
2588
2594
|
float elapsed = 0.0f;
|
|
2589
2595
|
cudaEvent_t start = static_cast<cudaEvent_t>(start_event);
|
|
@@ -2592,7 +2598,7 @@ float cuda_event_elapsed_time(void* start_event, void* end_event)
|
|
|
2592
2598
|
return elapsed;
|
|
2593
2599
|
}
|
|
2594
2600
|
|
|
2595
|
-
bool
|
|
2601
|
+
bool wp_cuda_graph_begin_capture(void* context, void* stream, int external)
|
|
2596
2602
|
{
|
|
2597
2603
|
ContextGuard guard(context);
|
|
2598
2604
|
|
|
@@ -2639,7 +2645,7 @@ bool cuda_graph_begin_capture(void* context, void* stream, int external)
|
|
|
2639
2645
|
return true;
|
|
2640
2646
|
}
|
|
2641
2647
|
|
|
2642
|
-
bool
|
|
2648
|
+
bool wp_cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
|
|
2643
2649
|
{
|
|
2644
2650
|
ContextGuard guard(context);
|
|
2645
2651
|
|
|
@@ -2774,14 +2780,14 @@ bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
|
|
|
2774
2780
|
return true;
|
|
2775
2781
|
}
|
|
2776
2782
|
|
|
2777
|
-
bool
|
|
2783
|
+
bool wp_capture_debug_dot_print(void* graph, const char *path, uint32_t flags)
|
|
2778
2784
|
{
|
|
2779
2785
|
if (!check_cuda(cudaGraphDebugDotPrint((cudaGraph_t)graph, path, flags)))
|
|
2780
2786
|
return false;
|
|
2781
2787
|
return true;
|
|
2782
2788
|
}
|
|
2783
2789
|
|
|
2784
|
-
bool
|
|
2790
|
+
bool wp_cuda_graph_create_exec(void* context, void* stream, void* graph, void** graph_exec_ret)
|
|
2785
2791
|
{
|
|
2786
2792
|
ContextGuard guard(context);
|
|
2787
2793
|
|
|
@@ -2789,6 +2795,13 @@ bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret)
|
|
|
2789
2795
|
if (!check_cuda(cudaGraphInstantiateWithFlags(&graph_exec, (cudaGraph_t)graph, cudaGraphInstantiateFlagAutoFreeOnLaunch)))
|
|
2790
2796
|
return false;
|
|
2791
2797
|
|
|
2798
|
+
// Usually uploading the graph explicitly is optional, but when updating graph nodes (e.g., indirect dispatch)
|
|
2799
|
+
// then the upload is required because otherwise the graph nodes that get updated might not yet be uploaded, which
|
|
2800
|
+
// results in undefined behavior.
|
|
2801
|
+
CUstream cuda_stream = static_cast<CUstream>(stream);
|
|
2802
|
+
if (!check_cuda(cudaGraphUpload(graph_exec, cuda_stream)))
|
|
2803
|
+
return false;
|
|
2804
|
+
|
|
2792
2805
|
if (graph_exec_ret)
|
|
2793
2806
|
*graph_exec_ret = graph_exec;
|
|
2794
2807
|
|
|
@@ -2927,7 +2940,7 @@ static CUfunction get_conditional_kernel(void* context, const char* name)
|
|
|
2927
2940
|
return kernel;
|
|
2928
2941
|
}
|
|
2929
2942
|
|
|
2930
|
-
bool
|
|
2943
|
+
bool wp_cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
|
|
2931
2944
|
{
|
|
2932
2945
|
ContextGuard guard(context);
|
|
2933
2946
|
|
|
@@ -2937,7 +2950,7 @@ bool cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
|
|
|
2937
2950
|
return true;
|
|
2938
2951
|
}
|
|
2939
2952
|
|
|
2940
|
-
bool
|
|
2953
|
+
bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
|
|
2941
2954
|
{
|
|
2942
2955
|
ContextGuard guard(context);
|
|
2943
2956
|
|
|
@@ -2963,7 +2976,7 @@ bool cuda_graph_resume_capture(void* context, void* stream, void* graph)
|
|
|
2963
2976
|
// https://developer.nvidia.com/blog/dynamic-control-flow-in-cuda-graphs-with-conditional-nodes/
|
|
2964
2977
|
// condition is a gpu pointer
|
|
2965
2978
|
// if_graph_ret and else_graph_ret should be NULL if not needed
|
|
2966
|
-
bool
|
|
2979
|
+
bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
|
|
2967
2980
|
{
|
|
2968
2981
|
bool has_if = if_graph_ret != NULL;
|
|
2969
2982
|
bool has_else = else_graph_ret != NULL;
|
|
@@ -2978,21 +2991,21 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
2978
2991
|
CUstream cuda_stream = static_cast<CUstream>(stream);
|
|
2979
2992
|
|
|
2980
2993
|
// Get the current stream capturing graph
|
|
2981
|
-
|
|
2994
|
+
CUstreamCaptureStatus capture_status = CU_STREAM_CAPTURE_STATUS_NONE;
|
|
2982
2995
|
cudaGraph_t cuda_graph = NULL;
|
|
2983
2996
|
const cudaGraphNode_t* capture_deps = NULL;
|
|
2984
2997
|
size_t dep_count = 0;
|
|
2985
|
-
if (!
|
|
2998
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
|
|
2986
2999
|
return false;
|
|
2987
3000
|
|
|
2988
3001
|
// abort if not capturing
|
|
2989
|
-
if (!cuda_graph || capture_status !=
|
|
3002
|
+
if (!cuda_graph || capture_status != CU_STREAM_CAPTURE_STATUS_ACTIVE)
|
|
2990
3003
|
{
|
|
2991
3004
|
wp::set_error_string("Stream is not capturing");
|
|
2992
3005
|
return false;
|
|
2993
3006
|
}
|
|
2994
3007
|
|
|
2995
|
-
//int driver_version =
|
|
3008
|
+
//int driver_version = wp_cuda_driver_version();
|
|
2996
3009
|
|
|
2997
3010
|
// IF-ELSE nodes are only supported with CUDA 12.8+
|
|
2998
3011
|
// Somehow child graphs produce wrong results when an else branch is used
|
|
@@ -3000,7 +3013,7 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
3000
3013
|
if (num_branches == 1 /*|| driver_version >= 12080*/)
|
|
3001
3014
|
{
|
|
3002
3015
|
cudaGraphConditionalHandle handle;
|
|
3003
|
-
cudaGraphConditionalHandleCreate(&handle, cuda_graph);
|
|
3016
|
+
check_cuda(cudaGraphConditionalHandleCreate(&handle, cuda_graph));
|
|
3004
3017
|
|
|
3005
3018
|
// run a kernel to set the condition handle from the condition pointer
|
|
3006
3019
|
// (need to negate the condition if only the else branch is used)
|
|
@@ -3020,22 +3033,23 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
3020
3033
|
kernel_args[0] = &handle;
|
|
3021
3034
|
kernel_args[1] = &condition;
|
|
3022
3035
|
|
|
3023
|
-
if (!
|
|
3036
|
+
if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
|
|
3024
3037
|
return false;
|
|
3025
3038
|
|
|
3026
|
-
if (!
|
|
3039
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
|
|
3027
3040
|
return false;
|
|
3028
3041
|
|
|
3029
3042
|
// create conditional node
|
|
3030
|
-
|
|
3031
|
-
|
|
3043
|
+
CUgraphNode condition_node;
|
|
3044
|
+
CUgraphNodeParams condition_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
|
|
3032
3045
|
condition_params.conditional.handle = handle;
|
|
3033
|
-
condition_params.conditional.type =
|
|
3046
|
+
condition_params.conditional.type = CU_GRAPH_COND_TYPE_IF;
|
|
3034
3047
|
condition_params.conditional.size = num_branches;
|
|
3035
|
-
|
|
3048
|
+
condition_params.conditional.ctx = get_current_context();
|
|
3049
|
+
if (!check_cu(cuGraphAddNode_f(&condition_node, cuda_graph, capture_deps, NULL, dep_count, &condition_params)))
|
|
3036
3050
|
return false;
|
|
3037
3051
|
|
|
3038
|
-
if (!
|
|
3052
|
+
if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &condition_node, 1, cudaStreamSetCaptureDependencies)))
|
|
3039
3053
|
return false;
|
|
3040
3054
|
|
|
3041
3055
|
if (num_branches == 1)
|
|
@@ -3055,8 +3069,8 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
3055
3069
|
{
|
|
3056
3070
|
// Create IF node followed by an additional IF node with negated condition
|
|
3057
3071
|
cudaGraphConditionalHandle if_handle, else_handle;
|
|
3058
|
-
cudaGraphConditionalHandleCreate(&if_handle, cuda_graph);
|
|
3059
|
-
cudaGraphConditionalHandleCreate(&else_handle, cuda_graph);
|
|
3072
|
+
check_cuda(cudaGraphConditionalHandleCreate(&if_handle, cuda_graph));
|
|
3073
|
+
check_cuda(cudaGraphConditionalHandleCreate(&else_handle, cuda_graph));
|
|
3060
3074
|
|
|
3061
3075
|
CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_else_handles_kernel");
|
|
3062
3076
|
if (!kernel)
|
|
@@ -3073,26 +3087,28 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
3073
3087
|
if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
|
|
3074
3088
|
return false;
|
|
3075
3089
|
|
|
3076
|
-
if (!
|
|
3090
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
|
|
3077
3091
|
return false;
|
|
3078
3092
|
|
|
3079
|
-
|
|
3080
|
-
|
|
3093
|
+
CUgraphNode if_node;
|
|
3094
|
+
CUgraphNodeParams if_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
|
|
3081
3095
|
if_params.conditional.handle = if_handle;
|
|
3082
|
-
if_params.conditional.type =
|
|
3096
|
+
if_params.conditional.type = CU_GRAPH_COND_TYPE_IF;
|
|
3083
3097
|
if_params.conditional.size = 1;
|
|
3084
|
-
|
|
3098
|
+
if_params.conditional.ctx = get_current_context();
|
|
3099
|
+
if (!check_cu(cuGraphAddNode_f(&if_node, cuda_graph, capture_deps, NULL, dep_count, &if_params)))
|
|
3085
3100
|
return false;
|
|
3086
3101
|
|
|
3087
|
-
|
|
3088
|
-
|
|
3102
|
+
CUgraphNode else_node;
|
|
3103
|
+
CUgraphNodeParams else_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
|
|
3089
3104
|
else_params.conditional.handle = else_handle;
|
|
3090
|
-
else_params.conditional.type =
|
|
3105
|
+
else_params.conditional.type = CU_GRAPH_COND_TYPE_IF;
|
|
3091
3106
|
else_params.conditional.size = 1;
|
|
3092
|
-
|
|
3107
|
+
else_params.conditional.ctx = get_current_context();
|
|
3108
|
+
if (!check_cu(cuGraphAddNode_f(&else_node, cuda_graph, &if_node, NULL, 1, &else_params)))
|
|
3093
3109
|
return false;
|
|
3094
3110
|
|
|
3095
|
-
if (!
|
|
3111
|
+
if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &else_node, 1, cudaStreamSetCaptureDependencies)))
|
|
3096
3112
|
return false;
|
|
3097
3113
|
|
|
3098
3114
|
*if_graph_ret = if_params.conditional.phGraph_out[0];
|
|
@@ -3102,21 +3118,143 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
3102
3118
|
return true;
|
|
3103
3119
|
}
|
|
3104
3120
|
|
|
3105
|
-
|
|
3121
|
+
// graph node type names for intelligible error reporting
|
|
3122
|
+
static const char* get_graph_node_type_name(CUgraphNodeType type)
|
|
3123
|
+
{
|
|
3124
|
+
static const std::unordered_map<CUgraphNodeType, const char*> names
|
|
3125
|
+
{
|
|
3126
|
+
{CU_GRAPH_NODE_TYPE_KERNEL, "kernel launch"},
|
|
3127
|
+
{CU_GRAPH_NODE_TYPE_MEMCPY, "memcpy"},
|
|
3128
|
+
{CU_GRAPH_NODE_TYPE_MEMSET, "memset"},
|
|
3129
|
+
{CU_GRAPH_NODE_TYPE_HOST, "host execution"},
|
|
3130
|
+
{CU_GRAPH_NODE_TYPE_GRAPH, "graph launch"},
|
|
3131
|
+
{CU_GRAPH_NODE_TYPE_EMPTY, "empty node"},
|
|
3132
|
+
{CU_GRAPH_NODE_TYPE_WAIT_EVENT, "event wait"},
|
|
3133
|
+
{CU_GRAPH_NODE_TYPE_EVENT_RECORD, "event record"},
|
|
3134
|
+
{CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL, "semaphore signal"},
|
|
3135
|
+
{CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT, "semaphore wait"},
|
|
3136
|
+
{CU_GRAPH_NODE_TYPE_MEM_ALLOC, "memory allocation"},
|
|
3137
|
+
{CU_GRAPH_NODE_TYPE_MEM_FREE, "memory deallocation"},
|
|
3138
|
+
{CU_GRAPH_NODE_TYPE_BATCH_MEM_OP, "batched mem op"},
|
|
3139
|
+
{CU_GRAPH_NODE_TYPE_CONDITIONAL, "conditional node"},
|
|
3140
|
+
};
|
|
3141
|
+
|
|
3142
|
+
auto it = names.find(type);
|
|
3143
|
+
if (it != names.end())
|
|
3144
|
+
return it->second;
|
|
3145
|
+
else
|
|
3146
|
+
return "unknown node";
|
|
3147
|
+
}
|
|
3148
|
+
|
|
3149
|
+
// check if a graph can be launched as a child graph
|
|
3150
|
+
static bool is_valid_child_graph(void* child_graph)
|
|
3151
|
+
{
|
|
3152
|
+
// disallowed child graph nodes according to the documentation of cuGraphAddChildGraphNode()
|
|
3153
|
+
static const std::unordered_set<CUgraphNodeType> disallowed_nodes
|
|
3154
|
+
{
|
|
3155
|
+
CU_GRAPH_NODE_TYPE_MEM_ALLOC,
|
|
3156
|
+
CU_GRAPH_NODE_TYPE_MEM_FREE,
|
|
3157
|
+
CU_GRAPH_NODE_TYPE_CONDITIONAL,
|
|
3158
|
+
};
|
|
3159
|
+
|
|
3160
|
+
if (!child_graph)
|
|
3161
|
+
{
|
|
3162
|
+
wp::set_error_string("Child graph is null");
|
|
3163
|
+
return false;
|
|
3164
|
+
}
|
|
3165
|
+
|
|
3166
|
+
size_t num_nodes = 0;
|
|
3167
|
+
if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)child_graph, NULL, &num_nodes)))
|
|
3168
|
+
return false;
|
|
3169
|
+
std::vector<cudaGraphNode_t> nodes(num_nodes);
|
|
3170
|
+
if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)child_graph, nodes.data(), &num_nodes)))
|
|
3171
|
+
return false;
|
|
3172
|
+
|
|
3173
|
+
for (size_t i = 0; i < num_nodes; i++)
|
|
3174
|
+
{
|
|
3175
|
+
// note: we use the driver API to get the node type, otherwise some nodes are not recognized correctly
|
|
3176
|
+
CUgraphNodeType node_type;
|
|
3177
|
+
check_cu(cuGraphNodeGetType_f(nodes[i], &node_type));
|
|
3178
|
+
auto it = disallowed_nodes.find(node_type);
|
|
3179
|
+
if (it != disallowed_nodes.end())
|
|
3180
|
+
{
|
|
3181
|
+
wp::set_error_string("Child graph contains an unsupported operation (%s)", get_graph_node_type_name(node_type));
|
|
3182
|
+
return false;
|
|
3183
|
+
}
|
|
3184
|
+
}
|
|
3185
|
+
|
|
3186
|
+
return true;
|
|
3187
|
+
}
|
|
3188
|
+
|
|
3189
|
+
// check if a graph can be used as a conditional body graph
|
|
3190
|
+
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#condtional-node-body-graph-requirements
|
|
3191
|
+
bool wp_cuda_graph_check_conditional_body(void* body_graph)
|
|
3106
3192
|
{
|
|
3193
|
+
static const std::unordered_set<CUgraphNodeType> allowed_nodes
|
|
3194
|
+
{
|
|
3195
|
+
CU_GRAPH_NODE_TYPE_MEMCPY,
|
|
3196
|
+
CU_GRAPH_NODE_TYPE_MEMSET,
|
|
3197
|
+
CU_GRAPH_NODE_TYPE_KERNEL,
|
|
3198
|
+
CU_GRAPH_NODE_TYPE_GRAPH,
|
|
3199
|
+
CU_GRAPH_NODE_TYPE_EMPTY,
|
|
3200
|
+
CU_GRAPH_NODE_TYPE_CONDITIONAL,
|
|
3201
|
+
};
|
|
3202
|
+
|
|
3203
|
+
if (!body_graph)
|
|
3204
|
+
{
|
|
3205
|
+
wp::set_error_string("Conditional body graph is null");
|
|
3206
|
+
return false;
|
|
3207
|
+
}
|
|
3208
|
+
|
|
3209
|
+
size_t num_nodes = 0;
|
|
3210
|
+
if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)body_graph, NULL, &num_nodes)))
|
|
3211
|
+
return false;
|
|
3212
|
+
std::vector<cudaGraphNode_t> nodes(num_nodes);
|
|
3213
|
+
if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)body_graph, nodes.data(), &num_nodes)))
|
|
3214
|
+
return false;
|
|
3215
|
+
|
|
3216
|
+
for (size_t i = 0; i < num_nodes; i++)
|
|
3217
|
+
{
|
|
3218
|
+
// note: we use the driver API to get the node type, otherwise some nodes are not recognized correctly
|
|
3219
|
+
CUgraphNodeType node_type;
|
|
3220
|
+
check_cu(cuGraphNodeGetType_f(nodes[i], &node_type));
|
|
3221
|
+
if (allowed_nodes.find(node_type) == allowed_nodes.end())
|
|
3222
|
+
{
|
|
3223
|
+
wp::set_error_string("Conditional body graph contains an unsupported operation (%s)", get_graph_node_type_name(node_type));
|
|
3224
|
+
return false;
|
|
3225
|
+
}
|
|
3226
|
+
else if (node_type == CU_GRAPH_NODE_TYPE_GRAPH)
|
|
3227
|
+
{
|
|
3228
|
+
// check nested child graphs recursively
|
|
3229
|
+
cudaGraph_t child_graph = NULL;
|
|
3230
|
+
if (!check_cuda(cudaGraphChildGraphNodeGetGraph(nodes[i], &child_graph)))
|
|
3231
|
+
return false;
|
|
3232
|
+
if (!wp_cuda_graph_check_conditional_body(child_graph))
|
|
3233
|
+
return false;
|
|
3234
|
+
}
|
|
3235
|
+
}
|
|
3236
|
+
|
|
3237
|
+
return true;
|
|
3238
|
+
}
|
|
3239
|
+
|
|
3240
|
+
bool wp_cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
|
|
3241
|
+
{
|
|
3242
|
+
if (!is_valid_child_graph(child_graph))
|
|
3243
|
+
return false;
|
|
3244
|
+
|
|
3107
3245
|
ContextGuard guard(context);
|
|
3108
3246
|
|
|
3109
3247
|
CUstream cuda_stream = static_cast<CUstream>(stream);
|
|
3110
3248
|
|
|
3111
3249
|
// Get the current stream capturing graph
|
|
3112
|
-
|
|
3250
|
+
CUstreamCaptureStatus capture_status = CU_STREAM_CAPTURE_STATUS_NONE;
|
|
3113
3251
|
void* cuda_graph = NULL;
|
|
3114
|
-
const
|
|
3252
|
+
const CUgraphNode* capture_deps = NULL;
|
|
3115
3253
|
size_t dep_count = 0;
|
|
3116
|
-
if (!
|
|
3254
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, (cudaGraph_t*)&cuda_graph, &capture_deps, &dep_count)))
|
|
3117
3255
|
return false;
|
|
3118
3256
|
|
|
3119
|
-
if (!
|
|
3257
|
+
if (!wp_cuda_graph_pause_capture(context, cuda_stream, &cuda_graph))
|
|
3120
3258
|
return false;
|
|
3121
3259
|
|
|
3122
3260
|
cudaGraphNode_t body_node;
|
|
@@ -3126,16 +3264,16 @@ bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_grap
|
|
|
3126
3264
|
static_cast<cudaGraph_t>(child_graph))))
|
|
3127
3265
|
return false;
|
|
3128
3266
|
|
|
3129
|
-
if (!
|
|
3267
|
+
if (!wp_cuda_graph_resume_capture(context, cuda_stream, cuda_graph))
|
|
3130
3268
|
return false;
|
|
3131
3269
|
|
|
3132
|
-
if (!
|
|
3270
|
+
if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &body_node, 1, cudaStreamSetCaptureDependencies)))
|
|
3133
3271
|
return false;
|
|
3134
3272
|
|
|
3135
3273
|
return true;
|
|
3136
3274
|
}
|
|
3137
3275
|
|
|
3138
|
-
bool
|
|
3276
|
+
bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
|
|
3139
3277
|
{
|
|
3140
3278
|
// if there's no body, it's a no-op
|
|
3141
3279
|
if (!body_graph_ret)
|
|
@@ -3146,15 +3284,15 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
|
|
|
3146
3284
|
CUstream cuda_stream = static_cast<CUstream>(stream);
|
|
3147
3285
|
|
|
3148
3286
|
// Get the current stream capturing graph
|
|
3149
|
-
|
|
3287
|
+
CUstreamCaptureStatus capture_status = CU_STREAM_CAPTURE_STATUS_NONE;
|
|
3150
3288
|
cudaGraph_t cuda_graph = NULL;
|
|
3151
3289
|
const cudaGraphNode_t* capture_deps = NULL;
|
|
3152
3290
|
size_t dep_count = 0;
|
|
3153
|
-
if (!
|
|
3291
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
|
|
3154
3292
|
return false;
|
|
3155
3293
|
|
|
3156
3294
|
// abort if not capturing
|
|
3157
|
-
if (!cuda_graph || capture_status !=
|
|
3295
|
+
if (!cuda_graph || capture_status != CU_STREAM_CAPTURE_STATUS_ACTIVE)
|
|
3158
3296
|
{
|
|
3159
3297
|
wp::set_error_string("Stream is not capturing");
|
|
3160
3298
|
return false;
|
|
@@ -3179,19 +3317,20 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
|
|
|
3179
3317
|
if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
|
|
3180
3318
|
return false;
|
|
3181
3319
|
|
|
3182
|
-
if (!
|
|
3320
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
|
|
3183
3321
|
return false;
|
|
3184
3322
|
|
|
3185
3323
|
// insert conditional graph node
|
|
3186
|
-
|
|
3187
|
-
|
|
3324
|
+
CUgraphNode while_node;
|
|
3325
|
+
CUgraphNodeParams while_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
|
|
3188
3326
|
while_params.conditional.handle = handle;
|
|
3189
|
-
while_params.conditional.type =
|
|
3327
|
+
while_params.conditional.type = CU_GRAPH_COND_TYPE_WHILE;
|
|
3190
3328
|
while_params.conditional.size = 1;
|
|
3191
|
-
|
|
3329
|
+
while_params.conditional.ctx = get_current_context();
|
|
3330
|
+
if (!check_cu(cuGraphAddNode_f(&while_node, cuda_graph, capture_deps, NULL, dep_count, &while_params)))
|
|
3192
3331
|
return false;
|
|
3193
3332
|
|
|
3194
|
-
if (!
|
|
3333
|
+
if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &while_node, 1, cudaStreamSetCaptureDependencies)))
|
|
3195
3334
|
return false;
|
|
3196
3335
|
|
|
3197
3336
|
*body_graph_ret = while_params.conditional.phGraph_out[0];
|
|
@@ -3200,7 +3339,7 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
|
|
|
3200
3339
|
return true;
|
|
3201
3340
|
}
|
|
3202
3341
|
|
|
3203
|
-
bool
|
|
3342
|
+
bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
|
|
3204
3343
|
{
|
|
3205
3344
|
ContextGuard guard(context);
|
|
3206
3345
|
|
|
@@ -3227,37 +3366,43 @@ bool cuda_graph_set_condition(void* context, void* stream, int* condition, uint6
|
|
|
3227
3366
|
#else
|
|
3228
3367
|
// stubs for conditional graph node API if CUDA toolkit is too old.
|
|
3229
3368
|
|
|
3230
|
-
bool
|
|
3369
|
+
bool wp_cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
|
|
3370
|
+
{
|
|
3371
|
+
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3372
|
+
return false;
|
|
3373
|
+
}
|
|
3374
|
+
|
|
3375
|
+
bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
|
|
3231
3376
|
{
|
|
3232
3377
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3233
3378
|
return false;
|
|
3234
3379
|
}
|
|
3235
3380
|
|
|
3236
|
-
bool
|
|
3381
|
+
bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
|
|
3237
3382
|
{
|
|
3238
3383
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3239
3384
|
return false;
|
|
3240
3385
|
}
|
|
3241
3386
|
|
|
3242
|
-
bool
|
|
3387
|
+
bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
|
|
3243
3388
|
{
|
|
3244
3389
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3245
3390
|
return false;
|
|
3246
3391
|
}
|
|
3247
3392
|
|
|
3248
|
-
bool
|
|
3393
|
+
bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
|
|
3249
3394
|
{
|
|
3250
3395
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3251
3396
|
return false;
|
|
3252
3397
|
}
|
|
3253
3398
|
|
|
3254
|
-
bool
|
|
3399
|
+
bool wp_cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
|
|
3255
3400
|
{
|
|
3256
3401
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3257
3402
|
return false;
|
|
3258
3403
|
}
|
|
3259
3404
|
|
|
3260
|
-
bool
|
|
3405
|
+
bool wp_cuda_graph_check_conditional_body(void* body_graph)
|
|
3261
3406
|
{
|
|
3262
3407
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3263
3408
|
return false;
|
|
@@ -3266,7 +3411,7 @@ bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_grap
|
|
|
3266
3411
|
#endif // support for conditional graph nodes
|
|
3267
3412
|
|
|
3268
3413
|
|
|
3269
|
-
bool
|
|
3414
|
+
bool wp_cuda_graph_launch(void* graph_exec, void* stream)
|
|
3270
3415
|
{
|
|
3271
3416
|
// TODO: allow naming graphs?
|
|
3272
3417
|
begin_cuda_range(WP_TIMING_GRAPH, stream, get_stream_context(stream), "graph");
|
|
@@ -3278,14 +3423,14 @@ bool cuda_graph_launch(void* graph_exec, void* stream)
|
|
|
3278
3423
|
return result;
|
|
3279
3424
|
}
|
|
3280
3425
|
|
|
3281
|
-
bool
|
|
3426
|
+
bool wp_cuda_graph_destroy(void* context, void* graph)
|
|
3282
3427
|
{
|
|
3283
3428
|
ContextGuard guard(context);
|
|
3284
3429
|
|
|
3285
3430
|
return check_cuda(cudaGraphDestroy((cudaGraph_t)graph));
|
|
3286
3431
|
}
|
|
3287
3432
|
|
|
3288
|
-
bool
|
|
3433
|
+
bool wp_cuda_graph_exec_destroy(void* context, void* graph_exec)
|
|
3289
3434
|
{
|
|
3290
3435
|
ContextGuard guard(context);
|
|
3291
3436
|
|
|
@@ -3337,7 +3482,7 @@ bool write_file(const char* data, size_t size, std::string filename, const char*
|
|
|
3337
3482
|
}
|
|
3338
3483
|
#endif
|
|
3339
3484
|
|
|
3340
|
-
size_t
|
|
3485
|
+
size_t wp_cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, bool lineinfo, bool compile_time_trace, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes, int* ltoir_input_types)
|
|
3341
3486
|
{
|
|
3342
3487
|
// use file extension to determine whether to output PTX or CUBIN
|
|
3343
3488
|
const char* output_ext = strrchr(output_path, '.');
|
|
@@ -3393,9 +3538,9 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
|
|
|
3393
3538
|
{
|
|
3394
3539
|
opts.push_back("--define-macro=_DEBUG");
|
|
3395
3540
|
opts.push_back("--generate-line-info");
|
|
3396
|
-
|
|
3397
|
-
//
|
|
3398
|
-
|
|
3541
|
+
#ifndef _WIN32
|
|
3542
|
+
opts.push_back("--device-debug"); // -G
|
|
3543
|
+
#endif
|
|
3399
3544
|
}
|
|
3400
3545
|
else
|
|
3401
3546
|
{
|
|
@@ -3665,7 +3810,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
|
|
|
3665
3810
|
}
|
|
3666
3811
|
}
|
|
3667
3812
|
|
|
3668
|
-
bool
|
|
3813
|
+
bool wp_cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size)
|
|
3669
3814
|
{
|
|
3670
3815
|
|
|
3671
3816
|
CHECK_ANY(ltoir_output_path != nullptr);
|
|
@@ -3711,7 +3856,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
|
|
|
3711
3856
|
return res;
|
|
3712
3857
|
}
|
|
3713
3858
|
|
|
3714
|
-
bool
|
|
3859
|
+
bool wp_cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int arrangement_A, int arrangement_B, int arrangement_C, int num_threads)
|
|
3715
3860
|
{
|
|
3716
3861
|
|
|
3717
3862
|
CHECK_ANY(ltoir_output_path != nullptr);
|
|
@@ -3756,7 +3901,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
|
|
|
3756
3901
|
return res;
|
|
3757
3902
|
}
|
|
3758
3903
|
|
|
3759
|
-
bool
|
|
3904
|
+
bool wp_cuda_compile_solver(const char* fatbin_output_path, const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int NRHS, int function, int side, int diag, int precision, int arrangement_A, int arrangement_B, int fill_mode, int num_threads)
|
|
3760
3905
|
{
|
|
3761
3906
|
|
|
3762
3907
|
CHECK_ANY(ltoir_output_path != nullptr);
|
|
@@ -3819,7 +3964,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
|
|
|
3819
3964
|
|
|
3820
3965
|
#endif
|
|
3821
3966
|
|
|
3822
|
-
void*
|
|
3967
|
+
void* wp_cuda_load_module(void* context, const char* path)
|
|
3823
3968
|
{
|
|
3824
3969
|
ContextGuard guard(context);
|
|
3825
3970
|
|
|
@@ -3938,7 +4083,7 @@ void* cuda_load_module(void* context, const char* path)
|
|
|
3938
4083
|
return module;
|
|
3939
4084
|
}
|
|
3940
4085
|
|
|
3941
|
-
void
|
|
4086
|
+
void wp_cuda_unload_module(void* context, void* module)
|
|
3942
4087
|
{
|
|
3943
4088
|
// ensure there are no graph captures in progress
|
|
3944
4089
|
if (g_captures.empty())
|
|
@@ -3957,7 +4102,7 @@ void cuda_unload_module(void* context, void* module)
|
|
|
3957
4102
|
}
|
|
3958
4103
|
|
|
3959
4104
|
|
|
3960
|
-
int
|
|
4105
|
+
int wp_cuda_get_max_shared_memory(void* context)
|
|
3961
4106
|
{
|
|
3962
4107
|
ContextInfo* info = get_context_info(context);
|
|
3963
4108
|
if (!info)
|
|
@@ -3967,7 +4112,7 @@ int cuda_get_max_shared_memory(void* context)
|
|
|
3967
4112
|
return max_smem_bytes;
|
|
3968
4113
|
}
|
|
3969
4114
|
|
|
3970
|
-
bool
|
|
4115
|
+
bool wp_cuda_configure_kernel_shared_memory(void* kernel, int size)
|
|
3971
4116
|
{
|
|
3972
4117
|
int requested_smem_bytes = size;
|
|
3973
4118
|
|
|
@@ -3979,7 +4124,7 @@ bool cuda_configure_kernel_shared_memory(void* kernel, int size)
|
|
|
3979
4124
|
return true;
|
|
3980
4125
|
}
|
|
3981
4126
|
|
|
3982
|
-
void*
|
|
4127
|
+
void* wp_cuda_get_kernel(void* context, void* module, const char* name)
|
|
3983
4128
|
{
|
|
3984
4129
|
ContextGuard guard(context);
|
|
3985
4130
|
|
|
@@ -3994,7 +4139,7 @@ void* cuda_get_kernel(void* context, void* module, const char* name)
|
|
|
3994
4139
|
return kernel;
|
|
3995
4140
|
}
|
|
3996
4141
|
|
|
3997
|
-
size_t
|
|
4142
|
+
size_t wp_cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, int block_dim, int shared_memory_bytes, void** args, void* stream)
|
|
3998
4143
|
{
|
|
3999
4144
|
ContextGuard guard(context);
|
|
4000
4145
|
|
|
@@ -4048,21 +4193,21 @@ size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_block
|
|
|
4048
4193
|
return res;
|
|
4049
4194
|
}
|
|
4050
4195
|
|
|
4051
|
-
void
|
|
4196
|
+
void wp_cuda_graphics_map(void* context, void* resource)
|
|
4052
4197
|
{
|
|
4053
4198
|
ContextGuard guard(context);
|
|
4054
4199
|
|
|
4055
4200
|
check_cu(cuGraphicsMapResources_f(1, (CUgraphicsResource*)resource, get_current_stream()));
|
|
4056
4201
|
}
|
|
4057
4202
|
|
|
4058
|
-
void
|
|
4203
|
+
void wp_cuda_graphics_unmap(void* context, void* resource)
|
|
4059
4204
|
{
|
|
4060
4205
|
ContextGuard guard(context);
|
|
4061
4206
|
|
|
4062
4207
|
check_cu(cuGraphicsUnmapResources_f(1, (CUgraphicsResource*)resource, get_current_stream()));
|
|
4063
4208
|
}
|
|
4064
4209
|
|
|
4065
|
-
void
|
|
4210
|
+
void wp_cuda_graphics_device_ptr_and_size(void* context, void* resource, uint64_t* ptr, size_t* size)
|
|
4066
4211
|
{
|
|
4067
4212
|
ContextGuard guard(context);
|
|
4068
4213
|
|
|
@@ -4074,7 +4219,7 @@ void cuda_graphics_device_ptr_and_size(void* context, void* resource, uint64_t*
|
|
|
4074
4219
|
*size = bytes;
|
|
4075
4220
|
}
|
|
4076
4221
|
|
|
4077
|
-
void*
|
|
4222
|
+
void* wp_cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsigned int flags)
|
|
4078
4223
|
{
|
|
4079
4224
|
ContextGuard guard(context);
|
|
4080
4225
|
|
|
@@ -4089,7 +4234,7 @@ void* cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsign
|
|
|
4089
4234
|
return resource;
|
|
4090
4235
|
}
|
|
4091
4236
|
|
|
4092
|
-
void
|
|
4237
|
+
void wp_cuda_graphics_unregister_resource(void* context, void* resource)
|
|
4093
4238
|
{
|
|
4094
4239
|
ContextGuard guard(context);
|
|
4095
4240
|
|
|
@@ -4098,25 +4243,25 @@ void cuda_graphics_unregister_resource(void* context, void* resource)
|
|
|
4098
4243
|
delete res;
|
|
4099
4244
|
}
|
|
4100
4245
|
|
|
4101
|
-
void
|
|
4246
|
+
void wp_cuda_timing_begin(int flags)
|
|
4102
4247
|
{
|
|
4103
4248
|
g_cuda_timing_state = new CudaTimingState(flags, g_cuda_timing_state);
|
|
4104
4249
|
}
|
|
4105
4250
|
|
|
4106
|
-
int
|
|
4251
|
+
int wp_cuda_timing_get_result_count()
|
|
4107
4252
|
{
|
|
4108
4253
|
if (g_cuda_timing_state)
|
|
4109
4254
|
return int(g_cuda_timing_state->ranges.size());
|
|
4110
4255
|
return 0;
|
|
4111
4256
|
}
|
|
4112
4257
|
|
|
4113
|
-
void
|
|
4258
|
+
void wp_cuda_timing_end(timing_result_t* results, int size)
|
|
4114
4259
|
{
|
|
4115
4260
|
if (!g_cuda_timing_state)
|
|
4116
4261
|
return;
|
|
4117
4262
|
|
|
4118
4263
|
// number of results to write to the user buffer
|
|
4119
|
-
int count = std::min(
|
|
4264
|
+
int count = std::min(wp_cuda_timing_get_result_count(), size);
|
|
4120
4265
|
|
|
4121
4266
|
// compute timings and write results
|
|
4122
4267
|
for (int i = 0; i < count; i++)
|
|
@@ -4150,7 +4295,6 @@ void cuda_timing_end(timing_result_t* results, int size)
|
|
|
4150
4295
|
#include "reduce.cu"
|
|
4151
4296
|
#include "runlength_encode.cu"
|
|
4152
4297
|
#include "scan.cu"
|
|
4153
|
-
#include "marching.cu"
|
|
4154
4298
|
#include "sparse.cu"
|
|
4155
4299
|
#include "volume.cu"
|
|
4156
4300
|
#include "volume_builder.cu"
|