warp-lang 1.8.1__py3-none-macosx_10_13_universal2.whl → 1.9.1__py3-none-macosx_10_13_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang has been flagged as potentially problematic; consult the package registry's advisory page for more details.
- warp/__init__.py +282 -103
- warp/__init__.pyi +1904 -114
- warp/bin/libwarp-clang.dylib +0 -0
- warp/bin/libwarp.dylib +0 -0
- warp/build.py +93 -30
- warp/build_dll.py +331 -101
- warp/builtins.py +1244 -160
- warp/codegen.py +317 -206
- warp/config.py +1 -1
- warp/context.py +1465 -789
- warp/examples/core/example_marching_cubes.py +1 -0
- warp/examples/core/example_render_opengl.py +100 -3
- warp/examples/fem/example_apic_fluid.py +98 -52
- warp/examples/fem/example_convection_diffusion_dg.py +25 -4
- warp/examples/fem/example_diffusion_mgpu.py +8 -3
- warp/examples/fem/utils.py +68 -22
- warp/examples/interop/example_jax_kernel.py +2 -1
- warp/fabric.py +1 -1
- warp/fem/cache.py +27 -19
- warp/fem/domain.py +2 -2
- warp/fem/field/nodal_field.py +2 -2
- warp/fem/field/virtual.py +264 -166
- warp/fem/geometry/geometry.py +5 -5
- warp/fem/integrate.py +129 -51
- warp/fem/space/restriction.py +4 -0
- warp/fem/space/shape/tet_shape_function.py +3 -10
- warp/jax_experimental/custom_call.py +25 -2
- warp/jax_experimental/ffi.py +22 -1
- warp/jax_experimental/xla_ffi.py +16 -7
- warp/marching_cubes.py +708 -0
- warp/native/array.h +99 -4
- warp/native/builtin.h +86 -9
- warp/native/bvh.cpp +64 -28
- warp/native/bvh.cu +58 -58
- warp/native/bvh.h +2 -2
- warp/native/clang/clang.cpp +7 -7
- warp/native/coloring.cpp +8 -2
- warp/native/crt.cpp +2 -2
- warp/native/crt.h +3 -5
- warp/native/cuda_util.cpp +41 -10
- warp/native/cuda_util.h +10 -4
- warp/native/exports.h +1842 -1908
- warp/native/fabric.h +2 -1
- warp/native/hashgrid.cpp +37 -37
- warp/native/hashgrid.cu +2 -2
- warp/native/initializer_array.h +1 -1
- warp/native/intersect.h +2 -2
- warp/native/mat.h +1910 -116
- warp/native/mathdx.cpp +43 -43
- warp/native/mesh.cpp +24 -24
- warp/native/mesh.cu +26 -26
- warp/native/mesh.h +4 -2
- warp/native/nanovdb/GridHandle.h +179 -12
- warp/native/nanovdb/HostBuffer.h +8 -7
- warp/native/nanovdb/NanoVDB.h +517 -895
- warp/native/nanovdb/NodeManager.h +323 -0
- warp/native/nanovdb/PNanoVDB.h +2 -2
- warp/native/quat.h +331 -14
- warp/native/range.h +7 -1
- warp/native/reduce.cpp +10 -10
- warp/native/reduce.cu +13 -14
- warp/native/runlength_encode.cpp +2 -2
- warp/native/runlength_encode.cu +5 -5
- warp/native/scan.cpp +3 -3
- warp/native/scan.cu +4 -4
- warp/native/sort.cpp +10 -10
- warp/native/sort.cu +40 -31
- warp/native/sort.h +2 -0
- warp/native/sparse.cpp +8 -8
- warp/native/sparse.cu +13 -13
- warp/native/spatial.h +366 -17
- warp/native/temp_buffer.h +2 -2
- warp/native/tile.h +471 -82
- warp/native/vec.h +328 -14
- warp/native/volume.cpp +54 -54
- warp/native/volume.cu +1 -1
- warp/native/volume.h +2 -1
- warp/native/volume_builder.cu +30 -37
- warp/native/warp.cpp +150 -149
- warp/native/warp.cu +377 -216
- warp/native/warp.h +227 -226
- warp/optim/linear.py +736 -271
- warp/render/imgui_manager.py +289 -0
- warp/render/render_opengl.py +99 -18
- warp/render/render_usd.py +1 -0
- warp/sim/graph_coloring.py +2 -2
- warp/sparse.py +558 -175
- warp/tests/aux_test_module_aot.py +7 -0
- warp/tests/cuda/test_async.py +3 -3
- warp/tests/cuda/test_conditional_captures.py +101 -0
- warp/tests/geometry/test_hash_grid.py +38 -0
- warp/tests/geometry/test_marching_cubes.py +233 -12
- warp/tests/interop/test_jax.py +608 -28
- warp/tests/sim/test_coloring.py +6 -6
- warp/tests/test_array.py +58 -5
- warp/tests/test_codegen.py +4 -3
- warp/tests/test_context.py +8 -15
- warp/tests/test_enum.py +136 -0
- warp/tests/test_examples.py +2 -2
- warp/tests/test_fem.py +49 -6
- warp/tests/test_fixedarray.py +229 -0
- warp/tests/test_func.py +18 -15
- warp/tests/test_future_annotations.py +7 -5
- warp/tests/test_linear_solvers.py +30 -0
- warp/tests/test_map.py +15 -1
- warp/tests/test_mat.py +1518 -378
- warp/tests/test_mat_assign_copy.py +178 -0
- warp/tests/test_mat_constructors.py +574 -0
- warp/tests/test_module_aot.py +287 -0
- warp/tests/test_print.py +69 -0
- warp/tests/test_quat.py +140 -34
- warp/tests/test_quat_assign_copy.py +145 -0
- warp/tests/test_reload.py +2 -1
- warp/tests/test_sparse.py +71 -0
- warp/tests/test_spatial.py +140 -34
- warp/tests/test_spatial_assign_copy.py +160 -0
- warp/tests/test_struct.py +43 -3
- warp/tests/test_tuple.py +96 -0
- warp/tests/test_types.py +61 -20
- warp/tests/test_vec.py +179 -34
- warp/tests/test_vec_assign_copy.py +143 -0
- warp/tests/tile/test_tile.py +245 -18
- warp/tests/tile/test_tile_cholesky.py +605 -0
- warp/tests/tile/test_tile_load.py +169 -0
- warp/tests/tile/test_tile_mathdx.py +2 -558
- warp/tests/tile/test_tile_matmul.py +1 -1
- warp/tests/tile/test_tile_mlp.py +1 -1
- warp/tests/tile/test_tile_shared_memory.py +5 -5
- warp/tests/unittest_suites.py +6 -0
- warp/tests/walkthrough_debug.py +1 -1
- warp/thirdparty/unittest_parallel.py +108 -9
- warp/types.py +571 -267
- warp/utils.py +68 -86
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/METADATA +29 -69
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/RECORD +138 -128
- warp/native/marching.cpp +0 -19
- warp/native/marching.cu +0 -514
- warp/native/marching.h +0 -19
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/WHEEL +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/top_level.txt +0 -0
warp/native/warp.cu
CHANGED
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
#include "scan.h"
|
|
20
20
|
#include "cuda_util.h"
|
|
21
21
|
#include "error.h"
|
|
22
|
+
#include "sort.h"
|
|
22
23
|
|
|
23
24
|
#include <cstdlib>
|
|
24
25
|
#include <fstream>
|
|
@@ -168,7 +169,7 @@ struct ContextInfo
|
|
|
168
169
|
{
|
|
169
170
|
DeviceInfo* device_info = NULL;
|
|
170
171
|
|
|
171
|
-
// the current stream, managed from Python (see
|
|
172
|
+
// the current stream, managed from Python (see wp_cuda_context_set_stream() and wp_cuda_context_get_stream())
|
|
172
173
|
CUstream stream = NULL;
|
|
173
174
|
|
|
174
175
|
// conditional graph node support, loaded on demand if the driver supports it (CUDA 12.4+)
|
|
@@ -237,11 +238,11 @@ static std::unordered_map<CUstream, StreamInfo> g_streams;
|
|
|
237
238
|
|
|
238
239
|
// Ongoing graph captures registered using wp.capture_begin().
|
|
239
240
|
// This maps the capture id to the stream where capture was started.
|
|
240
|
-
// See
|
|
241
|
+
// See wp_cuda_graph_begin_capture(), wp_cuda_graph_end_capture(), and wp_free_device_async().
|
|
241
242
|
static std::unordered_map<uint64_t, CaptureInfo*> g_captures;
|
|
242
243
|
|
|
243
244
|
// Memory allocated during graph capture requires special handling.
|
|
244
|
-
// See
|
|
245
|
+
// See wp_alloc_device_async() and wp_free_device_async().
|
|
245
246
|
static std::unordered_map<void*, GraphAllocInfo> g_graph_allocs;
|
|
246
247
|
|
|
247
248
|
// Memory that cannot be freed immediately gets queued here.
|
|
@@ -252,12 +253,12 @@ static std::vector<FreeInfo> g_deferred_free_list;
|
|
|
252
253
|
// Call unload_deferred_modules() to release.
|
|
253
254
|
static std::vector<ModuleInfo> g_deferred_module_list;
|
|
254
255
|
|
|
255
|
-
void
|
|
256
|
+
void wp_cuda_set_context_restore_policy(bool always_restore)
|
|
256
257
|
{
|
|
257
258
|
ContextGuard::always_restore = always_restore;
|
|
258
259
|
}
|
|
259
260
|
|
|
260
|
-
int
|
|
261
|
+
int wp_cuda_get_context_restore_policy()
|
|
261
262
|
{
|
|
262
263
|
return int(ContextGuard::always_restore);
|
|
263
264
|
}
|
|
@@ -348,7 +349,7 @@ static inline CUcontext get_current_context()
|
|
|
348
349
|
|
|
349
350
|
static inline CUstream get_current_stream(void* context=NULL)
|
|
350
351
|
{
|
|
351
|
-
return static_cast<CUstream>(
|
|
352
|
+
return static_cast<CUstream>(wp_cuda_context_get_stream(context));
|
|
352
353
|
}
|
|
353
354
|
|
|
354
355
|
static ContextInfo* get_context_info(CUcontext ctx)
|
|
@@ -481,7 +482,7 @@ static int unload_deferred_modules(void* context = NULL)
|
|
|
481
482
|
const ModuleInfo& module_info = *it;
|
|
482
483
|
if (module_info.context == context || !context)
|
|
483
484
|
{
|
|
484
|
-
|
|
485
|
+
wp_cuda_unload_module(module_info.context, module_info.module);
|
|
485
486
|
++num_unloaded_modules;
|
|
486
487
|
it = g_deferred_module_list.erase(it);
|
|
487
488
|
}
|
|
@@ -535,41 +536,41 @@ static inline const char* get_cuda_kernel_name(void* kernel)
|
|
|
535
536
|
}
|
|
536
537
|
|
|
537
538
|
|
|
538
|
-
void*
|
|
539
|
+
void* wp_alloc_pinned(size_t s)
|
|
539
540
|
{
|
|
540
541
|
void* ptr = NULL;
|
|
541
542
|
check_cuda(cudaMallocHost(&ptr, s));
|
|
542
543
|
return ptr;
|
|
543
544
|
}
|
|
544
545
|
|
|
545
|
-
void
|
|
546
|
+
void wp_free_pinned(void* ptr)
|
|
546
547
|
{
|
|
547
548
|
cudaFreeHost(ptr);
|
|
548
549
|
}
|
|
549
550
|
|
|
550
|
-
void*
|
|
551
|
+
void* wp_alloc_device(void* context, size_t s)
|
|
551
552
|
{
|
|
552
|
-
int ordinal =
|
|
553
|
+
int ordinal = wp_cuda_context_get_device_ordinal(context);
|
|
553
554
|
|
|
554
555
|
// use stream-ordered allocator if available
|
|
555
|
-
if (
|
|
556
|
-
return
|
|
556
|
+
if (wp_cuda_device_is_mempool_supported(ordinal))
|
|
557
|
+
return wp_alloc_device_async(context, s);
|
|
557
558
|
else
|
|
558
|
-
return
|
|
559
|
+
return wp_alloc_device_default(context, s);
|
|
559
560
|
}
|
|
560
561
|
|
|
561
|
-
void
|
|
562
|
+
void wp_free_device(void* context, void* ptr)
|
|
562
563
|
{
|
|
563
|
-
int ordinal =
|
|
564
|
+
int ordinal = wp_cuda_context_get_device_ordinal(context);
|
|
564
565
|
|
|
565
566
|
// use stream-ordered allocator if available
|
|
566
|
-
if (
|
|
567
|
-
|
|
567
|
+
if (wp_cuda_device_is_mempool_supported(ordinal))
|
|
568
|
+
wp_free_device_async(context, ptr);
|
|
568
569
|
else
|
|
569
|
-
|
|
570
|
+
wp_free_device_default(context, ptr);
|
|
570
571
|
}
|
|
571
572
|
|
|
572
|
-
void*
|
|
573
|
+
void* wp_alloc_device_default(void* context, size_t s)
|
|
573
574
|
{
|
|
574
575
|
ContextGuard guard(context);
|
|
575
576
|
|
|
@@ -579,7 +580,7 @@ void* alloc_device_default(void* context, size_t s)
|
|
|
579
580
|
return ptr;
|
|
580
581
|
}
|
|
581
582
|
|
|
582
|
-
void
|
|
583
|
+
void wp_free_device_default(void* context, void* ptr)
|
|
583
584
|
{
|
|
584
585
|
ContextGuard guard(context);
|
|
585
586
|
|
|
@@ -595,7 +596,7 @@ void free_device_default(void* context, void* ptr)
|
|
|
595
596
|
}
|
|
596
597
|
}
|
|
597
598
|
|
|
598
|
-
void*
|
|
599
|
+
void* wp_alloc_device_async(void* context, size_t s)
|
|
599
600
|
{
|
|
600
601
|
// stream-ordered allocations don't rely on the current context,
|
|
601
602
|
// but we set the context here for consistent behaviour
|
|
@@ -613,7 +614,7 @@ void* alloc_device_async(void* context, size_t s)
|
|
|
613
614
|
if (ptr)
|
|
614
615
|
{
|
|
615
616
|
// if the stream is capturing, the allocation requires special handling
|
|
616
|
-
if (
|
|
617
|
+
if (wp_cuda_stream_is_capturing(stream))
|
|
617
618
|
{
|
|
618
619
|
// check if this is a known capture
|
|
619
620
|
uint64_t capture_id = get_capture_id(stream);
|
|
@@ -634,7 +635,7 @@ void* alloc_device_async(void* context, size_t s)
|
|
|
634
635
|
return ptr;
|
|
635
636
|
}
|
|
636
637
|
|
|
637
|
-
void
|
|
638
|
+
void wp_free_device_async(void* context, void* ptr)
|
|
638
639
|
{
|
|
639
640
|
// stream-ordered allocators generally don't rely on the current context,
|
|
640
641
|
// but we set the context here for consistent behaviour
|
|
@@ -732,7 +733,7 @@ void free_device_async(void* context, void* ptr)
|
|
|
732
733
|
}
|
|
733
734
|
}
|
|
734
735
|
|
|
735
|
-
bool
|
|
736
|
+
bool wp_memcpy_h2d(void* context, void* dest, void* src, size_t n, void* stream)
|
|
736
737
|
{
|
|
737
738
|
ContextGuard guard(context);
|
|
738
739
|
|
|
@@ -751,7 +752,7 @@ bool memcpy_h2d(void* context, void* dest, void* src, size_t n, void* stream)
|
|
|
751
752
|
return result;
|
|
752
753
|
}
|
|
753
754
|
|
|
754
|
-
bool
|
|
755
|
+
bool wp_memcpy_d2h(void* context, void* dest, void* src, size_t n, void* stream)
|
|
755
756
|
{
|
|
756
757
|
ContextGuard guard(context);
|
|
757
758
|
|
|
@@ -770,7 +771,7 @@ bool memcpy_d2h(void* context, void* dest, void* src, size_t n, void* stream)
|
|
|
770
771
|
return result;
|
|
771
772
|
}
|
|
772
773
|
|
|
773
|
-
bool
|
|
774
|
+
bool wp_memcpy_d2d(void* context, void* dest, void* src, size_t n, void* stream)
|
|
774
775
|
{
|
|
775
776
|
ContextGuard guard(context);
|
|
776
777
|
|
|
@@ -789,7 +790,7 @@ bool memcpy_d2d(void* context, void* dest, void* src, size_t n, void* stream)
|
|
|
789
790
|
return result;
|
|
790
791
|
}
|
|
791
792
|
|
|
792
|
-
bool
|
|
793
|
+
bool wp_memcpy_p2p(void* dst_context, void* dst, void* src_context, void* src, size_t n, void* stream)
|
|
793
794
|
{
|
|
794
795
|
// ContextGuard guard(context);
|
|
795
796
|
|
|
@@ -809,7 +810,7 @@ bool memcpy_p2p(void* dst_context, void* dst, void* src_context, void* src, size
|
|
|
809
810
|
// because cudaMemPoolGetAccess() cannot be called during graph capture.
|
|
810
811
|
// - CUDA will report error 1 (invalid argument) if cudaMemcpyAsync() is called but mempool access is not enabled.
|
|
811
812
|
|
|
812
|
-
if (!
|
|
813
|
+
if (!wp_cuda_stream_is_capturing(stream))
|
|
813
814
|
{
|
|
814
815
|
begin_cuda_range(WP_TIMING_MEMCPY, cuda_stream, get_stream_context(stream), "memcpy PtoP");
|
|
815
816
|
|
|
@@ -896,7 +897,7 @@ __global__ void memset_kernel(int* dest, int value, size_t n)
|
|
|
896
897
|
}
|
|
897
898
|
}
|
|
898
899
|
|
|
899
|
-
void
|
|
900
|
+
void wp_memset_device(void* context, void* dest, int value, size_t n)
|
|
900
901
|
{
|
|
901
902
|
ContextGuard guard(context);
|
|
902
903
|
|
|
@@ -940,7 +941,7 @@ __global__ void memtile_value_kernel(T* dst, T value, size_t n)
|
|
|
940
941
|
}
|
|
941
942
|
}
|
|
942
943
|
|
|
943
|
-
void
|
|
944
|
+
void wp_memtile_device(void* context, void* dst, const void* src, size_t srcsize, size_t n)
|
|
944
945
|
{
|
|
945
946
|
ContextGuard guard(context);
|
|
946
947
|
|
|
@@ -976,12 +977,12 @@ void memtile_device(void* context, void* dst, const void* src, size_t srcsize, s
|
|
|
976
977
|
|
|
977
978
|
// copy value to device memory
|
|
978
979
|
// TODO: use a persistent stream-local staging buffer to avoid allocs?
|
|
979
|
-
void* src_devptr =
|
|
980
|
+
void* src_devptr = wp_alloc_device(WP_CURRENT_CONTEXT, srcsize);
|
|
980
981
|
check_cuda(cudaMemcpyAsync(src_devptr, src, srcsize, cudaMemcpyHostToDevice, get_current_stream()));
|
|
981
982
|
|
|
982
983
|
wp_launch_device(WP_CURRENT_CONTEXT, memtile_kernel, n, (dst, src_devptr, srcsize, n));
|
|
983
984
|
|
|
984
|
-
|
|
985
|
+
wp_free_device(WP_CURRENT_CONTEXT, src_devptr);
|
|
985
986
|
|
|
986
987
|
}
|
|
987
988
|
}
|
|
@@ -1208,7 +1209,7 @@ static __global__ void array_copy_fabric_indexed_to_fabric_indexed_kernel(wp::in
|
|
|
1208
1209
|
}
|
|
1209
1210
|
|
|
1210
1211
|
|
|
1211
|
-
WP_API bool
|
|
1212
|
+
WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_type, int src_type, int elem_size)
|
|
1212
1213
|
{
|
|
1213
1214
|
if (!src || !dst)
|
|
1214
1215
|
return false;
|
|
@@ -1600,7 +1601,7 @@ static __global__ void array_fill_fabric_indexed_kernel(wp::indexedfabricarray_t
|
|
|
1600
1601
|
}
|
|
1601
1602
|
|
|
1602
1603
|
|
|
1603
|
-
WP_API void
|
|
1604
|
+
WP_API void wp_array_fill_device(void* context, void* arr_ptr, int arr_type, const void* value_ptr, int value_size)
|
|
1604
1605
|
{
|
|
1605
1606
|
if (!arr_ptr || !value_ptr)
|
|
1606
1607
|
return;
|
|
@@ -1656,7 +1657,7 @@ WP_API void array_fill_device(void* context, void* arr_ptr, int arr_type, const
|
|
|
1656
1657
|
|
|
1657
1658
|
// copy value to device memory
|
|
1658
1659
|
// TODO: use a persistent stream-local staging buffer to avoid allocs?
|
|
1659
|
-
void* value_devptr =
|
|
1660
|
+
void* value_devptr = wp_alloc_device(WP_CURRENT_CONTEXT, value_size);
|
|
1660
1661
|
check_cuda(cudaMemcpyAsync(value_devptr, value_ptr, value_size, cudaMemcpyHostToDevice, get_current_stream()));
|
|
1661
1662
|
|
|
1662
1663
|
// handle fabric arrays
|
|
@@ -1714,20 +1715,20 @@ WP_API void array_fill_device(void* context, void* arr_ptr, int arr_type, const
|
|
|
1714
1715
|
return;
|
|
1715
1716
|
}
|
|
1716
1717
|
|
|
1717
|
-
|
|
1718
|
+
wp_free_device(WP_CURRENT_CONTEXT, value_devptr);
|
|
1718
1719
|
}
|
|
1719
1720
|
|
|
1720
|
-
void
|
|
1721
|
+
void wp_array_scan_int_device(uint64_t in, uint64_t out, int len, bool inclusive)
|
|
1721
1722
|
{
|
|
1722
1723
|
scan_device((const int*)in, (int*)out, len, inclusive);
|
|
1723
1724
|
}
|
|
1724
1725
|
|
|
1725
|
-
void
|
|
1726
|
+
void wp_array_scan_float_device(uint64_t in, uint64_t out, int len, bool inclusive)
|
|
1726
1727
|
{
|
|
1727
1728
|
scan_device((const float*)in, (float*)out, len, inclusive);
|
|
1728
1729
|
}
|
|
1729
1730
|
|
|
1730
|
-
int
|
|
1731
|
+
int wp_cuda_driver_version()
|
|
1731
1732
|
{
|
|
1732
1733
|
int version;
|
|
1733
1734
|
if (check_cu(cuDriverGetVersion_f(&version)))
|
|
@@ -1736,17 +1737,17 @@ int cuda_driver_version()
|
|
|
1736
1737
|
return 0;
|
|
1737
1738
|
}
|
|
1738
1739
|
|
|
1739
|
-
int
|
|
1740
|
+
int wp_cuda_toolkit_version()
|
|
1740
1741
|
{
|
|
1741
1742
|
return CUDA_VERSION;
|
|
1742
1743
|
}
|
|
1743
1744
|
|
|
1744
|
-
bool
|
|
1745
|
+
bool wp_cuda_driver_is_initialized()
|
|
1745
1746
|
{
|
|
1746
1747
|
return is_cuda_driver_initialized();
|
|
1747
1748
|
}
|
|
1748
1749
|
|
|
1749
|
-
int
|
|
1750
|
+
int wp_nvrtc_supported_arch_count()
|
|
1750
1751
|
{
|
|
1751
1752
|
int count;
|
|
1752
1753
|
if (check_nvrtc(nvrtcGetNumSupportedArchs(&count)))
|
|
@@ -1755,7 +1756,7 @@ int nvrtc_supported_arch_count()
|
|
|
1755
1756
|
return 0;
|
|
1756
1757
|
}
|
|
1757
1758
|
|
|
1758
|
-
void
|
|
1759
|
+
void wp_nvrtc_supported_archs(int* archs)
|
|
1759
1760
|
{
|
|
1760
1761
|
if (archs)
|
|
1761
1762
|
{
|
|
@@ -1763,14 +1764,14 @@ void nvrtc_supported_archs(int* archs)
|
|
|
1763
1764
|
}
|
|
1764
1765
|
}
|
|
1765
1766
|
|
|
1766
|
-
int
|
|
1767
|
+
int wp_cuda_device_get_count()
|
|
1767
1768
|
{
|
|
1768
1769
|
int count = 0;
|
|
1769
1770
|
check_cu(cuDeviceGetCount_f(&count));
|
|
1770
1771
|
return count;
|
|
1771
1772
|
}
|
|
1772
1773
|
|
|
1773
|
-
void*
|
|
1774
|
+
void* wp_cuda_device_get_primary_context(int ordinal)
|
|
1774
1775
|
{
|
|
1775
1776
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1776
1777
|
{
|
|
@@ -1786,75 +1787,75 @@ void* cuda_device_get_primary_context(int ordinal)
|
|
|
1786
1787
|
return NULL;
|
|
1787
1788
|
}
|
|
1788
1789
|
|
|
1789
|
-
const char*
|
|
1790
|
+
const char* wp_cuda_device_get_name(int ordinal)
|
|
1790
1791
|
{
|
|
1791
1792
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1792
1793
|
return g_devices[ordinal].name;
|
|
1793
1794
|
return NULL;
|
|
1794
1795
|
}
|
|
1795
1796
|
|
|
1796
|
-
int
|
|
1797
|
+
int wp_cuda_device_get_arch(int ordinal)
|
|
1797
1798
|
{
|
|
1798
1799
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1799
1800
|
return g_devices[ordinal].arch;
|
|
1800
1801
|
return 0;
|
|
1801
1802
|
}
|
|
1802
1803
|
|
|
1803
|
-
int
|
|
1804
|
+
int wp_cuda_device_get_sm_count(int ordinal)
|
|
1804
1805
|
{
|
|
1805
1806
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1806
1807
|
return g_devices[ordinal].sm_count;
|
|
1807
1808
|
return 0;
|
|
1808
1809
|
}
|
|
1809
1810
|
|
|
1810
|
-
void
|
|
1811
|
+
void wp_cuda_device_get_uuid(int ordinal, char uuid[16])
|
|
1811
1812
|
{
|
|
1812
1813
|
memcpy(uuid, g_devices[ordinal].uuid.bytes, sizeof(char)*16);
|
|
1813
1814
|
}
|
|
1814
1815
|
|
|
1815
|
-
int
|
|
1816
|
+
int wp_cuda_device_get_pci_domain_id(int ordinal)
|
|
1816
1817
|
{
|
|
1817
1818
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1818
1819
|
return g_devices[ordinal].pci_domain_id;
|
|
1819
1820
|
return -1;
|
|
1820
1821
|
}
|
|
1821
1822
|
|
|
1822
|
-
int
|
|
1823
|
+
int wp_cuda_device_get_pci_bus_id(int ordinal)
|
|
1823
1824
|
{
|
|
1824
1825
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1825
1826
|
return g_devices[ordinal].pci_bus_id;
|
|
1826
1827
|
return -1;
|
|
1827
1828
|
}
|
|
1828
1829
|
|
|
1829
|
-
int
|
|
1830
|
+
int wp_cuda_device_get_pci_device_id(int ordinal)
|
|
1830
1831
|
{
|
|
1831
1832
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1832
1833
|
return g_devices[ordinal].pci_device_id;
|
|
1833
1834
|
return -1;
|
|
1834
1835
|
}
|
|
1835
1836
|
|
|
1836
|
-
int
|
|
1837
|
+
int wp_cuda_device_is_uva(int ordinal)
|
|
1837
1838
|
{
|
|
1838
1839
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1839
1840
|
return g_devices[ordinal].is_uva;
|
|
1840
1841
|
return 0;
|
|
1841
1842
|
}
|
|
1842
1843
|
|
|
1843
|
-
int
|
|
1844
|
+
int wp_cuda_device_is_mempool_supported(int ordinal)
|
|
1844
1845
|
{
|
|
1845
1846
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1846
1847
|
return g_devices[ordinal].is_mempool_supported;
|
|
1847
1848
|
return 0;
|
|
1848
1849
|
}
|
|
1849
1850
|
|
|
1850
|
-
int
|
|
1851
|
+
int wp_cuda_device_is_ipc_supported(int ordinal)
|
|
1851
1852
|
{
|
|
1852
1853
|
if (ordinal >= 0 && ordinal < int(g_devices.size()))
|
|
1853
1854
|
return g_devices[ordinal].is_ipc_supported;
|
|
1854
1855
|
return 0;
|
|
1855
1856
|
}
|
|
1856
1857
|
|
|
1857
|
-
int
|
|
1858
|
+
int wp_cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold)
|
|
1858
1859
|
{
|
|
1859
1860
|
if (ordinal < 0 || ordinal > int(g_devices.size()))
|
|
1860
1861
|
{
|
|
@@ -1881,7 +1882,7 @@ int cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold)
|
|
|
1881
1882
|
return 1; // success
|
|
1882
1883
|
}
|
|
1883
1884
|
|
|
1884
|
-
uint64_t
|
|
1885
|
+
uint64_t wp_cuda_device_get_mempool_release_threshold(int ordinal)
|
|
1885
1886
|
{
|
|
1886
1887
|
if (ordinal < 0 || ordinal > int(g_devices.size()))
|
|
1887
1888
|
{
|
|
@@ -1909,7 +1910,7 @@ uint64_t cuda_device_get_mempool_release_threshold(int ordinal)
|
|
|
1909
1910
|
return threshold;
|
|
1910
1911
|
}
|
|
1911
1912
|
|
|
1912
|
-
uint64_t
|
|
1913
|
+
uint64_t wp_cuda_device_get_mempool_used_mem_current(int ordinal)
|
|
1913
1914
|
{
|
|
1914
1915
|
if (ordinal < 0 || ordinal > int(g_devices.size()))
|
|
1915
1916
|
{
|
|
@@ -1937,7 +1938,7 @@ uint64_t cuda_device_get_mempool_used_mem_current(int ordinal)
|
|
|
1937
1938
|
return mem_used;
|
|
1938
1939
|
}
|
|
1939
1940
|
|
|
1940
|
-
uint64_t
|
|
1941
|
+
uint64_t wp_cuda_device_get_mempool_used_mem_high(int ordinal)
|
|
1941
1942
|
{
|
|
1942
1943
|
if (ordinal < 0 || ordinal > int(g_devices.size()))
|
|
1943
1944
|
{
|
|
@@ -1965,7 +1966,7 @@ uint64_t cuda_device_get_mempool_used_mem_high(int ordinal)
|
|
|
1965
1966
|
return mem_high_water_mark;
|
|
1966
1967
|
}
|
|
1967
1968
|
|
|
1968
|
-
void
|
|
1969
|
+
void wp_cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_mem)
|
|
1969
1970
|
{
|
|
1970
1971
|
// use temporary storage if user didn't specify pointers
|
|
1971
1972
|
size_t tmp_free_mem, tmp_total_mem;
|
|
@@ -2002,12 +2003,12 @@ void cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_me
|
|
|
2002
2003
|
}
|
|
2003
2004
|
|
|
2004
2005
|
|
|
2005
|
-
void*
|
|
2006
|
+
void* wp_cuda_context_get_current()
|
|
2006
2007
|
{
|
|
2007
2008
|
return get_current_context();
|
|
2008
2009
|
}
|
|
2009
2010
|
|
|
2010
|
-
void
|
|
2011
|
+
void wp_cuda_context_set_current(void* context)
|
|
2011
2012
|
{
|
|
2012
2013
|
CUcontext ctx = static_cast<CUcontext>(context);
|
|
2013
2014
|
CUcontext prev_ctx = NULL;
|
|
@@ -2018,18 +2019,18 @@ void cuda_context_set_current(void* context)
|
|
|
2018
2019
|
}
|
|
2019
2020
|
}
|
|
2020
2021
|
|
|
2021
|
-
void
|
|
2022
|
+
void wp_cuda_context_push_current(void* context)
|
|
2022
2023
|
{
|
|
2023
2024
|
check_cu(cuCtxPushCurrent_f(static_cast<CUcontext>(context)));
|
|
2024
2025
|
}
|
|
2025
2026
|
|
|
2026
|
-
void
|
|
2027
|
+
void wp_cuda_context_pop_current()
|
|
2027
2028
|
{
|
|
2028
2029
|
CUcontext context;
|
|
2029
2030
|
check_cu(cuCtxPopCurrent_f(&context));
|
|
2030
2031
|
}
|
|
2031
2032
|
|
|
2032
|
-
void*
|
|
2033
|
+
void* wp_cuda_context_create(int device_ordinal)
|
|
2033
2034
|
{
|
|
2034
2035
|
CUcontext ctx = NULL;
|
|
2035
2036
|
CUdevice device;
|
|
@@ -2038,15 +2039,15 @@ void* cuda_context_create(int device_ordinal)
|
|
|
2038
2039
|
return ctx;
|
|
2039
2040
|
}
|
|
2040
2041
|
|
|
2041
|
-
void
|
|
2042
|
+
void wp_cuda_context_destroy(void* context)
|
|
2042
2043
|
{
|
|
2043
2044
|
if (context)
|
|
2044
2045
|
{
|
|
2045
2046
|
CUcontext ctx = static_cast<CUcontext>(context);
|
|
2046
2047
|
|
|
2047
2048
|
// ensure this is not the current context
|
|
2048
|
-
if (ctx ==
|
|
2049
|
-
|
|
2049
|
+
if (ctx == wp_cuda_context_get_current())
|
|
2050
|
+
wp_cuda_context_set_current(NULL);
|
|
2050
2051
|
|
|
2051
2052
|
// release the cached info about this context
|
|
2052
2053
|
ContextInfo* info = get_context_info(ctx);
|
|
@@ -2065,7 +2066,7 @@ void cuda_context_destroy(void* context)
|
|
|
2065
2066
|
}
|
|
2066
2067
|
}
|
|
2067
2068
|
|
|
2068
|
-
void
|
|
2069
|
+
void wp_cuda_context_synchronize(void* context)
|
|
2069
2070
|
{
|
|
2070
2071
|
ContextGuard guard(context);
|
|
2071
2072
|
|
|
@@ -2079,10 +2080,10 @@ void cuda_context_synchronize(void* context)
|
|
|
2079
2080
|
|
|
2080
2081
|
unload_deferred_modules(context);
|
|
2081
2082
|
|
|
2082
|
-
// check_cuda(cudaDeviceGraphMemTrim(
|
|
2083
|
+
// check_cuda(cudaDeviceGraphMemTrim(wp_cuda_context_get_device_ordinal(context)));
|
|
2083
2084
|
}
|
|
2084
2085
|
|
|
2085
|
-
uint64_t
|
|
2086
|
+
uint64_t wp_cuda_context_check(void* context)
|
|
2086
2087
|
{
|
|
2087
2088
|
ContextGuard guard(context);
|
|
2088
2089
|
|
|
@@ -2104,13 +2105,13 @@ uint64_t cuda_context_check(void* context)
|
|
|
2104
2105
|
}
|
|
2105
2106
|
|
|
2106
2107
|
|
|
2107
|
-
int
|
|
2108
|
+
int wp_cuda_context_get_device_ordinal(void* context)
|
|
2108
2109
|
{
|
|
2109
2110
|
ContextInfo* info = get_context_info(static_cast<CUcontext>(context));
|
|
2110
2111
|
return info && info->device_info ? info->device_info->ordinal : -1;
|
|
2111
2112
|
}
|
|
2112
2113
|
|
|
2113
|
-
int
|
|
2114
|
+
int wp_cuda_context_is_primary(void* context)
|
|
2114
2115
|
{
|
|
2115
2116
|
CUcontext ctx = static_cast<CUcontext>(context);
|
|
2116
2117
|
ContextInfo* context_info = get_context_info(ctx);
|
|
@@ -2137,7 +2138,7 @@ int cuda_context_is_primary(void* context)
|
|
|
2137
2138
|
return 0;
|
|
2138
2139
|
}
|
|
2139
2140
|
|
|
2140
|
-
void*
|
|
2141
|
+
void* wp_cuda_context_get_stream(void* context)
|
|
2141
2142
|
{
|
|
2142
2143
|
ContextInfo* info = get_context_info(static_cast<CUcontext>(context));
|
|
2143
2144
|
if (info)
|
|
@@ -2147,7 +2148,7 @@ void* cuda_context_get_stream(void* context)
|
|
|
2147
2148
|
return NULL;
|
|
2148
2149
|
}
|
|
2149
2150
|
|
|
2150
|
-
void
|
|
2151
|
+
void wp_cuda_context_set_stream(void* context, void* stream, int sync)
|
|
2151
2152
|
{
|
|
2152
2153
|
ContextInfo* context_info = get_context_info(static_cast<CUcontext>(context));
|
|
2153
2154
|
if (context_info)
|
|
@@ -2171,7 +2172,7 @@ void cuda_context_set_stream(void* context, void* stream, int sync)
|
|
|
2171
2172
|
}
|
|
2172
2173
|
}
|
|
2173
2174
|
|
|
2174
|
-
int
|
|
2175
|
+
int wp_cuda_is_peer_access_supported(int target_ordinal, int peer_ordinal)
|
|
2175
2176
|
{
|
|
2176
2177
|
int num_devices = int(g_devices.size());
|
|
2177
2178
|
|
|
@@ -2196,7 +2197,7 @@ int cuda_is_peer_access_supported(int target_ordinal, int peer_ordinal)
|
|
|
2196
2197
|
return can_access;
|
|
2197
2198
|
}
|
|
2198
2199
|
|
|
2199
|
-
int
|
|
2200
|
+
int wp_cuda_is_peer_access_enabled(void* target_context, void* peer_context)
|
|
2200
2201
|
{
|
|
2201
2202
|
if (!target_context || !peer_context)
|
|
2202
2203
|
{
|
|
@@ -2207,8 +2208,8 @@ int cuda_is_peer_access_enabled(void* target_context, void* peer_context)
|
|
|
2207
2208
|
if (target_context == peer_context)
|
|
2208
2209
|
return 1;
|
|
2209
2210
|
|
|
2210
|
-
int target_ordinal =
|
|
2211
|
-
int peer_ordinal =
|
|
2211
|
+
int target_ordinal = wp_cuda_context_get_device_ordinal(target_context);
|
|
2212
|
+
int peer_ordinal = wp_cuda_context_get_device_ordinal(peer_context);
|
|
2212
2213
|
|
|
2213
2214
|
// check if peer access is supported
|
|
2214
2215
|
int can_access = 0;
|
|
@@ -2241,7 +2242,7 @@ int cuda_is_peer_access_enabled(void* target_context, void* peer_context)
|
|
|
2241
2242
|
}
|
|
2242
2243
|
}
|
|
2243
2244
|
|
|
2244
|
-
int
|
|
2245
|
+
int wp_cuda_set_peer_access_enabled(void* target_context, void* peer_context, int enable)
|
|
2245
2246
|
{
|
|
2246
2247
|
if (!target_context || !peer_context)
|
|
2247
2248
|
{
|
|
@@ -2252,8 +2253,8 @@ int cuda_set_peer_access_enabled(void* target_context, void* peer_context, int e
|
|
|
2252
2253
|
if (target_context == peer_context)
|
|
2253
2254
|
return 1; // no-op
|
|
2254
2255
|
|
|
2255
|
-
int target_ordinal =
|
|
2256
|
-
int peer_ordinal =
|
|
2256
|
+
int target_ordinal = wp_cuda_context_get_device_ordinal(target_context);
|
|
2257
|
+
int peer_ordinal = wp_cuda_context_get_device_ordinal(peer_context);
|
|
2257
2258
|
|
|
2258
2259
|
// check if peer access is supported
|
|
2259
2260
|
int can_access = 0;
|
|
@@ -2298,7 +2299,7 @@ int cuda_set_peer_access_enabled(void* target_context, void* peer_context, int e
|
|
|
2298
2299
|
return 1; // success
|
|
2299
2300
|
}
|
|
2300
2301
|
|
|
2301
|
-
int
|
|
2302
|
+
int wp_cuda_is_mempool_access_enabled(int target_ordinal, int peer_ordinal)
|
|
2302
2303
|
{
|
|
2303
2304
|
int num_devices = int(g_devices.size());
|
|
2304
2305
|
|
|
@@ -2334,7 +2335,7 @@ int cuda_is_mempool_access_enabled(int target_ordinal, int peer_ordinal)
|
|
|
2334
2335
|
return 0;
|
|
2335
2336
|
}
|
|
2336
2337
|
|
|
2337
|
-
int
|
|
2338
|
+
int wp_cuda_set_mempool_access_enabled(int target_ordinal, int peer_ordinal, int enable)
|
|
2338
2339
|
{
|
|
2339
2340
|
int num_devices = int(g_devices.size());
|
|
2340
2341
|
|
|
@@ -2380,13 +2381,13 @@ int cuda_set_mempool_access_enabled(int target_ordinal, int peer_ordinal, int en
|
|
|
2380
2381
|
return 1; // success
|
|
2381
2382
|
}
|
|
2382
2383
|
|
|
2383
|
-
void
|
|
2384
|
+
void wp_cuda_ipc_get_mem_handle(void* ptr, char* out_buffer) {
|
|
2384
2385
|
CUipcMemHandle memHandle;
|
|
2385
2386
|
check_cu(cuIpcGetMemHandle_f(&memHandle, (CUdeviceptr)ptr));
|
|
2386
2387
|
memcpy(out_buffer, memHandle.reserved, CU_IPC_HANDLE_SIZE);
|
|
2387
2388
|
}
|
|
2388
2389
|
|
|
2389
|
-
void*
|
|
2390
|
+
void* wp_cuda_ipc_open_mem_handle(void* context, char* handle) {
|
|
2390
2391
|
ContextGuard guard(context);
|
|
2391
2392
|
|
|
2392
2393
|
CUipcMemHandle memHandle;
|
|
@@ -2401,11 +2402,11 @@ void* cuda_ipc_open_mem_handle(void* context, char* handle) {
|
|
|
2401
2402
|
return NULL;
|
|
2402
2403
|
}
|
|
2403
2404
|
|
|
2404
|
-
void
|
|
2405
|
+
void wp_cuda_ipc_close_mem_handle(void* ptr) {
|
|
2405
2406
|
check_cu(cuIpcCloseMemHandle_f((CUdeviceptr) ptr));
|
|
2406
2407
|
}
|
|
2407
2408
|
|
|
2408
|
-
void
|
|
2409
|
+
void wp_cuda_ipc_get_event_handle(void* context, void* event, char* out_buffer) {
|
|
2409
2410
|
ContextGuard guard(context);
|
|
2410
2411
|
|
|
2411
2412
|
CUipcEventHandle eventHandle;
|
|
@@ -2413,7 +2414,7 @@ void cuda_ipc_get_event_handle(void* context, void* event, char* out_buffer) {
|
|
|
2413
2414
|
memcpy(out_buffer, eventHandle.reserved, CU_IPC_HANDLE_SIZE);
|
|
2414
2415
|
}
|
|
2415
2416
|
|
|
2416
|
-
void*
|
|
2417
|
+
void* wp_cuda_ipc_open_event_handle(void* context, char* handle) {
|
|
2417
2418
|
ContextGuard guard(context);
|
|
2418
2419
|
|
|
2419
2420
|
CUipcEventHandle eventHandle;
|
|
@@ -2427,31 +2428,34 @@ void* cuda_ipc_open_event_handle(void* context, char* handle) {
|
|
|
2427
2428
|
return NULL;
|
|
2428
2429
|
}
|
|
2429
2430
|
|
|
2430
|
-
void*
|
|
2431
|
+
void* wp_cuda_stream_create(void* context, int priority)
|
|
2431
2432
|
{
|
|
2432
2433
|
ContextGuard guard(context, true);
|
|
2433
2434
|
|
|
2434
2435
|
CUstream stream;
|
|
2435
2436
|
if (check_cu(cuStreamCreateWithPriority_f(&stream, CU_STREAM_DEFAULT, priority)))
|
|
2436
2437
|
{
|
|
2437
|
-
|
|
2438
|
+
wp_cuda_stream_register(WP_CURRENT_CONTEXT, stream);
|
|
2438
2439
|
return stream;
|
|
2439
2440
|
}
|
|
2440
2441
|
else
|
|
2441
2442
|
return NULL;
|
|
2442
2443
|
}
|
|
2443
2444
|
|
|
2444
|
-
void
|
|
2445
|
+
void wp_cuda_stream_destroy(void* context, void* stream)
|
|
2445
2446
|
{
|
|
2446
2447
|
if (!stream)
|
|
2447
2448
|
return;
|
|
2448
2449
|
|
|
2449
|
-
|
|
2450
|
+
wp_cuda_stream_unregister(context, stream);
|
|
2451
|
+
|
|
2452
|
+
// release temporary radix sort buffer associated with this stream
|
|
2453
|
+
radix_sort_release(context, stream);
|
|
2450
2454
|
|
|
2451
2455
|
check_cu(cuStreamDestroy_f(static_cast<CUstream>(stream)));
|
|
2452
2456
|
}
|
|
2453
2457
|
|
|
2454
|
-
int
|
|
2458
|
+
int wp_cuda_stream_query(void* stream)
|
|
2455
2459
|
{
|
|
2456
2460
|
CUresult res = cuStreamQuery_f(static_cast<CUstream>(stream));
|
|
2457
2461
|
|
|
@@ -2464,7 +2468,7 @@ int cuda_stream_query(void* stream)
|
|
|
2464
2468
|
return res;
|
|
2465
2469
|
}
|
|
2466
2470
|
|
|
2467
|
-
void
|
|
2471
|
+
void wp_cuda_stream_register(void* context, void* stream)
|
|
2468
2472
|
{
|
|
2469
2473
|
if (!stream)
|
|
2470
2474
|
return;
|
|
@@ -2476,7 +2480,7 @@ void cuda_stream_register(void* context, void* stream)
|
|
|
2476
2480
|
check_cu(cuEventCreate_f(&stream_info.cached_event, CU_EVENT_DISABLE_TIMING));
|
|
2477
2481
|
}
|
|
2478
2482
|
|
|
2479
|
-
void
|
|
2483
|
+
void wp_cuda_stream_unregister(void* context, void* stream)
|
|
2480
2484
|
{
|
|
2481
2485
|
if (!stream)
|
|
2482
2486
|
return;
|
|
@@ -2500,28 +2504,28 @@ void cuda_stream_unregister(void* context, void* stream)
|
|
|
2500
2504
|
}
|
|
2501
2505
|
}
|
|
2502
2506
|
|
|
2503
|
-
void*
|
|
2507
|
+
void* wp_cuda_stream_get_current()
|
|
2504
2508
|
{
|
|
2505
2509
|
return get_current_stream();
|
|
2506
2510
|
}
|
|
2507
2511
|
|
|
2508
|
-
void
|
|
2512
|
+
void wp_cuda_stream_synchronize(void* stream)
|
|
2509
2513
|
{
|
|
2510
2514
|
check_cu(cuStreamSynchronize_f(static_cast<CUstream>(stream)));
|
|
2511
2515
|
}
|
|
2512
2516
|
|
|
2513
|
-
void
|
|
2517
|
+
void wp_cuda_stream_wait_event(void* stream, void* event)
|
|
2514
2518
|
{
|
|
2515
2519
|
check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
|
|
2516
2520
|
}
|
|
2517
2521
|
|
|
2518
|
-
void
|
|
2522
|
+
void wp_cuda_stream_wait_stream(void* stream, void* other_stream, void* event)
|
|
2519
2523
|
{
|
|
2520
2524
|
check_cu(cuEventRecord_f(static_cast<CUevent>(event), static_cast<CUstream>(other_stream)));
|
|
2521
2525
|
check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
|
|
2522
2526
|
}
|
|
2523
2527
|
|
|
2524
|
-
int
|
|
2528
|
+
int wp_cuda_stream_is_capturing(void* stream)
|
|
2525
2529
|
{
|
|
2526
2530
|
cudaStreamCaptureStatus status = cudaStreamCaptureStatusNone;
|
|
2527
2531
|
check_cuda(cudaStreamIsCapturing(static_cast<cudaStream_t>(stream), &status));
|
|
@@ -2529,12 +2533,12 @@ int cuda_stream_is_capturing(void* stream)
|
|
|
2529
2533
|
return int(status != cudaStreamCaptureStatusNone);
|
|
2530
2534
|
}
|
|
2531
2535
|
|
|
2532
|
-
uint64_t
|
|
2536
|
+
uint64_t wp_cuda_stream_get_capture_id(void* stream)
|
|
2533
2537
|
{
|
|
2534
2538
|
return get_capture_id(static_cast<CUstream>(stream));
|
|
2535
2539
|
}
|
|
2536
2540
|
|
|
2537
|
-
int
|
|
2541
|
+
int wp_cuda_stream_get_priority(void* stream)
|
|
2538
2542
|
{
|
|
2539
2543
|
int priority = 0;
|
|
2540
2544
|
check_cuda(cuStreamGetPriority_f(static_cast<CUstream>(stream), &priority));
|
|
@@ -2542,7 +2546,7 @@ int cuda_stream_get_priority(void* stream)
|
|
|
2542
2546
|
return priority;
|
|
2543
2547
|
}
|
|
2544
2548
|
|
|
2545
|
-
void*
|
|
2549
|
+
void* wp_cuda_event_create(void* context, unsigned flags)
|
|
2546
2550
|
{
|
|
2547
2551
|
ContextGuard guard(context, true);
|
|
2548
2552
|
|
|
@@ -2553,12 +2557,12 @@ void* cuda_event_create(void* context, unsigned flags)
|
|
|
2553
2557
|
return NULL;
|
|
2554
2558
|
}
|
|
2555
2559
|
|
|
2556
|
-
void
|
|
2560
|
+
void wp_cuda_event_destroy(void* event)
|
|
2557
2561
|
{
|
|
2558
2562
|
check_cu(cuEventDestroy_f(static_cast<CUevent>(event)));
|
|
2559
2563
|
}
|
|
2560
2564
|
|
|
2561
|
-
int
|
|
2565
|
+
int wp_cuda_event_query(void* event)
|
|
2562
2566
|
{
|
|
2563
2567
|
CUresult res = cuEventQuery_f(static_cast<CUevent>(event));
|
|
2564
2568
|
|
|
@@ -2571,9 +2575,9 @@ int cuda_event_query(void* event)
|
|
|
2571
2575
|
return res;
|
|
2572
2576
|
}
|
|
2573
2577
|
|
|
2574
|
-
void
|
|
2578
|
+
void wp_cuda_event_record(void* event, void* stream, bool timing)
|
|
2575
2579
|
{
|
|
2576
|
-
if (timing && !g_captures.empty() &&
|
|
2580
|
+
if (timing && !g_captures.empty() && wp_cuda_stream_is_capturing(stream))
|
|
2577
2581
|
{
|
|
2578
2582
|
// record timing event during graph capture
|
|
2579
2583
|
check_cu(cuEventRecordWithFlags_f(static_cast<CUevent>(event), static_cast<CUstream>(stream), CU_EVENT_RECORD_EXTERNAL));
|
|
@@ -2584,12 +2588,12 @@ void cuda_event_record(void* event, void* stream, bool timing)
|
|
|
2584
2588
|
}
|
|
2585
2589
|
}
|
|
2586
2590
|
|
|
2587
|
-
void
|
|
2591
|
+
void wp_cuda_event_synchronize(void* event)
|
|
2588
2592
|
{
|
|
2589
2593
|
check_cu(cuEventSynchronize_f(static_cast<CUevent>(event)));
|
|
2590
2594
|
}
|
|
2591
2595
|
|
|
2592
|
-
float
|
|
2596
|
+
float wp_cuda_event_elapsed_time(void* start_event, void* end_event)
|
|
2593
2597
|
{
|
|
2594
2598
|
float elapsed = 0.0f;
|
|
2595
2599
|
cudaEvent_t start = static_cast<cudaEvent_t>(start_event);
|
|
@@ -2598,7 +2602,7 @@ float cuda_event_elapsed_time(void* start_event, void* end_event)
|
|
|
2598
2602
|
return elapsed;
|
|
2599
2603
|
}
|
|
2600
2604
|
|
|
2601
|
-
bool
|
|
2605
|
+
bool wp_cuda_graph_begin_capture(void* context, void* stream, int external)
|
|
2602
2606
|
{
|
|
2603
2607
|
ContextGuard guard(context);
|
|
2604
2608
|
|
|
@@ -2645,7 +2649,7 @@ bool cuda_graph_begin_capture(void* context, void* stream, int external)
|
|
|
2645
2649
|
return true;
|
|
2646
2650
|
}
|
|
2647
2651
|
|
|
2648
|
-
bool
|
|
2652
|
+
bool wp_cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
|
|
2649
2653
|
{
|
|
2650
2654
|
ContextGuard guard(context);
|
|
2651
2655
|
|
|
@@ -2780,14 +2784,14 @@ bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
|
|
|
2780
2784
|
return true;
|
|
2781
2785
|
}
|
|
2782
2786
|
|
|
2783
|
-
bool
|
|
2787
|
+
bool wp_capture_debug_dot_print(void* graph, const char *path, uint32_t flags)
|
|
2784
2788
|
{
|
|
2785
2789
|
if (!check_cuda(cudaGraphDebugDotPrint((cudaGraph_t)graph, path, flags)))
|
|
2786
2790
|
return false;
|
|
2787
2791
|
return true;
|
|
2788
2792
|
}
|
|
2789
2793
|
|
|
2790
|
-
bool
|
|
2794
|
+
bool wp_cuda_graph_create_exec(void* context, void* stream, void* graph, void** graph_exec_ret)
|
|
2791
2795
|
{
|
|
2792
2796
|
ContextGuard guard(context);
|
|
2793
2797
|
|
|
@@ -2811,11 +2815,12 @@ bool cuda_graph_create_exec(void* context, void* stream, void* graph, void** gra
|
|
|
2811
2815
|
// Support for conditional graph nodes available with CUDA 12.4+.
|
|
2812
2816
|
#if CUDA_VERSION >= 12040
|
|
2813
2817
|
|
|
2814
|
-
// CUBIN data for compiled conditional modules, loaded on demand, keyed on device architecture
|
|
2815
|
-
|
|
2818
|
+
// CUBIN or PTX data for compiled conditional modules, loaded on demand, keyed on device architecture
|
|
2819
|
+
using ModuleKey = std::pair<int, bool>; // <arch, use_ptx>
|
|
2820
|
+
static std::map<ModuleKey, void*> g_conditional_modules;
|
|
2816
2821
|
|
|
2817
2822
|
// Compile module with conditional helper kernels
|
|
2818
|
-
static void* compile_conditional_module(int arch)
|
|
2823
|
+
static void* compile_conditional_module(int arch, bool use_ptx)
|
|
2819
2824
|
{
|
|
2820
2825
|
static const char* kernel_source = R"(
|
|
2821
2826
|
typedef __device_builtin__ unsigned long long cudaGraphConditionalHandle;
|
|
@@ -2844,8 +2849,9 @@ static void* compile_conditional_module(int arch)
|
|
|
2844
2849
|
)";
|
|
2845
2850
|
|
|
2846
2851
|
// avoid recompilation
|
|
2847
|
-
|
|
2848
|
-
|
|
2852
|
+
ModuleKey key = {arch, use_ptx};
|
|
2853
|
+
auto it = g_conditional_modules.find(key);
|
|
2854
|
+
if (it != g_conditional_modules.end())
|
|
2849
2855
|
return it->second;
|
|
2850
2856
|
|
|
2851
2857
|
nvrtcProgram prog;
|
|
@@ -2853,11 +2859,23 @@ static void* compile_conditional_module(int arch)
|
|
|
2853
2859
|
return NULL;
|
|
2854
2860
|
|
|
2855
2861
|
char arch_opt[128];
|
|
2856
|
-
|
|
2862
|
+
if (use_ptx)
|
|
2863
|
+
snprintf(arch_opt, sizeof(arch_opt), "--gpu-architecture=compute_%d", arch);
|
|
2864
|
+
else
|
|
2865
|
+
snprintf(arch_opt, sizeof(arch_opt), "--gpu-architecture=sm_%d", arch);
|
|
2857
2866
|
|
|
2858
2867
|
std::vector<const char*> opts;
|
|
2859
2868
|
opts.push_back(arch_opt);
|
|
2860
2869
|
|
|
2870
|
+
const bool print_debug = (std::getenv("WARP_DEBUG") != nullptr);
|
|
2871
|
+
if (print_debug)
|
|
2872
|
+
{
|
|
2873
|
+
printf("NVRTC options (conditional module, arch=%d, use_ptx=%s):\n", arch, use_ptx ? "true" : "false");
|
|
2874
|
+
for(auto o: opts) {
|
|
2875
|
+
printf("%s\n", o);
|
|
2876
|
+
}
|
|
2877
|
+
}
|
|
2878
|
+
|
|
2861
2879
|
if (!check_nvrtc(nvrtcCompileProgram(prog, int(opts.size()), opts.data())))
|
|
2862
2880
|
{
|
|
2863
2881
|
size_t log_size;
|
|
@@ -2874,23 +2892,37 @@ static void* compile_conditional_module(int arch)
|
|
|
2874
2892
|
// get output
|
|
2875
2893
|
char* output = NULL;
|
|
2876
2894
|
size_t output_size = 0;
|
|
2877
|
-
|
|
2878
|
-
if (
|
|
2895
|
+
|
|
2896
|
+
if (use_ptx)
|
|
2897
|
+
{
|
|
2898
|
+
check_nvrtc(nvrtcGetPTXSize(prog, &output_size));
|
|
2899
|
+
if (output_size > 0)
|
|
2900
|
+
{
|
|
2901
|
+
output = new char[output_size];
|
|
2902
|
+
if (check_nvrtc(nvrtcGetPTX(prog, output)))
|
|
2903
|
+
g_conditional_modules[key] = output;
|
|
2904
|
+
}
|
|
2905
|
+
}
|
|
2906
|
+
else
|
|
2879
2907
|
{
|
|
2880
|
-
|
|
2881
|
-
if (
|
|
2882
|
-
|
|
2908
|
+
check_nvrtc(nvrtcGetCUBINSize(prog, &output_size));
|
|
2909
|
+
if (output_size > 0)
|
|
2910
|
+
{
|
|
2911
|
+
output = new char[output_size];
|
|
2912
|
+
if (check_nvrtc(nvrtcGetCUBIN(prog, output)))
|
|
2913
|
+
g_conditional_modules[key] = output;
|
|
2914
|
+
}
|
|
2883
2915
|
}
|
|
2884
2916
|
|
|
2885
2917
|
nvrtcDestroyProgram(&prog);
|
|
2886
2918
|
|
|
2887
|
-
// return CUBIN data
|
|
2919
|
+
// return CUBIN or PTX data
|
|
2888
2920
|
return output;
|
|
2889
2921
|
}
|
|
2890
2922
|
|
|
2891
2923
|
|
|
2892
2924
|
// Load module with conditional helper kernels
|
|
2893
|
-
static CUmodule load_conditional_module(void* context)
|
|
2925
|
+
static CUmodule load_conditional_module(void* context, int arch, bool use_ptx)
|
|
2894
2926
|
{
|
|
2895
2927
|
ContextInfo* context_info = get_context_info(context);
|
|
2896
2928
|
if (!context_info)
|
|
@@ -2900,17 +2932,15 @@ static CUmodule load_conditional_module(void* context)
|
|
|
2900
2932
|
if (context_info->conditional_module)
|
|
2901
2933
|
return context_info->conditional_module;
|
|
2902
2934
|
|
|
2903
|
-
int arch = context_info->device_info->arch;
|
|
2904
|
-
|
|
2905
2935
|
// compile if needed
|
|
2906
|
-
void* compiled_module = compile_conditional_module(arch);
|
|
2936
|
+
void* compiled_module = compile_conditional_module(arch, use_ptx);
|
|
2907
2937
|
if (!compiled_module)
|
|
2908
2938
|
{
|
|
2909
2939
|
fprintf(stderr, "Warp error: Failed to compile conditional kernels\n");
|
|
2910
2940
|
return NULL;
|
|
2911
2941
|
}
|
|
2912
2942
|
|
|
2913
|
-
// load module
|
|
2943
|
+
// load module (handles both PTX and CUBIN data automatically)
|
|
2914
2944
|
CUmodule module = NULL;
|
|
2915
2945
|
if (!check_cu(cuModuleLoadDataEx_f(&module, compiled_module, 0, NULL, NULL)))
|
|
2916
2946
|
{
|
|
@@ -2923,10 +2953,10 @@ static CUmodule load_conditional_module(void* context)
|
|
|
2923
2953
|
return module;
|
|
2924
2954
|
}
|
|
2925
2955
|
|
|
2926
|
-
static CUfunction get_conditional_kernel(void* context, const char* name)
|
|
2956
|
+
static CUfunction get_conditional_kernel(void* context, int arch, bool use_ptx, const char* name)
|
|
2927
2957
|
{
|
|
2928
2958
|
// load module if needed
|
|
2929
|
-
CUmodule module = load_conditional_module(context);
|
|
2959
|
+
CUmodule module = load_conditional_module(context, arch, use_ptx);
|
|
2930
2960
|
if (!module)
|
|
2931
2961
|
return NULL;
|
|
2932
2962
|
|
|
@@ -2940,7 +2970,7 @@ static CUfunction get_conditional_kernel(void* context, const char* name)
|
|
|
2940
2970
|
return kernel;
|
|
2941
2971
|
}
|
|
2942
2972
|
|
|
2943
|
-
bool
|
|
2973
|
+
bool wp_cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
|
|
2944
2974
|
{
|
|
2945
2975
|
ContextGuard guard(context);
|
|
2946
2976
|
|
|
@@ -2950,7 +2980,7 @@ bool cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
|
|
|
2950
2980
|
return true;
|
|
2951
2981
|
}
|
|
2952
2982
|
|
|
2953
|
-
bool
|
|
2983
|
+
bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
|
|
2954
2984
|
{
|
|
2955
2985
|
ContextGuard guard(context);
|
|
2956
2986
|
|
|
@@ -2976,7 +3006,7 @@ bool cuda_graph_resume_capture(void* context, void* stream, void* graph)
|
|
|
2976
3006
|
// https://developer.nvidia.com/blog/dynamic-control-flow-in-cuda-graphs-with-conditional-nodes/
|
|
2977
3007
|
// condition is a gpu pointer
|
|
2978
3008
|
// if_graph_ret and else_graph_ret should be NULL if not needed
|
|
2979
|
-
bool
|
|
3009
|
+
bool wp_cuda_graph_insert_if_else(void* context, void* stream, int arch, bool use_ptx, int* condition, void** if_graph_ret, void** else_graph_ret)
|
|
2980
3010
|
{
|
|
2981
3011
|
bool has_if = if_graph_ret != NULL;
|
|
2982
3012
|
bool has_else = else_graph_ret != NULL;
|
|
@@ -2991,21 +3021,21 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
2991
3021
|
CUstream cuda_stream = static_cast<CUstream>(stream);
|
|
2992
3022
|
|
|
2993
3023
|
// Get the current stream capturing graph
|
|
2994
|
-
|
|
3024
|
+
CUstreamCaptureStatus capture_status = CU_STREAM_CAPTURE_STATUS_NONE;
|
|
2995
3025
|
cudaGraph_t cuda_graph = NULL;
|
|
2996
3026
|
const cudaGraphNode_t* capture_deps = NULL;
|
|
2997
3027
|
size_t dep_count = 0;
|
|
2998
|
-
if (!
|
|
3028
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
|
|
2999
3029
|
return false;
|
|
3000
3030
|
|
|
3001
3031
|
// abort if not capturing
|
|
3002
|
-
if (!cuda_graph || capture_status !=
|
|
3032
|
+
if (!cuda_graph || capture_status != CU_STREAM_CAPTURE_STATUS_ACTIVE)
|
|
3003
3033
|
{
|
|
3004
3034
|
wp::set_error_string("Stream is not capturing");
|
|
3005
3035
|
return false;
|
|
3006
3036
|
}
|
|
3007
3037
|
|
|
3008
|
-
//int driver_version =
|
|
3038
|
+
//int driver_version = wp_cuda_driver_version();
|
|
3009
3039
|
|
|
3010
3040
|
// IF-ELSE nodes are only supported with CUDA 12.8+
|
|
3011
3041
|
// Somehow child graphs produce wrong results when an else branch is used
|
|
@@ -3013,15 +3043,15 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
3013
3043
|
if (num_branches == 1 /*|| driver_version >= 12080*/)
|
|
3014
3044
|
{
|
|
3015
3045
|
cudaGraphConditionalHandle handle;
|
|
3016
|
-
cudaGraphConditionalHandleCreate(&handle, cuda_graph);
|
|
3046
|
+
check_cuda(cudaGraphConditionalHandleCreate(&handle, cuda_graph));
|
|
3017
3047
|
|
|
3018
3048
|
// run a kernel to set the condition handle from the condition pointer
|
|
3019
3049
|
// (need to negate the condition if only the else branch is used)
|
|
3020
3050
|
CUfunction kernel;
|
|
3021
3051
|
if (has_if)
|
|
3022
|
-
kernel = get_conditional_kernel(context, "set_conditional_if_handle_kernel");
|
|
3052
|
+
kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_handle_kernel");
|
|
3023
3053
|
else
|
|
3024
|
-
kernel = get_conditional_kernel(context, "set_conditional_else_handle_kernel");
|
|
3054
|
+
kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_else_handle_kernel");
|
|
3025
3055
|
|
|
3026
3056
|
if (!kernel)
|
|
3027
3057
|
{
|
|
@@ -3033,22 +3063,23 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
3033
3063
|
kernel_args[0] = &handle;
|
|
3034
3064
|
kernel_args[1] = &condition;
|
|
3035
3065
|
|
|
3036
|
-
if (!
|
|
3066
|
+
if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
|
|
3037
3067
|
return false;
|
|
3038
3068
|
|
|
3039
|
-
if (!
|
|
3069
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
|
|
3040
3070
|
return false;
|
|
3041
3071
|
|
|
3042
3072
|
// create conditional node
|
|
3043
|
-
|
|
3044
|
-
|
|
3073
|
+
CUgraphNode condition_node;
|
|
3074
|
+
CUgraphNodeParams condition_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
|
|
3045
3075
|
condition_params.conditional.handle = handle;
|
|
3046
|
-
condition_params.conditional.type =
|
|
3076
|
+
condition_params.conditional.type = CU_GRAPH_COND_TYPE_IF;
|
|
3047
3077
|
condition_params.conditional.size = num_branches;
|
|
3048
|
-
|
|
3078
|
+
condition_params.conditional.ctx = get_current_context();
|
|
3079
|
+
if (!check_cu(cuGraphAddNode_f(&condition_node, cuda_graph, capture_deps, NULL, dep_count, &condition_params)))
|
|
3049
3080
|
return false;
|
|
3050
3081
|
|
|
3051
|
-
if (!
|
|
3082
|
+
if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &condition_node, 1, cudaStreamSetCaptureDependencies)))
|
|
3052
3083
|
return false;
|
|
3053
3084
|
|
|
3054
3085
|
if (num_branches == 1)
|
|
@@ -3068,10 +3099,10 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
3068
3099
|
{
|
|
3069
3100
|
// Create IF node followed by an additional IF node with negated condition
|
|
3070
3101
|
cudaGraphConditionalHandle if_handle, else_handle;
|
|
3071
|
-
cudaGraphConditionalHandleCreate(&if_handle, cuda_graph);
|
|
3072
|
-
cudaGraphConditionalHandleCreate(&else_handle, cuda_graph);
|
|
3102
|
+
check_cuda(cudaGraphConditionalHandleCreate(&if_handle, cuda_graph));
|
|
3103
|
+
check_cuda(cudaGraphConditionalHandleCreate(&else_handle, cuda_graph));
|
|
3073
3104
|
|
|
3074
|
-
CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_else_handles_kernel");
|
|
3105
|
+
CUfunction kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_else_handles_kernel");
|
|
3075
3106
|
if (!kernel)
|
|
3076
3107
|
{
|
|
3077
3108
|
wp::set_error_string("Failed to get built-in conditional kernel");
|
|
@@ -3086,26 +3117,28 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
3086
3117
|
if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
|
|
3087
3118
|
return false;
|
|
3088
3119
|
|
|
3089
|
-
if (!
|
|
3120
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
|
|
3090
3121
|
return false;
|
|
3091
3122
|
|
|
3092
|
-
|
|
3093
|
-
|
|
3123
|
+
CUgraphNode if_node;
|
|
3124
|
+
CUgraphNodeParams if_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
|
|
3094
3125
|
if_params.conditional.handle = if_handle;
|
|
3095
|
-
if_params.conditional.type =
|
|
3126
|
+
if_params.conditional.type = CU_GRAPH_COND_TYPE_IF;
|
|
3096
3127
|
if_params.conditional.size = 1;
|
|
3097
|
-
|
|
3128
|
+
if_params.conditional.ctx = get_current_context();
|
|
3129
|
+
if (!check_cu(cuGraphAddNode_f(&if_node, cuda_graph, capture_deps, NULL, dep_count, &if_params)))
|
|
3098
3130
|
return false;
|
|
3099
3131
|
|
|
3100
|
-
|
|
3101
|
-
|
|
3132
|
+
CUgraphNode else_node;
|
|
3133
|
+
CUgraphNodeParams else_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
|
|
3102
3134
|
else_params.conditional.handle = else_handle;
|
|
3103
|
-
else_params.conditional.type =
|
|
3135
|
+
else_params.conditional.type = CU_GRAPH_COND_TYPE_IF;
|
|
3104
3136
|
else_params.conditional.size = 1;
|
|
3105
|
-
|
|
3137
|
+
else_params.conditional.ctx = get_current_context();
|
|
3138
|
+
if (!check_cu(cuGraphAddNode_f(&else_node, cuda_graph, &if_node, NULL, 1, &else_params)))
|
|
3106
3139
|
return false;
|
|
3107
3140
|
|
|
3108
|
-
if (!
|
|
3141
|
+
if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &else_node, 1, cudaStreamSetCaptureDependencies)))
|
|
3109
3142
|
return false;
|
|
3110
3143
|
|
|
3111
3144
|
*if_graph_ret = if_params.conditional.phGraph_out[0];
|
|
@@ -3115,21 +3148,143 @@ bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void
|
|
|
3115
3148
|
return true;
|
|
3116
3149
|
}
|
|
3117
3150
|
|
|
3118
|
-
|
|
3151
|
+
// graph node type names for intelligible error reporting
|
|
3152
|
+
static const char* get_graph_node_type_name(CUgraphNodeType type)
|
|
3153
|
+
{
|
|
3154
|
+
static const std::unordered_map<CUgraphNodeType, const char*> names
|
|
3155
|
+
{
|
|
3156
|
+
{CU_GRAPH_NODE_TYPE_KERNEL, "kernel launch"},
|
|
3157
|
+
{CU_GRAPH_NODE_TYPE_MEMCPY, "memcpy"},
|
|
3158
|
+
{CU_GRAPH_NODE_TYPE_MEMSET, "memset"},
|
|
3159
|
+
{CU_GRAPH_NODE_TYPE_HOST, "host execution"},
|
|
3160
|
+
{CU_GRAPH_NODE_TYPE_GRAPH, "graph launch"},
|
|
3161
|
+
{CU_GRAPH_NODE_TYPE_EMPTY, "empty node"},
|
|
3162
|
+
{CU_GRAPH_NODE_TYPE_WAIT_EVENT, "event wait"},
|
|
3163
|
+
{CU_GRAPH_NODE_TYPE_EVENT_RECORD, "event record"},
|
|
3164
|
+
{CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL, "semaphore signal"},
|
|
3165
|
+
{CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT, "semaphore wait"},
|
|
3166
|
+
{CU_GRAPH_NODE_TYPE_MEM_ALLOC, "memory allocation"},
|
|
3167
|
+
{CU_GRAPH_NODE_TYPE_MEM_FREE, "memory deallocation"},
|
|
3168
|
+
{CU_GRAPH_NODE_TYPE_BATCH_MEM_OP, "batched mem op"},
|
|
3169
|
+
{CU_GRAPH_NODE_TYPE_CONDITIONAL, "conditional node"},
|
|
3170
|
+
};
|
|
3171
|
+
|
|
3172
|
+
auto it = names.find(type);
|
|
3173
|
+
if (it != names.end())
|
|
3174
|
+
return it->second;
|
|
3175
|
+
else
|
|
3176
|
+
return "unknown node";
|
|
3177
|
+
}
|
|
3178
|
+
|
|
3179
|
+
// check if a graph can be launched as a child graph
|
|
3180
|
+
static bool is_valid_child_graph(void* child_graph)
|
|
3181
|
+
{
|
|
3182
|
+
// disallowed child graph nodes according to the documentation of cuGraphAddChildGraphNode()
|
|
3183
|
+
static const std::unordered_set<CUgraphNodeType> disallowed_nodes
|
|
3184
|
+
{
|
|
3185
|
+
CU_GRAPH_NODE_TYPE_MEM_ALLOC,
|
|
3186
|
+
CU_GRAPH_NODE_TYPE_MEM_FREE,
|
|
3187
|
+
CU_GRAPH_NODE_TYPE_CONDITIONAL,
|
|
3188
|
+
};
|
|
3189
|
+
|
|
3190
|
+
if (!child_graph)
|
|
3191
|
+
{
|
|
3192
|
+
wp::set_error_string("Child graph is null");
|
|
3193
|
+
return false;
|
|
3194
|
+
}
|
|
3195
|
+
|
|
3196
|
+
size_t num_nodes = 0;
|
|
3197
|
+
if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)child_graph, NULL, &num_nodes)))
|
|
3198
|
+
return false;
|
|
3199
|
+
std::vector<cudaGraphNode_t> nodes(num_nodes);
|
|
3200
|
+
if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)child_graph, nodes.data(), &num_nodes)))
|
|
3201
|
+
return false;
|
|
3202
|
+
|
|
3203
|
+
for (size_t i = 0; i < num_nodes; i++)
|
|
3204
|
+
{
|
|
3205
|
+
// note: we use the driver API to get the node type, otherwise some nodes are not recognized correctly
|
|
3206
|
+
CUgraphNodeType node_type;
|
|
3207
|
+
check_cu(cuGraphNodeGetType_f(nodes[i], &node_type));
|
|
3208
|
+
auto it = disallowed_nodes.find(node_type);
|
|
3209
|
+
if (it != disallowed_nodes.end())
|
|
3210
|
+
{
|
|
3211
|
+
wp::set_error_string("Child graph contains an unsupported operation (%s)", get_graph_node_type_name(node_type));
|
|
3212
|
+
return false;
|
|
3213
|
+
}
|
|
3214
|
+
}
|
|
3215
|
+
|
|
3216
|
+
return true;
|
|
3217
|
+
}
|
|
3218
|
+
|
|
3219
|
+
// check if a graph can be used as a conditional body graph
|
|
3220
|
+
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#condtional-node-body-graph-requirements
|
|
3221
|
+
bool wp_cuda_graph_check_conditional_body(void* body_graph)
|
|
3119
3222
|
{
|
|
3223
|
+
static const std::unordered_set<CUgraphNodeType> allowed_nodes
|
|
3224
|
+
{
|
|
3225
|
+
CU_GRAPH_NODE_TYPE_MEMCPY,
|
|
3226
|
+
CU_GRAPH_NODE_TYPE_MEMSET,
|
|
3227
|
+
CU_GRAPH_NODE_TYPE_KERNEL,
|
|
3228
|
+
CU_GRAPH_NODE_TYPE_GRAPH,
|
|
3229
|
+
CU_GRAPH_NODE_TYPE_EMPTY,
|
|
3230
|
+
CU_GRAPH_NODE_TYPE_CONDITIONAL,
|
|
3231
|
+
};
|
|
3232
|
+
|
|
3233
|
+
if (!body_graph)
|
|
3234
|
+
{
|
|
3235
|
+
wp::set_error_string("Conditional body graph is null");
|
|
3236
|
+
return false;
|
|
3237
|
+
}
|
|
3238
|
+
|
|
3239
|
+
size_t num_nodes = 0;
|
|
3240
|
+
if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)body_graph, NULL, &num_nodes)))
|
|
3241
|
+
return false;
|
|
3242
|
+
std::vector<cudaGraphNode_t> nodes(num_nodes);
|
|
3243
|
+
if (!check_cuda(cudaGraphGetNodes((cudaGraph_t)body_graph, nodes.data(), &num_nodes)))
|
|
3244
|
+
return false;
|
|
3245
|
+
|
|
3246
|
+
for (size_t i = 0; i < num_nodes; i++)
|
|
3247
|
+
{
|
|
3248
|
+
// note: we use the driver API to get the node type, otherwise some nodes are not recognized correctly
|
|
3249
|
+
CUgraphNodeType node_type;
|
|
3250
|
+
check_cu(cuGraphNodeGetType_f(nodes[i], &node_type));
|
|
3251
|
+
if (allowed_nodes.find(node_type) == allowed_nodes.end())
|
|
3252
|
+
{
|
|
3253
|
+
wp::set_error_string("Conditional body graph contains an unsupported operation (%s)", get_graph_node_type_name(node_type));
|
|
3254
|
+
return false;
|
|
3255
|
+
}
|
|
3256
|
+
else if (node_type == CU_GRAPH_NODE_TYPE_GRAPH)
|
|
3257
|
+
{
|
|
3258
|
+
// check nested child graphs recursively
|
|
3259
|
+
cudaGraph_t child_graph = NULL;
|
|
3260
|
+
if (!check_cuda(cudaGraphChildGraphNodeGetGraph(nodes[i], &child_graph)))
|
|
3261
|
+
return false;
|
|
3262
|
+
if (!wp_cuda_graph_check_conditional_body(child_graph))
|
|
3263
|
+
return false;
|
|
3264
|
+
}
|
|
3265
|
+
}
|
|
3266
|
+
|
|
3267
|
+
return true;
|
|
3268
|
+
}
|
|
3269
|
+
|
|
3270
|
+
bool wp_cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
|
|
3271
|
+
{
|
|
3272
|
+
if (!is_valid_child_graph(child_graph))
|
|
3273
|
+
return false;
|
|
3274
|
+
|
|
3120
3275
|
ContextGuard guard(context);
|
|
3121
3276
|
|
|
3122
3277
|
CUstream cuda_stream = static_cast<CUstream>(stream);
|
|
3123
3278
|
|
|
3124
3279
|
// Get the current stream capturing graph
|
|
3125
|
-
|
|
3280
|
+
CUstreamCaptureStatus capture_status = CU_STREAM_CAPTURE_STATUS_NONE;
|
|
3126
3281
|
void* cuda_graph = NULL;
|
|
3127
|
-
const
|
|
3282
|
+
const CUgraphNode* capture_deps = NULL;
|
|
3128
3283
|
size_t dep_count = 0;
|
|
3129
|
-
if (!
|
|
3284
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, (cudaGraph_t*)&cuda_graph, &capture_deps, &dep_count)))
|
|
3130
3285
|
return false;
|
|
3131
3286
|
|
|
3132
|
-
if (!
|
|
3287
|
+
if (!wp_cuda_graph_pause_capture(context, cuda_stream, &cuda_graph))
|
|
3133
3288
|
return false;
|
|
3134
3289
|
|
|
3135
3290
|
cudaGraphNode_t body_node;
|
|
@@ -3139,16 +3294,16 @@ bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_grap
|
|
|
3139
3294
|
static_cast<cudaGraph_t>(child_graph))))
|
|
3140
3295
|
return false;
|
|
3141
3296
|
|
|
3142
|
-
if (!
|
|
3297
|
+
if (!wp_cuda_graph_resume_capture(context, cuda_stream, cuda_graph))
|
|
3143
3298
|
return false;
|
|
3144
3299
|
|
|
3145
|
-
if (!
|
|
3300
|
+
if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &body_node, 1, cudaStreamSetCaptureDependencies)))
|
|
3146
3301
|
return false;
|
|
3147
3302
|
|
|
3148
3303
|
return true;
|
|
3149
3304
|
}
|
|
3150
3305
|
|
|
3151
|
-
bool
|
|
3306
|
+
bool wp_cuda_graph_insert_while(void* context, void* stream, int arch, bool use_ptx, int* condition, void** body_graph_ret, uint64_t* handle_ret)
|
|
3152
3307
|
{
|
|
3153
3308
|
// if there's no body, it's a no-op
|
|
3154
3309
|
if (!body_graph_ret)
|
|
@@ -3159,15 +3314,15 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
|
|
|
3159
3314
|
CUstream cuda_stream = static_cast<CUstream>(stream);
|
|
3160
3315
|
|
|
3161
3316
|
// Get the current stream capturing graph
|
|
3162
|
-
|
|
3317
|
+
CUstreamCaptureStatus capture_status = CU_STREAM_CAPTURE_STATUS_NONE;
|
|
3163
3318
|
cudaGraph_t cuda_graph = NULL;
|
|
3164
3319
|
const cudaGraphNode_t* capture_deps = NULL;
|
|
3165
3320
|
size_t dep_count = 0;
|
|
3166
|
-
if (!
|
|
3321
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
|
|
3167
3322
|
return false;
|
|
3168
3323
|
|
|
3169
3324
|
// abort if not capturing
|
|
3170
|
-
if (!cuda_graph || capture_status !=
|
|
3325
|
+
if (!cuda_graph || capture_status != CU_STREAM_CAPTURE_STATUS_ACTIVE)
|
|
3171
3326
|
{
|
|
3172
3327
|
wp::set_error_string("Stream is not capturing");
|
|
3173
3328
|
return false;
|
|
@@ -3178,7 +3333,7 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
|
|
|
3178
3333
|
return false;
|
|
3179
3334
|
|
|
3180
3335
|
// launch a kernel to set the condition handle from condition pointer
|
|
3181
|
-
CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_handle_kernel");
|
|
3336
|
+
CUfunction kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_handle_kernel");
|
|
3182
3337
|
if (!kernel)
|
|
3183
3338
|
{
|
|
3184
3339
|
wp::set_error_string("Failed to get built-in conditional kernel");
|
|
@@ -3192,19 +3347,20 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
|
|
|
3192
3347
|
if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
|
|
3193
3348
|
return false;
|
|
3194
3349
|
|
|
3195
|
-
if (!
|
|
3350
|
+
if (!check_cu(cuStreamGetCaptureInfo_f(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
|
|
3196
3351
|
return false;
|
|
3197
3352
|
|
|
3198
3353
|
// insert conditional graph node
|
|
3199
|
-
|
|
3200
|
-
|
|
3354
|
+
CUgraphNode while_node;
|
|
3355
|
+
CUgraphNodeParams while_params = { CU_GRAPH_NODE_TYPE_CONDITIONAL };
|
|
3201
3356
|
while_params.conditional.handle = handle;
|
|
3202
|
-
while_params.conditional.type =
|
|
3357
|
+
while_params.conditional.type = CU_GRAPH_COND_TYPE_WHILE;
|
|
3203
3358
|
while_params.conditional.size = 1;
|
|
3204
|
-
|
|
3359
|
+
while_params.conditional.ctx = get_current_context();
|
|
3360
|
+
if (!check_cu(cuGraphAddNode_f(&while_node, cuda_graph, capture_deps, NULL, dep_count, &while_params)))
|
|
3205
3361
|
return false;
|
|
3206
3362
|
|
|
3207
|
-
if (!
|
|
3363
|
+
if (!check_cu(cuStreamUpdateCaptureDependencies_f(cuda_stream, &while_node, 1, cudaStreamSetCaptureDependencies)))
|
|
3208
3364
|
return false;
|
|
3209
3365
|
|
|
3210
3366
|
*body_graph_ret = while_params.conditional.phGraph_out[0];
|
|
@@ -3213,14 +3369,14 @@ bool cuda_graph_insert_while(void* context, void* stream, int* condition, void**
|
|
|
3213
3369
|
return true;
|
|
3214
3370
|
}
|
|
3215
3371
|
|
|
3216
|
-
bool
|
|
3372
|
+
bool wp_cuda_graph_set_condition(void* context, void* stream, int arch, bool use_ptx, int* condition, uint64_t handle)
|
|
3217
3373
|
{
|
|
3218
3374
|
ContextGuard guard(context);
|
|
3219
3375
|
|
|
3220
3376
|
CUstream cuda_stream = static_cast<CUstream>(stream);
|
|
3221
3377
|
|
|
3222
3378
|
// launch a kernel to set the condition handle from condition pointer
|
|
3223
|
-
CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_handle_kernel");
|
|
3379
|
+
CUfunction kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_handle_kernel");
|
|
3224
3380
|
if (!kernel)
|
|
3225
3381
|
{
|
|
3226
3382
|
wp::set_error_string("Failed to get built-in conditional kernel");
|
|
@@ -3240,37 +3396,43 @@ bool cuda_graph_set_condition(void* context, void* stream, int* condition, uint6
|
|
|
3240
3396
|
#else
|
|
3241
3397
|
// stubs for conditional graph node API if CUDA toolkit is too old.
|
|
3242
3398
|
|
|
3243
|
-
bool
|
|
3399
|
+
bool wp_cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
|
|
3400
|
+
{
|
|
3401
|
+
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3402
|
+
return false;
|
|
3403
|
+
}
|
|
3404
|
+
|
|
3405
|
+
bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
|
|
3244
3406
|
{
|
|
3245
3407
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3246
3408
|
return false;
|
|
3247
3409
|
}
|
|
3248
3410
|
|
|
3249
|
-
bool
|
|
3411
|
+
bool wp_cuda_graph_insert_if_else(void* context, void* stream, int arch, bool use_ptx, int* condition, void** if_graph_ret, void** else_graph_ret)
|
|
3250
3412
|
{
|
|
3251
3413
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3252
3414
|
return false;
|
|
3253
3415
|
}
|
|
3254
3416
|
|
|
3255
|
-
bool
|
|
3417
|
+
bool wp_cuda_graph_insert_while(void* context, void* stream, int arch, bool use_ptx, int* condition, void** body_graph_ret, uint64_t* handle_ret)
|
|
3256
3418
|
{
|
|
3257
3419
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3258
3420
|
return false;
|
|
3259
3421
|
}
|
|
3260
3422
|
|
|
3261
|
-
bool
|
|
3423
|
+
bool wp_cuda_graph_set_condition(void* context, void* stream, int arch, bool use_ptx, int* condition, uint64_t handle)
|
|
3262
3424
|
{
|
|
3263
3425
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3264
3426
|
return false;
|
|
3265
3427
|
}
|
|
3266
3428
|
|
|
3267
|
-
bool
|
|
3429
|
+
bool wp_cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
|
|
3268
3430
|
{
|
|
3269
3431
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3270
3432
|
return false;
|
|
3271
3433
|
}
|
|
3272
3434
|
|
|
3273
|
-
bool
|
|
3435
|
+
bool wp_cuda_graph_check_conditional_body(void* body_graph)
|
|
3274
3436
|
{
|
|
3275
3437
|
wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
|
|
3276
3438
|
return false;
|
|
@@ -3279,7 +3441,7 @@ bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_grap
|
|
|
3279
3441
|
#endif // support for conditional graph nodes
|
|
3280
3442
|
|
|
3281
3443
|
|
|
3282
|
-
bool
|
|
3444
|
+
bool wp_cuda_graph_launch(void* graph_exec, void* stream)
|
|
3283
3445
|
{
|
|
3284
3446
|
// TODO: allow naming graphs?
|
|
3285
3447
|
begin_cuda_range(WP_TIMING_GRAPH, stream, get_stream_context(stream), "graph");
|
|
@@ -3291,14 +3453,14 @@ bool cuda_graph_launch(void* graph_exec, void* stream)
|
|
|
3291
3453
|
return result;
|
|
3292
3454
|
}
|
|
3293
3455
|
|
|
3294
|
-
bool
|
|
3456
|
+
bool wp_cuda_graph_destroy(void* context, void* graph)
|
|
3295
3457
|
{
|
|
3296
3458
|
ContextGuard guard(context);
|
|
3297
3459
|
|
|
3298
3460
|
return check_cuda(cudaGraphDestroy((cudaGraph_t)graph));
|
|
3299
3461
|
}
|
|
3300
3462
|
|
|
3301
|
-
bool
|
|
3463
|
+
bool wp_cuda_graph_exec_destroy(void* context, void* graph_exec)
|
|
3302
3464
|
{
|
|
3303
3465
|
ContextGuard guard(context);
|
|
3304
3466
|
|
|
@@ -3350,7 +3512,7 @@ bool write_file(const char* data, size_t size, std::string filename, const char*
|
|
|
3350
3512
|
}
|
|
3351
3513
|
#endif
|
|
3352
3514
|
|
|
3353
|
-
size_t
|
|
3515
|
+
size_t wp_cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, bool lineinfo, bool compile_time_trace, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes, int* ltoir_input_types)
|
|
3354
3516
|
{
|
|
3355
3517
|
// use file extension to determine whether to output PTX or CUBIN
|
|
3356
3518
|
const char* output_ext = strrchr(output_path, '.');
|
|
@@ -3406,9 +3568,9 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
|
|
|
3406
3568
|
{
|
|
3407
3569
|
opts.push_back("--define-macro=_DEBUG");
|
|
3408
3570
|
opts.push_back("--generate-line-info");
|
|
3409
|
-
|
|
3410
|
-
//
|
|
3411
|
-
|
|
3571
|
+
#ifndef _WIN32
|
|
3572
|
+
opts.push_back("--device-debug"); // -G
|
|
3573
|
+
#endif
|
|
3412
3574
|
}
|
|
3413
3575
|
else
|
|
3414
3576
|
{
|
|
@@ -3678,7 +3840,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
|
|
|
3678
3840
|
}
|
|
3679
3841
|
}
|
|
3680
3842
|
|
|
3681
|
-
bool
|
|
3843
|
+
bool wp_cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size)
|
|
3682
3844
|
{
|
|
3683
3845
|
|
|
3684
3846
|
CHECK_ANY(ltoir_output_path != nullptr);
|
|
@@ -3724,7 +3886,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
|
|
|
3724
3886
|
return res;
|
|
3725
3887
|
}
|
|
3726
3888
|
|
|
3727
|
-
bool
|
|
3889
|
+
bool wp_cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int arrangement_A, int arrangement_B, int arrangement_C, int num_threads)
|
|
3728
3890
|
{
|
|
3729
3891
|
|
|
3730
3892
|
CHECK_ANY(ltoir_output_path != nullptr);
|
|
@@ -3769,7 +3931,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
|
|
|
3769
3931
|
return res;
|
|
3770
3932
|
}
|
|
3771
3933
|
|
|
3772
|
-
bool
|
|
3934
|
+
bool wp_cuda_compile_solver(const char* fatbin_output_path, const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int NRHS, int function, int side, int diag, int precision, int arrangement_A, int arrangement_B, int fill_mode, int num_threads)
|
|
3773
3935
|
{
|
|
3774
3936
|
|
|
3775
3937
|
CHECK_ANY(ltoir_output_path != nullptr);
|
|
@@ -3832,7 +3994,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
|
|
|
3832
3994
|
|
|
3833
3995
|
#endif
|
|
3834
3996
|
|
|
3835
|
-
void*
|
|
3997
|
+
void* wp_cuda_load_module(void* context, const char* path)
|
|
3836
3998
|
{
|
|
3837
3999
|
ContextGuard guard(context);
|
|
3838
4000
|
|
|
@@ -3951,7 +4113,7 @@ void* cuda_load_module(void* context, const char* path)
|
|
|
3951
4113
|
return module;
|
|
3952
4114
|
}
|
|
3953
4115
|
|
|
3954
|
-
void
|
|
4116
|
+
void wp_cuda_unload_module(void* context, void* module)
|
|
3955
4117
|
{
|
|
3956
4118
|
// ensure there are no graph captures in progress
|
|
3957
4119
|
if (g_captures.empty())
|
|
@@ -3970,7 +4132,7 @@ void cuda_unload_module(void* context, void* module)
|
|
|
3970
4132
|
}
|
|
3971
4133
|
|
|
3972
4134
|
|
|
3973
|
-
int
|
|
4135
|
+
int wp_cuda_get_max_shared_memory(void* context)
|
|
3974
4136
|
{
|
|
3975
4137
|
ContextInfo* info = get_context_info(context);
|
|
3976
4138
|
if (!info)
|
|
@@ -3980,7 +4142,7 @@ int cuda_get_max_shared_memory(void* context)
|
|
|
3980
4142
|
return max_smem_bytes;
|
|
3981
4143
|
}
|
|
3982
4144
|
|
|
3983
|
-
bool
|
|
4145
|
+
bool wp_cuda_configure_kernel_shared_memory(void* kernel, int size)
|
|
3984
4146
|
{
|
|
3985
4147
|
int requested_smem_bytes = size;
|
|
3986
4148
|
|
|
@@ -3992,7 +4154,7 @@ bool cuda_configure_kernel_shared_memory(void* kernel, int size)
|
|
|
3992
4154
|
return true;
|
|
3993
4155
|
}
|
|
3994
4156
|
|
|
3995
|
-
void*
|
|
4157
|
+
void* wp_cuda_get_kernel(void* context, void* module, const char* name)
|
|
3996
4158
|
{
|
|
3997
4159
|
ContextGuard guard(context);
|
|
3998
4160
|
|
|
@@ -4007,7 +4169,7 @@ void* cuda_get_kernel(void* context, void* module, const char* name)
|
|
|
4007
4169
|
return kernel;
|
|
4008
4170
|
}
|
|
4009
4171
|
|
|
4010
|
-
size_t
|
|
4172
|
+
size_t wp_cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, int block_dim, int shared_memory_bytes, void** args, void* stream)
|
|
4011
4173
|
{
|
|
4012
4174
|
ContextGuard guard(context);
|
|
4013
4175
|
|
|
@@ -4061,21 +4223,21 @@ size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_block
|
|
|
4061
4223
|
return res;
|
|
4062
4224
|
}
|
|
4063
4225
|
|
|
4064
|
-
void
|
|
4226
|
+
void wp_cuda_graphics_map(void* context, void* resource)
|
|
4065
4227
|
{
|
|
4066
4228
|
ContextGuard guard(context);
|
|
4067
4229
|
|
|
4068
4230
|
check_cu(cuGraphicsMapResources_f(1, (CUgraphicsResource*)resource, get_current_stream()));
|
|
4069
4231
|
}
|
|
4070
4232
|
|
|
4071
|
-
void
|
|
4233
|
+
void wp_cuda_graphics_unmap(void* context, void* resource)
|
|
4072
4234
|
{
|
|
4073
4235
|
ContextGuard guard(context);
|
|
4074
4236
|
|
|
4075
4237
|
check_cu(cuGraphicsUnmapResources_f(1, (CUgraphicsResource*)resource, get_current_stream()));
|
|
4076
4238
|
}
|
|
4077
4239
|
|
|
4078
|
-
void
|
|
4240
|
+
void wp_cuda_graphics_device_ptr_and_size(void* context, void* resource, uint64_t* ptr, size_t* size)
|
|
4079
4241
|
{
|
|
4080
4242
|
ContextGuard guard(context);
|
|
4081
4243
|
|
|
@@ -4087,7 +4249,7 @@ void cuda_graphics_device_ptr_and_size(void* context, void* resource, uint64_t*
|
|
|
4087
4249
|
*size = bytes;
|
|
4088
4250
|
}
|
|
4089
4251
|
|
|
4090
|
-
void*
|
|
4252
|
+
void* wp_cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsigned int flags)
|
|
4091
4253
|
{
|
|
4092
4254
|
ContextGuard guard(context);
|
|
4093
4255
|
|
|
@@ -4102,7 +4264,7 @@ void* cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsign
|
|
|
4102
4264
|
return resource;
|
|
4103
4265
|
}
|
|
4104
4266
|
|
|
4105
|
-
void
|
|
4267
|
+
void wp_cuda_graphics_unregister_resource(void* context, void* resource)
|
|
4106
4268
|
{
|
|
4107
4269
|
ContextGuard guard(context);
|
|
4108
4270
|
|
|
@@ -4111,25 +4273,25 @@ void cuda_graphics_unregister_resource(void* context, void* resource)
|
|
|
4111
4273
|
delete res;
|
|
4112
4274
|
}
|
|
4113
4275
|
|
|
4114
|
-
void
|
|
4276
|
+
void wp_cuda_timing_begin(int flags)
|
|
4115
4277
|
{
|
|
4116
4278
|
g_cuda_timing_state = new CudaTimingState(flags, g_cuda_timing_state);
|
|
4117
4279
|
}
|
|
4118
4280
|
|
|
4119
|
-
int
|
|
4281
|
+
int wp_cuda_timing_get_result_count()
|
|
4120
4282
|
{
|
|
4121
4283
|
if (g_cuda_timing_state)
|
|
4122
4284
|
return int(g_cuda_timing_state->ranges.size());
|
|
4123
4285
|
return 0;
|
|
4124
4286
|
}
|
|
4125
4287
|
|
|
4126
|
-
void
|
|
4288
|
+
void wp_cuda_timing_end(timing_result_t* results, int size)
|
|
4127
4289
|
{
|
|
4128
4290
|
if (!g_cuda_timing_state)
|
|
4129
4291
|
return;
|
|
4130
4292
|
|
|
4131
4293
|
// number of results to write to the user buffer
|
|
4132
|
-
int count = std::min(
|
|
4294
|
+
int count = std::min(wp_cuda_timing_get_result_count(), size);
|
|
4133
4295
|
|
|
4134
4296
|
// compute timings and write results
|
|
4135
4297
|
for (int i = 0; i < count; i++)
|
|
@@ -4163,7 +4325,6 @@ void cuda_timing_end(timing_result_t* results, int size)
|
|
|
4163
4325
|
#include "reduce.cu"
|
|
4164
4326
|
#include "runlength_encode.cu"
|
|
4165
4327
|
#include "scan.cu"
|
|
4166
|
-
#include "marching.cu"
|
|
4167
4328
|
#include "sparse.cu"
|
|
4168
4329
|
#include "volume.cu"
|
|
4169
4330
|
#include "volume_builder.cu"
|