warp-lang 1.9.0__py3-none-macosx_10_13_universal2.whl → 1.9.1__py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

warp/native/tile.h CHANGED
@@ -542,7 +542,7 @@ struct tile_register_t
 
     // define the += operator which is used during backward pass codegen
     // when returning a register tile from a user defined function
-    inline CUDA_CALLABLE auto& operator += (tile_register_t<T, Layout>& rhs)
+    inline CUDA_CALLABLE auto& operator += (const tile_register_t<T, Layout>& rhs)
     {
         grad_add(rhs);
         return *this;
@@ -658,7 +658,7 @@ struct tile_register_t
             data[i] += tile.data[i];
         }
 
-    CUDA_CALLABLE void grad_add(const tile_global_t<T, typename Layout::Shape>& global)
+    inline CUDA_CALLABLE void grad_add(const tile_global_t<T, typename Layout::Shape>& global)
     {
         apply([&](int reg, auto c) {data[reg] += global.load_grad(c);});
     }
@@ -758,6 +758,7 @@ inline CUDA_CALLABLE void* tile_alloc_shared(int num_bytes, bool init=false, boo
 
     // one entry per-thread so no need for synchronization
     smem_base[WP_TILE_THREAD_IDX] += tile_align(num_bytes);
+    assert(smem_base[WP_TILE_THREAD_IDX] >= 0);
 
 #ifdef __CUDA_ARCH__
     extern __shared__ char dynamic_smem_base[];
@@ -905,6 +906,28 @@ struct tile_shared_t
     {
     }
 
+    // we delete the copy constructor because in the case the shared tile is owning,
+    // this leads to a double deallocation.
+    // this also forces one to handle copies explicitly
+    inline CUDA_CALLABLE tile_shared_t(const tile_shared_t& other) : data(other.data), grad(other.grad), initialized(other.initialized)
+    {
+        static_assert(!Owner, "Copy constructor is only supported for non-owning tiles.");
+    }
+
+    // move constructor
+    inline CUDA_CALLABLE tile_shared_t(tile_shared_t&& other) : data(other.data), grad(other.grad), initialized(other.initialized)
+    {
+        other.data.ptr = nullptr;
+        other.grad.ptr = nullptr;
+    }
+
+    template <typename OtherT, typename OtherLayout, bool OtherOwner>
+    inline CUDA_CALLABLE tile_shared_t(const tile_shared_t<OtherT, OtherLayout, OtherOwner>& other) : data(other.data.ptr), grad(other.grad.ptr), initialized(other.initialized)
+    {
+        static_assert(!Owner, "Copy constructor is only supported for non-owning tiles.");
+        static_assert(Layout::Size == OtherLayout::Size, "Expected Size == OtherLayout::Size");
+    }
+
     // initialize from an existing tile's memory
     inline CUDA_CALLABLE tile_shared_t(T* data, T* grad=nullptr, bool initialized=true) : data(data), grad(grad), initialized(initialized)
     {
@@ -932,19 +955,47 @@ struct tile_shared_t
 
     // construct from another shared tile, this constructor
     // is invoked for reshape operations like `wp.tile_transpose()`
+    // or `wp::copy()`
     template <typename OtherT, typename OtherLayout, bool OtherOwner>
     inline CUDA_CALLABLE auto& operator=(const tile_shared_t<OtherT, OtherLayout, OtherOwner>& rhs)
     {
         // check dimensions are compatible
         static_assert(Layout::Size == OtherLayout::Size, "Expected Size == OtherLayout::Size");
 
-        // alias tile directly
-        data.ptr = rhs.data.ptr;
-        grad.ptr = rhs.grad.ptr;
-        initialized = rhs.initialized;
+
+        if (Owner)
+        {
+            // if the tile owns the data we need to copy
+            assign(rhs);
+        }
+        else
+        {
+            // alias tile directly
+            data.ptr = rhs.data.ptr;
+            grad.ptr = rhs.grad.ptr;
+            initialized = rhs.initialized;
+        }
 
         return *this;
-    }
+    }
+
+    inline CUDA_CALLABLE auto& operator=(const tile_shared_t& rhs)
+    {
+        if (Owner)
+        {
+            // if the tile owns the data we need to copy
+            assign(rhs);
+        }
+        else
+        {
+            // alias tile directly
+            data.ptr = rhs.data.ptr;
+            grad.ptr = rhs.grad.ptr;
+            initialized = rhs.initialized;
+        }
+
+        return *this;
+    }
 
     // assign from a global tile (load)
 
@@ -972,6 +1023,21 @@ struct tile_shared_t
         return *this;
     }
 
+    // define the += operator which is used during backward pass codegen
+    // when returning a register tile from a user defined function
+    template<typename OtherLayout>
+    inline CUDA_CALLABLE auto& operator += (const tile_register_t<T, OtherLayout>& rhs)
+    {
+        grad_add(rhs);
+        return *this;
+    }
+
+    inline CUDA_CALLABLE auto& operator += (const tile_shared_t<T, Layout>& rhs)
+    {
+        grad_add(rhs);
+        return *this;
+    }
+
     // in-place zero
     inline CUDA_CALLABLE void zero()
     {
@@ -1053,6 +1119,27 @@ struct tile_shared_t
         WP_TILE_SYNC();
     }
 
+    // shared tile deep copy
+    template <typename OtherT, typename OtherLayout, bool OtherOwner>
+    inline CUDA_CALLABLE void assign(const tile_shared_t<OtherT, OtherLayout, OtherOwner>& tile)
+    {
+        // check dimensions are compatible
+        static_assert(Layout::Size == OtherLayout::Size, "Expected Size == OtherLayout::Size");
+
+        if (initialized)
+            WP_TILE_SYNC();
+
+        WP_PRAGMA_UNROLL
+        for (int i=WP_TILE_THREAD_IDX; i < Layout::Size; i += WP_TILE_BLOCK_DIM)
+        {
+            auto c = Layout::coord_from_linear(i);
+            data(c) = tile.data(c);
+        }
+
+        initialized = true;
+        WP_TILE_SYNC();
+    }
+
     // in-place gradient zero
     inline CUDA_CALLABLE void grad_zero()
     {
@@ -1092,8 +1179,21 @@ struct tile_shared_t
         WP_TILE_SYNC();
     }
 
+    // accumulate gradients onto this tile from another shared tile
+    inline CUDA_CALLABLE void grad_add(const tile_shared_t<T, Layout>& tile)
+    {
+        WP_PRAGMA_UNROLL
+        for (int i=WP_TILE_THREAD_IDX; i < Layout::Size; i += WP_TILE_BLOCK_DIM)
+        {
+            auto c = Layout::coord_from_linear(i);
+            grad(c) += tile.grad(c);
+        }
+
+        WP_TILE_SYNC();
+    }
+
     // accumulate gradient onto this tile from a global array
-    CUDA_CALLABLE void grad_add(const tile_global_t<T, typename Layout::Shape>& global)
+    inline CUDA_CALLABLE void grad_add(const tile_global_t<T, typename Layout::Shape>& global)
     {
         WP_PRAGMA_UNROLL
         for (int i=WP_TILE_THREAD_IDX; i < Layout::Size; i += WP_TILE_BLOCK_DIM)
@@ -1477,9 +1577,16 @@ void tile_register_t<T, L>::print() const
 // print entry points
 template <typename T, typename L>
 inline CUDA_CALLABLE void print(const tile_register_t<T, L>& t) { t.print(); }
+
+template <typename T, typename L>
+inline CUDA_CALLABLE void adj_print(const tile_register_t<T, L>& t, const tile_register_t<T, L>& a) { a.print(); }
+
 template <typename T, typename L, bool Owner>
 inline CUDA_CALLABLE void print(const tile_shared_t<T, L, Owner>& t) { t.print(); }
 
+template <typename T, typename L, bool Owner>
+inline CUDA_CALLABLE void adj_print(const tile_shared_t<T, L, Owner>& t, const tile_shared_t<T, L, Owner>& a) { a.print(true); }
+
 template <typename T, typename L, bool O>
 inline CUDA_CALLABLE int len(const tile_shared_t<T, L, O>& t)
 {
@@ -1502,13 +1609,81 @@ inline CUDA_CALLABLE void adj_len(const tile_register_t<T,L>& t, const AdjTile&
 {
 }
 
+// select specialization for shared tiles
+template <typename C, typename T, typename LRegister, typename LShared, bool Owner>
+inline CUDA_CALLABLE auto select(const C& cond, const tile_register_t<T, LRegister>& a, const tile_shared_t<T, LShared, Owner>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? b.copy_to_register() : a;
+}
 
-template <typename T, typename L>
-inline CUDA_CALLABLE void adj_print(const tile_register_t<T, L>& t, const tile_register_t<T, L>& a) { a.print(); }
-template <typename T, typename L, bool Owner>
-inline CUDA_CALLABLE void adj_print(const tile_shared_t<T, L, Owner>& t, const tile_shared_t<T, L, Owner>& a) { a.print(true); }
+template <typename C, typename T, typename LRegister, typename LShared, bool Owner>
+inline CUDA_CALLABLE auto select(const C& cond, const tile_shared_t<T, LShared, Owner>& a, const tile_register_t<T, LRegister>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? b : a.copy_to_register();
+}
+
+template <typename C, typename T, typename L, bool Owner>
+inline CUDA_CALLABLE auto select(const C& cond, const tile_shared_t<T, L, Owner>& a, const tile_shared_t<T, L, Owner>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? tile_shared_t<T, L, false>(b.data.ptr, b.grad.ptr) : tile_shared_t<T, L, false>(a.data.ptr, a.grad.ptr);
+}
+
+template <typename C, typename T, typename L, bool LOwner, bool ROwner>
+inline CUDA_CALLABLE auto select(const C& cond, const tile_shared_t<T, L, LOwner>& a, const tile_shared_t<T, L, ROwner>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? tile_shared_t<T, L, false>(b.data.ptr, b.grad.ptr) : tile_shared_t<T, L, false>(a.data.ptr, a.grad.ptr);
+}
 
+// adj_select same as in builtin.h
 
+// where specialization for register/shared tiles
+template <typename C, typename T, typename LRegister, typename LShared, bool Owner>
+inline CUDA_CALLABLE auto where(const C& cond, const tile_register_t<T, LRegister>& a, const tile_shared_t<T, LShared, Owner>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? a : b.copy_to_register();
+}
+
+template <typename C, typename T, typename LRegister, typename LShared, bool Owner>
+inline CUDA_CALLABLE auto where(const C& cond, const tile_shared_t<T, LShared, Owner>& a, const tile_register_t<T, LRegister>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? a.copy_to_register() : b;
+}
+
+template <typename C, typename T, typename L, bool Owner>
+inline CUDA_CALLABLE auto where(const C& cond, const tile_shared_t<T, L, Owner>& a, const tile_shared_t<T, L, Owner>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? tile_shared_t<T, L, false>(a.data.ptr, a.grad.ptr) : tile_shared_t<T, L, false>(b.data.ptr, b.grad.ptr);
+}
+
+template <typename C, typename T, typename L, bool LOwner, bool ROwner>
+inline CUDA_CALLABLE auto where(const C& cond, const tile_shared_t<T, L, LOwner>& a, const tile_shared_t<T, L, ROwner>& b)
+{
+    // The double NOT operator !! casts to bool without compiler warnings.
+    return (!!cond) ? tile_shared_t<T, L, false>(a.data.ptr, a.grad.ptr) : tile_shared_t<T, L, false>(b.data.ptr, b.grad.ptr);
+}
+
+// adj_where same as in builtin.h
+
+// copy specialization for shared tiles, the lvalue this gets assigned to is owning, thus, this invokes the copy assign path
+template <typename T, typename L, bool Owner>
+inline CUDA_CALLABLE auto copy(const tile_shared_t<T, L, Owner>& t)
+{
+    return tile_shared_t<T, L, false>(t.data.ptr, t.grad.ptr);
+}
+
+template <typename T, typename L, bool Owner>
+inline CUDA_CALLABLE void adj_copy(const tile_shared_t<T, L, Owner>& src, tile_shared_t<T, L, Owner>& adj_src, tile_shared_t<T, L, Owner>& adj_dest)
+{
+    adj_src += adj_dest;
+    adj_dest.grad_zero();
+}
 
 // helpers to allocate shared tiles
 template <typename T, typename Shape, typename Strides, bool RequiresGrad>
@@ -3048,7 +3223,7 @@ template <typename Tile, typename AdjTile>
 inline CUDA_CALLABLE void adj_tile_transpose(Tile& t, Tile& adj_t, AdjTile& adj_ret)
 {
     auto a = tile_transpose(adj_ret);
-    auto b = adj_t;
+    auto& b = adj_t;
 
     adj_t.assign(tile_add(a,b));
 }
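
Note on the tile.h changes: shared tiles now have explicit copy semantics. Assignment between shared tiles deep-copies through the new assign() path when the destination tile owns its storage, and aliases the source pointers when it does not; the copy constructor is restricted to non-owning tiles to avoid double deallocation, and a move constructor transfers ownership. A minimal standalone sketch of that owner/alias dispatch follows (toy_tile is illustrative only, not Warp's actual tile_shared_t):

    // toy_tile: hypothetical stand-in for tile_shared_t. The Owner flag selects
    // between deep-copy (value) and aliasing (view) semantics in operator=,
    // mirroring the 1.9.1 change above.
    template <typename T, int Size, bool Owner>
    struct toy_tile
    {
        T* ptr = nullptr;

        template <bool OtherOwner>
        toy_tile& operator=(const toy_tile<T, Size, OtherOwner>& rhs)
        {
            if (Owner)
            {
                // owning destination: element-wise deep copy into our own storage
                for (int i = 0; i < Size; ++i)
                    ptr[i] = rhs.ptr[i];
            }
            else
            {
                // non-owning destination: alias the source storage directly
                ptr = rhs.ptr;
            }
            return *this;
        }
    };

This is also why the new copy() specialization returns a non-owning view: when that view is assigned to an owning destination tile, the copy-assignment path performs the actual deep copy.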
warp/native/vec.h CHANGED
@@ -343,17 +343,6 @@ inline CUDA_CALLABLE vec_t<Length, Type> add(vec_t<Length, Type> a, vec_t<Length
     return ret;
 }
 
-template<unsigned Length, typename Type>
-inline CUDA_CALLABLE vec_t<Length, Type> add(Type a, vec_t<Length, Type> b)
-{
-    vec_t<Length, Type> ret;
-    for( unsigned i=0; i < Length; ++i )
-    {
-        ret[i] = a + b[i];
-    }
-    return ret;
-}
-
 template<typename Type>
 inline CUDA_CALLABLE vec_t<2, Type> add(vec_t<2, Type> a, vec_t<2, Type> b)
 {
@@ -378,18 +367,6 @@ inline CUDA_CALLABLE vec_t<Length, Type> sub(vec_t<Length, Type> a, vec_t<Length
     return ret;
 }
 
-template<unsigned Length, typename Type>
-inline CUDA_CALLABLE vec_t<Length, Type> sub(Type a, vec_t<Length, Type> b)
-{
-    vec_t<Length, Type> ret;
-    for (unsigned i=0; i < Length; ++i)
-    {
-        ret[i] = Type(a - b[i]);
-    }
-
-    return ret;
-}
-
 template<typename Type>
 inline CUDA_CALLABLE vec_t<2, Type> sub(vec_t<2, Type> a, vec_t<2, Type> b)
 {
@@ -1303,21 +1280,6 @@ inline CUDA_CALLABLE void adj_add(vec_t<Length, Type> a, vec_t<Length, Type> b,
     adj_b += adj_ret;
 }
 
-template<unsigned Length, typename Type>
-inline CUDA_CALLABLE void adj_add(
-    Type a, vec_t<Length, Type> b,
-    Type& adj_a, vec_t<Length, Type>& adj_b,
-    const vec_t<Length, Type>& adj_ret
-)
-{
-    for (unsigned i = 0; i < Length; ++i)
-    {
-        adj_a += adj_ret.c[i];
-    }
-
-    adj_b += adj_ret;
-}
-
 template<typename Type>
 inline CUDA_CALLABLE void adj_add(vec_t<2, Type> a, vec_t<2, Type> b, vec_t<2, Type>& adj_a, vec_t<2, Type>& adj_b, const vec_t<2, Type>& adj_ret)
 {
@@ -1345,21 +1307,6 @@ inline CUDA_CALLABLE void adj_sub(vec_t<Length, Type> a, vec_t<Length, Type> b,
     adj_b -= adj_ret;
 }
 
-template<unsigned Length, typename Type>
-inline CUDA_CALLABLE void adj_sub(
-    Type a, vec_t<Length, Type> b,
-    Type& adj_a, vec_t<Length, Type>& adj_b,
-    const vec_t<Length, Type>& adj_ret
-)
-{
-    for (unsigned i = 0; i < Length; ++i)
-    {
-        adj_a += adj_ret.c[i];
-    }
-
-    adj_b -= adj_ret;
-}
-
 template<typename Type>
 inline CUDA_CALLABLE void adj_sub(vec_t<2, Type> a, vec_t<2, Type> b, vec_t<2, Type>& adj_a, vec_t<2, Type>& adj_b, const vec_t<2, Type>& adj_ret)
 {
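
Note on the vec.h changes: 1.9.1 removes the scalar-vector overloads of add()/sub() and their adjoints; only the component-wise vector-vector forms shown in the surrounding context remain. For reference, a simplified standalone sketch of the surviving pattern and its reverse-mode adjoint (toy_vec is illustrative only, not Warp's vec_t):

    // component-wise add and its adjoint; d(a+b)/da = d(a+b)/db = identity
    template <unsigned Length, typename Type>
    struct toy_vec
    {
        Type c[Length] = {};
    };

    template <unsigned Length, typename Type>
    toy_vec<Length, Type> add(const toy_vec<Length, Type>& a, const toy_vec<Length, Type>& b)
    {
        toy_vec<Length, Type> ret;
        for (unsigned i = 0; i < Length; ++i)
            ret.c[i] = a.c[i] + b.c[i];
        return ret;
    }

    template <unsigned Length, typename Type>
    void adj_add(const toy_vec<Length, Type>& a, const toy_vec<Length, Type>& b,
                 toy_vec<Length, Type>& adj_a, toy_vec<Length, Type>& adj_b,
                 const toy_vec<Length, Type>& adj_ret)
    {
        for (unsigned i = 0; i < Length; ++i)
        {
            adj_a.c[i] += adj_ret.c[i];  // gradient flows unchanged to both inputs
            adj_b.c[i] += adj_ret.c[i];
        }
    }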
warp/native/warp.cpp CHANGED
@@ -1078,9 +1078,9 @@ WP_API bool wp_cuda_graph_destroy(void* context, void* graph) { return false; }
 WP_API bool wp_cuda_graph_exec_destroy(void* context, void* graph_exec) { return false; }
 WP_API bool wp_capture_debug_dot_print(void* graph, const char *path, uint32_t flags) { return false; }
 
-WP_API bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret) { return false; }
-WP_API bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret) { return false; }
-WP_API bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle) { return false; }
+WP_API bool wp_cuda_graph_insert_if_else(void* context, void* stream, int arch, bool use_ptx, int* condition, void** if_graph_ret, void** else_graph_ret) { return false; }
+WP_API bool wp_cuda_graph_insert_while(void* context, void* stream, int arch, bool use_ptx, int* condition, void** body_graph_ret, uint64_t* handle_ret) { return false; }
+WP_API bool wp_cuda_graph_set_condition(void* context, void* stream, int arch, bool use_ptx, int* condition, uint64_t handle) { return false; }
 WP_API bool wp_cuda_graph_pause_capture(void* context, void* stream, void** graph_ret) { return false; }
 WP_API bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph) { return false; }
 WP_API bool wp_cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph) { return false; }
warp/native/warp.cu CHANGED
@@ -19,6 +19,7 @@
 #include "scan.h"
 #include "cuda_util.h"
 #include "error.h"
+#include "sort.h"
 
 #include <cstdlib>
 #include <fstream>
@@ -2448,6 +2449,9 @@ void wp_cuda_stream_destroy(void* context, void* stream)
 
     wp_cuda_stream_unregister(context, stream);
 
+    // release temporary radix sort buffer associated with this stream
+    radix_sort_release(context, stream);
+
     check_cu(cuStreamDestroy_f(static_cast<CUstream>(stream)));
 }
 
@@ -2811,11 +2815,12 @@ bool wp_cuda_graph_create_exec(void* context, void* stream, void* graph, void**
 // Support for conditional graph nodes available with CUDA 12.4+.
 #if CUDA_VERSION >= 12040
 
-// CUBIN data for compiled conditional modules, loaded on demand, keyed on device architecture
-static std::map<int, void*> g_conditional_cubins;
+// CUBIN or PTX data for compiled conditional modules, loaded on demand, keyed on device architecture
+using ModuleKey = std::pair<int, bool>; // <arch, use_ptx>
+static std::map<ModuleKey, void*> g_conditional_modules;
 
 // Compile module with conditional helper kernels
-static void* compile_conditional_module(int arch)
+static void* compile_conditional_module(int arch, bool use_ptx)
 {
     static const char* kernel_source = R"(
 typedef __device_builtin__ unsigned long long cudaGraphConditionalHandle;
@@ -2844,8 +2849,9 @@ static void* compile_conditional_module(int arch)
     )";
 
     // avoid recompilation
-    auto it = g_conditional_cubins.find(arch);
-    if (it != g_conditional_cubins.end())
+    ModuleKey key = {arch, use_ptx};
+    auto it = g_conditional_modules.find(key);
+    if (it != g_conditional_modules.end())
         return it->second;
 
     nvrtcProgram prog;
@@ -2853,11 +2859,23 @@ static void* compile_conditional_module(int arch)
         return NULL;
 
     char arch_opt[128];
-    snprintf(arch_opt, sizeof(arch_opt), "--gpu-architecture=sm_%d", arch);
+    if (use_ptx)
+        snprintf(arch_opt, sizeof(arch_opt), "--gpu-architecture=compute_%d", arch);
+    else
+        snprintf(arch_opt, sizeof(arch_opt), "--gpu-architecture=sm_%d", arch);
 
     std::vector<const char*> opts;
     opts.push_back(arch_opt);
 
+    const bool print_debug = (std::getenv("WARP_DEBUG") != nullptr);
+    if (print_debug)
+    {
+        printf("NVRTC options (conditional module, arch=%d, use_ptx=%s):\n", arch, use_ptx ? "true" : "false");
+        for(auto o: opts) {
+            printf("%s\n", o);
+        }
+    }
+
     if (!check_nvrtc(nvrtcCompileProgram(prog, int(opts.size()), opts.data())))
     {
         size_t log_size;
@@ -2874,23 +2892,37 @@ static void* compile_conditional_module(int arch)
     // get output
     char* output = NULL;
     size_t output_size = 0;
-    check_nvrtc(nvrtcGetCUBINSize(prog, &output_size));
-    if (output_size > 0)
+
+    if (use_ptx)
+    {
+        check_nvrtc(nvrtcGetPTXSize(prog, &output_size));
+        if (output_size > 0)
+        {
+            output = new char[output_size];
+            if (check_nvrtc(nvrtcGetPTX(prog, output)))
+                g_conditional_modules[key] = output;
+        }
+    }
+    else
     {
-        output = new char[output_size];
-        if (check_nvrtc(nvrtcGetCUBIN(prog, output)))
-            g_conditional_cubins[arch] = output;
+        check_nvrtc(nvrtcGetCUBINSize(prog, &output_size));
+        if (output_size > 0)
+        {
+            output = new char[output_size];
+            if (check_nvrtc(nvrtcGetCUBIN(prog, output)))
+                g_conditional_modules[key] = output;
+        }
     }
 
     nvrtcDestroyProgram(&prog);
 
-    // return CUBIN data
+    // return CUBIN or PTX data
    return output;
 }
 
 
 // Load module with conditional helper kernels
-static CUmodule load_conditional_module(void* context)
+static CUmodule load_conditional_module(void* context, int arch, bool use_ptx)
 {
     ContextInfo* context_info = get_context_info(context);
     if (!context_info)
@@ -2900,17 +2932,15 @@ static CUmodule load_conditional_module(void* context)
     if (context_info->conditional_module)
         return context_info->conditional_module;
 
-    int arch = context_info->device_info->arch;
-
     // compile if needed
-    void* compiled_module = compile_conditional_module(arch);
+    void* compiled_module = compile_conditional_module(arch, use_ptx);
     if (!compiled_module)
     {
         fprintf(stderr, "Warp error: Failed to compile conditional kernels\n");
         return NULL;
     }
 
-    // load module
+    // load module (handles both PTX and CUBIN data automatically)
     CUmodule module = NULL;
     if (!check_cu(cuModuleLoadDataEx_f(&module, compiled_module, 0, NULL, NULL)))
     {
@@ -2923,10 +2953,10 @@ static CUmodule load_conditional_module(void* context)
     return module;
 }
 
-static CUfunction get_conditional_kernel(void* context, const char* name)
+static CUfunction get_conditional_kernel(void* context, int arch, bool use_ptx, const char* name)
 {
     // load module if needed
-    CUmodule module = load_conditional_module(context);
+    CUmodule module = load_conditional_module(context, arch, use_ptx);
     if (!module)
         return NULL;
 
@@ -2976,7 +3006,7 @@ bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
 // https://developer.nvidia.com/blog/dynamic-control-flow-in-cuda-graphs-with-conditional-nodes/
 // condition is a gpu pointer
 // if_graph_ret and else_graph_ret should be NULL if not needed
-bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
+bool wp_cuda_graph_insert_if_else(void* context, void* stream, int arch, bool use_ptx, int* condition, void** if_graph_ret, void** else_graph_ret)
 {
     bool has_if = if_graph_ret != NULL;
     bool has_else = else_graph_ret != NULL;
@@ -3019,9 +3049,9 @@ bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, v
     // (need to negate the condition if only the else branch is used)
     CUfunction kernel;
     if (has_if)
-        kernel = get_conditional_kernel(context, "set_conditional_if_handle_kernel");
+        kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_handle_kernel");
     else
-        kernel = get_conditional_kernel(context, "set_conditional_else_handle_kernel");
+        kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_else_handle_kernel");
 
     if (!kernel)
     {
@@ -3072,7 +3102,7 @@ bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, v
     check_cuda(cudaGraphConditionalHandleCreate(&if_handle, cuda_graph));
     check_cuda(cudaGraphConditionalHandleCreate(&else_handle, cuda_graph));
 
-    CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_else_handles_kernel");
+    CUfunction kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_else_handles_kernel");
     if (!kernel)
     {
         wp::set_error_string("Failed to get built-in conditional kernel");
@@ -3273,7 +3303,7 @@ bool wp_cuda_graph_insert_child_graph(void* context, void* stream, void* child_g
     return true;
 }
 
-bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
+bool wp_cuda_graph_insert_while(void* context, void* stream, int arch, bool use_ptx, int* condition, void** body_graph_ret, uint64_t* handle_ret)
 {
     // if there's no body, it's a no-op
     if (!body_graph_ret)
@@ -3303,7 +3333,7 @@ bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, voi
         return false;
 
     // launch a kernel to set the condition handle from condition pointer
-    CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_handle_kernel");
+    CUfunction kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_handle_kernel");
     if (!kernel)
     {
         wp::set_error_string("Failed to get built-in conditional kernel");
@@ -3339,14 +3369,14 @@ bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, voi
     return true;
 }
 
-bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
+bool wp_cuda_graph_set_condition(void* context, void* stream, int arch, bool use_ptx, int* condition, uint64_t handle)
 {
     ContextGuard guard(context);
 
     CUstream cuda_stream = static_cast<CUstream>(stream);
 
     // launch a kernel to set the condition handle from condition pointer
-    CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_handle_kernel");
+    CUfunction kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_handle_kernel");
     if (!kernel)
     {
         wp::set_error_string("Failed to get built-in conditional kernel");
@@ -3378,19 +3408,19 @@ bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
     return false;
 }
 
-bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
+bool wp_cuda_graph_insert_if_else(void* context, void* stream, int arch, bool use_ptx, int* condition, void** if_graph_ret, void** else_graph_ret)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
 }
 
-bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
+bool wp_cuda_graph_insert_while(void* context, void* stream, int arch, bool use_ptx, int* condition, void** body_graph_ret, uint64_t* handle_ret)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
 }
 
-bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
+bool wp_cuda_graph_set_condition(void* context, void* stream, int arch, bool use_ptx, int* condition, uint64_t handle)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
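
Note on the warp.cu changes: the compiled conditional-kernel helper module is now cached per (arch, use_ptx) pair instead of per architecture alone, and NVRTC targets compute_XX (PTX) or sm_XX (CUBIN) depending on the flag; cuModuleLoadDataEx accepts either form. A minimal standalone sketch of the keyed caching pattern (compile() below is a hypothetical stand-in for the NVRTC call in compile_conditional_module()):

    #include <map>
    #include <string>
    #include <utility>

    // hypothetical stand-in for the NVRTC compilation step
    static std::string compile(int arch, bool use_ptx)
    {
        return std::string(use_ptx ? "compute_" : "sm_") + std::to_string(arch);
    }

    // cache keyed on <arch, use_ptx>, mirroring g_conditional_modules above
    using ModuleKey = std::pair<int, bool>;
    static std::map<ModuleKey, std::string> g_module_cache;

    static const std::string& get_module(int arch, bool use_ptx)
    {
        ModuleKey key = {arch, use_ptx};
        auto it = g_module_cache.find(key);
        if (it != g_module_cache.end())
            return it->second;  // reuse previously compiled output
        return g_module_cache[key] = compile(arch, use_ptx);
    }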
warp/native/warp.h CHANGED
@@ -314,9 +314,9 @@ extern "C"
     WP_API bool wp_cuda_graph_exec_destroy(void* context, void* graph_exec);
     WP_API bool wp_capture_debug_dot_print(void* graph, const char *path, uint32_t flags);
 
-    WP_API bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret);
-    WP_API bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret);
-    WP_API bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle);
+    WP_API bool wp_cuda_graph_insert_if_else(void* context, void* stream, int arch, bool use_ptx, int* condition, void** if_graph_ret, void** else_graph_ret);
+    WP_API bool wp_cuda_graph_insert_while(void* context, void* stream, int arch, bool use_ptx, int* condition, void** body_graph_ret, uint64_t* handle_ret);
+    WP_API bool wp_cuda_graph_set_condition(void* context, void* stream, int arch, bool use_ptx, int* condition, uint64_t handle);
     WP_API bool wp_cuda_graph_pause_capture(void* context, void* stream, void** graph_ret);
     WP_API bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph);
     WP_API bool wp_cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph);
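
Note on the warp.h changes: the exported conditional-graph entry points now take the target architecture and a PTX flag explicitly rather than deriving them inside the library. A hedged sketch of a call site under the new signature (the arch value and use_ptx choice below are illustrative; in practice these come from Warp's Python runtime):

    #include "warp.h"  // declares wp_cuda_graph_insert_if_else with the 1.9.1 signature

    // condition must be a device pointer to an int (see the comments in warp.cu);
    // error handling is omitted for brevity.
    bool insert_branch(void* context, void* stream, int* condition,
                       void** if_graph, void** else_graph)
    {
        const int arch = 86;        // e.g. sm_86 / compute_86 (illustrative value)
        const bool use_ptx = true;  // emit PTX so the driver can JIT for newer GPUs
        return wp_cuda_graph_insert_if_else(context, stream, arch, use_ptx,
                                            condition, if_graph, else_graph);
    }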