warp-lang 1.7.2-py3-none-macosx_10_13_universal2.whl → 1.8.0-py3-none-macosx_10_13_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warp/__init__.py +3 -1
- warp/__init__.pyi +3489 -1
- warp/autograd.py +45 -122
- warp/bin/libwarp-clang.dylib +0 -0
- warp/bin/libwarp.dylib +0 -0
- warp/build.py +241 -252
- warp/build_dll.py +125 -26
- warp/builtins.py +1907 -384
- warp/codegen.py +257 -101
- warp/config.py +12 -1
- warp/constants.py +1 -1
- warp/context.py +657 -223
- warp/dlpack.py +1 -1
- warp/examples/benchmarks/benchmark_cloth.py +2 -2
- warp/examples/benchmarks/benchmark_tile_sort.py +155 -0
- warp/examples/core/example_sample_mesh.py +1 -1
- warp/examples/core/example_spin_lock.py +93 -0
- warp/examples/core/example_work_queue.py +118 -0
- warp/examples/fem/example_adaptive_grid.py +5 -5
- warp/examples/fem/example_apic_fluid.py +1 -1
- warp/examples/fem/example_burgers.py +1 -1
- warp/examples/fem/example_convection_diffusion.py +9 -6
- warp/examples/fem/example_darcy_ls_optimization.py +489 -0
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_diffusion.py +2 -2
- warp/examples/fem/example_diffusion_3d.py +1 -1
- warp/examples/fem/example_distortion_energy.py +1 -1
- warp/examples/fem/example_elastic_shape_optimization.py +387 -0
- warp/examples/fem/example_magnetostatics.py +5 -3
- warp/examples/fem/example_mixed_elasticity.py +5 -3
- warp/examples/fem/example_navier_stokes.py +11 -9
- warp/examples/fem/example_nonconforming_contact.py +5 -3
- warp/examples/fem/example_streamlines.py +8 -3
- warp/examples/fem/utils.py +9 -8
- warp/examples/interop/example_jax_ffi_callback.py +2 -2
- warp/examples/optim/example_drone.py +1 -1
- warp/examples/sim/example_cloth.py +1 -1
- warp/examples/sim/example_cloth_self_contact.py +48 -54
- warp/examples/tile/example_tile_block_cholesky.py +502 -0
- warp/examples/tile/example_tile_cholesky.py +2 -1
- warp/examples/tile/example_tile_convolution.py +1 -1
- warp/examples/tile/example_tile_filtering.py +1 -1
- warp/examples/tile/example_tile_matmul.py +1 -1
- warp/examples/tile/example_tile_mlp.py +2 -0
- warp/fabric.py +7 -7
- warp/fem/__init__.py +5 -0
- warp/fem/adaptivity.py +1 -1
- warp/fem/cache.py +152 -63
- warp/fem/dirichlet.py +2 -2
- warp/fem/domain.py +136 -6
- warp/fem/field/field.py +141 -99
- warp/fem/field/nodal_field.py +85 -39
- warp/fem/field/virtual.py +97 -52
- warp/fem/geometry/adaptive_nanogrid.py +91 -86
- warp/fem/geometry/closest_point.py +13 -0
- warp/fem/geometry/deformed_geometry.py +102 -40
- warp/fem/geometry/element.py +56 -2
- warp/fem/geometry/geometry.py +323 -22
- warp/fem/geometry/grid_2d.py +157 -62
- warp/fem/geometry/grid_3d.py +116 -20
- warp/fem/geometry/hexmesh.py +86 -20
- warp/fem/geometry/nanogrid.py +166 -86
- warp/fem/geometry/partition.py +59 -25
- warp/fem/geometry/quadmesh.py +86 -135
- warp/fem/geometry/tetmesh.py +47 -119
- warp/fem/geometry/trimesh.py +77 -270
- warp/fem/integrate.py +107 -52
- warp/fem/linalg.py +25 -58
- warp/fem/operator.py +124 -27
- warp/fem/quadrature/pic_quadrature.py +36 -14
- warp/fem/quadrature/quadrature.py +40 -16
- warp/fem/space/__init__.py +1 -1
- warp/fem/space/basis_function_space.py +66 -46
- warp/fem/space/basis_space.py +17 -4
- warp/fem/space/dof_mapper.py +1 -1
- warp/fem/space/function_space.py +2 -2
- warp/fem/space/grid_2d_function_space.py +4 -1
- warp/fem/space/hexmesh_function_space.py +4 -2
- warp/fem/space/nanogrid_function_space.py +3 -1
- warp/fem/space/partition.py +11 -2
- warp/fem/space/quadmesh_function_space.py +4 -1
- warp/fem/space/restriction.py +5 -2
- warp/fem/space/shape/__init__.py +10 -8
- warp/fem/space/tetmesh_function_space.py +4 -1
- warp/fem/space/topology.py +52 -21
- warp/fem/space/trimesh_function_space.py +4 -1
- warp/fem/utils.py +53 -8
- warp/jax.py +1 -2
- warp/jax_experimental/ffi.py +12 -17
- warp/jax_experimental/xla_ffi.py +37 -24
- warp/math.py +171 -1
- warp/native/array.h +99 -0
- warp/native/builtin.h +174 -31
- warp/native/coloring.cpp +1 -1
- warp/native/exports.h +118 -63
- warp/native/intersect.h +3 -3
- warp/native/mat.h +5 -10
- warp/native/mathdx.cpp +11 -5
- warp/native/matnn.h +1 -123
- warp/native/quat.h +28 -4
- warp/native/sparse.cpp +121 -258
- warp/native/sparse.cu +181 -274
- warp/native/spatial.h +305 -17
- warp/native/tile.h +583 -72
- warp/native/tile_radix_sort.h +1108 -0
- warp/native/tile_reduce.h +237 -2
- warp/native/tile_scan.h +240 -0
- warp/native/tuple.h +189 -0
- warp/native/vec.h +6 -16
- warp/native/warp.cpp +36 -4
- warp/native/warp.cu +574 -51
- warp/native/warp.h +47 -74
- warp/optim/linear.py +5 -1
- warp/paddle.py +7 -8
- warp/py.typed +0 -0
- warp/render/render_opengl.py +58 -29
- warp/render/render_usd.py +124 -61
- warp/sim/__init__.py +9 -0
- warp/sim/collide.py +252 -78
- warp/sim/graph_coloring.py +8 -1
- warp/sim/import_mjcf.py +4 -3
- warp/sim/import_usd.py +11 -7
- warp/sim/integrator.py +5 -2
- warp/sim/integrator_euler.py +1 -1
- warp/sim/integrator_featherstone.py +1 -1
- warp/sim/integrator_vbd.py +751 -320
- warp/sim/integrator_xpbd.py +1 -1
- warp/sim/model.py +265 -260
- warp/sim/utils.py +10 -7
- warp/sparse.py +303 -166
- warp/tape.py +52 -51
- warp/tests/cuda/test_conditional_captures.py +1046 -0
- warp/tests/cuda/test_streams.py +1 -1
- warp/tests/geometry/test_volume.py +2 -2
- warp/tests/interop/test_dlpack.py +9 -9
- warp/tests/interop/test_jax.py +0 -1
- warp/tests/run_coverage_serial.py +1 -1
- warp/tests/sim/disabled_kinematics.py +2 -2
- warp/tests/sim/{test_vbd.py → test_cloth.py} +296 -113
- warp/tests/sim/test_collision.py +159 -51
- warp/tests/sim/test_coloring.py +15 -1
- warp/tests/test_array.py +254 -2
- warp/tests/test_array_reduce.py +2 -2
- warp/tests/test_atomic_cas.py +299 -0
- warp/tests/test_codegen.py +142 -19
- warp/tests/test_conditional.py +47 -1
- warp/tests/test_ctypes.py +0 -20
- warp/tests/test_devices.py +8 -0
- warp/tests/test_fabricarray.py +4 -2
- warp/tests/test_fem.py +58 -25
- warp/tests/test_func.py +42 -1
- warp/tests/test_grad.py +1 -1
- warp/tests/test_lerp.py +1 -3
- warp/tests/test_map.py +481 -0
- warp/tests/test_mat.py +1 -24
- warp/tests/test_quat.py +6 -15
- warp/tests/test_rounding.py +10 -38
- warp/tests/test_runlength_encode.py +7 -7
- warp/tests/test_smoothstep.py +1 -1
- warp/tests/test_sparse.py +51 -2
- warp/tests/test_spatial.py +507 -1
- warp/tests/test_struct.py +2 -2
- warp/tests/test_tuple.py +265 -0
- warp/tests/test_types.py +2 -2
- warp/tests/test_utils.py +24 -18
- warp/tests/tile/test_tile.py +420 -1
- warp/tests/tile/test_tile_mathdx.py +518 -14
- warp/tests/tile/test_tile_reduce.py +213 -0
- warp/tests/tile/test_tile_shared_memory.py +130 -1
- warp/tests/tile/test_tile_sort.py +117 -0
- warp/tests/unittest_suites.py +4 -6
- warp/types.py +462 -308
- warp/utils.py +647 -86
- {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/METADATA +20 -6
- {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/RECORD +178 -166
- warp/stubs.py +0 -3381
- warp/tests/sim/test_xpbd.py +0 -399
- warp/tests/test_mlp.py +0 -282
- {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/top_level.txt +0 -0
warp/native/warp.cu
CHANGED

```diff
@@ -27,6 +27,9 @@
 #if WP_ENABLE_MATHDX
 #include <nvJitLink.h>
 #include <libmathdx.h>
+#include <libcublasdx.h>
+#include <libcufftdx.h>
+#include <libcusolverdx.h>
 #endif
 
 #include <array>
```
```diff
@@ -155,6 +158,7 @@ struct DeviceInfo
     int arch = 0;
     int is_uva = 0;
     int is_mempool_supported = 0;
+    int sm_count = 0;
     int is_ipc_supported = -1;
     int max_smem_bytes = 0;
     CUcontext primary_context = NULL;
```
```diff
@@ -166,6 +170,9 @@ struct ContextInfo
 
     // the current stream, managed from Python (see cuda_context_set_stream() and cuda_context_get_stream())
     CUstream stream = NULL;
+
+    // conditional graph node support, loaded on demand if the driver supports it (CUDA 12.4+)
+    CUmodule conditional_module = NULL;
 };
 
 struct CaptureInfo
```
```diff
@@ -280,6 +287,7 @@ int cuda_init()
         check_cu(cuDeviceGetAttribute_f(&g_devices[i].pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, device));
         check_cu(cuDeviceGetAttribute_f(&g_devices[i].is_uva, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, device));
         check_cu(cuDeviceGetAttribute_f(&g_devices[i].is_mempool_supported, CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, device));
+        check_cu(cuDeviceGetAttribute_f(&g_devices[i].sm_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
 #ifdef CUDA_VERSION
 #if CUDA_VERSION >= 12000
         int device_attribute_integrated = 0;
```
```diff
@@ -1786,6 +1794,13 @@ int cuda_device_get_arch(int ordinal)
     return 0;
 }
 
+int cuda_device_get_sm_count(int ordinal)
+{
+    if (ordinal >= 0 && ordinal < int(g_devices.size()))
+        return g_devices[ordinal].sm_count;
+    return 0;
+}
+
 void cuda_device_get_uuid(int ordinal, char uuid[16])
 {
     memcpy(uuid, g_devices[ordinal].uuid.bytes, sizeof(char)*16);
```
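The new `sm_count` plumbing caches a standard driver attribute at init time so `cuda_device_get_sm_count()` never has to touch the driver. A minimal standalone sketch of the same query (plain CUDA driver API, not Warp code):

```cpp
#include <cstdio>
#include <cuda.h>

// Standalone sketch of the attribute query that cuda_init() now caches per
// device and cuda_device_get_sm_count() serves from DeviceInfo.
int main()
{
    cuInit(0);

    CUdevice dev;
    cuDeviceGet(&dev, 0);

    int sm_count = 0;
    cuDeviceGetAttribute(&sm_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
    printf("SMs on device 0: %d\n", sm_count);
    return 0;
}
```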
```diff
@@ -2034,6 +2049,9 @@ void cuda_context_destroy(void* context)
         if (info->stream)
             check_cu(cuStreamDestroy_f(info->stream));
 
+        if (info->conditional_module)
+            check_cu(cuModuleUnload_f(info->conditional_module));
+
         g_contexts.erase(ctx);
     }
 
```
```diff
@@ -2739,22 +2757,10 @@ bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
     if (external)
         return true;
 
-    cudaGraphExec_t graph_exec = NULL;
-
     // end the capture
     if (!check_cuda(cudaStreamEndCapture(cuda_stream, &graph)))
         return false;
 
-    // enable to create debug GraphVis visualization of graph
-    // cudaGraphDebugDotPrint(graph, "graph.dot", cudaGraphDebugDotFlagsVerbose);
-
-    // can use after CUDA 11.4 to permit graphs to capture cudaMallocAsync() operations
-    if (!check_cuda(cudaGraphInstantiateWithFlags(&graph_exec, graph, cudaGraphInstantiateFlagAutoFreeOnLaunch)))
-        return false;
-
-    // free source graph
-    check_cuda(cudaGraphDestroy(graph));
-
     // process deferred free list if no more captures are ongoing
     if (g_captures.empty())
     {
```
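This hunk stops fusing capture-end with instantiation: `cuda_graph_end_capture()` now hands back the raw `cudaGraph_t`, while instantiation and destruction become separate entry points (`cuda_graph_create_exec()`, `cuda_graph_destroy()`, and `cuda_graph_exec_destroy()` in the hunks below). A minimal runtime-API sketch of the resulting two-step flow, assuming `stream` is already mid-capture (illustrative only, not Warp code):

```cpp
#include <cuda_runtime.h>

// `stream` is assumed to be in an active capture when this is called.
bool finish_capture(cudaStream_t stream)
{
    // Step 1: end capture and keep the bare graph around.
    cudaGraph_t graph = nullptr;
    if (cudaStreamEndCapture(stream, &graph) != cudaSuccess)
        return false;

    // Optional debug step the old fused path could not offer: dump a GraphViz
    // .dot file, which is what the new capture_debug_dot_print() wraps.
    cudaGraphDebugDotPrint(graph, "graph.dot", cudaGraphDebugDotFlagsVerbose);

    // Step 2: instantiate on demand, which is what cuda_graph_create_exec() wraps.
    cudaGraphExec_t exec = nullptr;
    if (cudaGraphInstantiateWithFlags(&exec, graph,
            cudaGraphInstantiateFlagAutoFreeOnLaunch) != cudaSuccess)
        return false;

    cudaGraphLaunch(exec, stream);
    cudaStreamSynchronize(stream);

    // Destruction is split the same way (cuda_graph_destroy / cuda_graph_exec_destroy).
    cudaGraphDestroy(graph);
    cudaGraphExecDestroy(exec);
    return true;
}
```

Keeping the source graph alive is what makes the conditional-node and child-graph insertion below possible, since those APIs need to splice subgraphs into a graph that has not yet been instantiated.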
```diff
@@ -2763,11 +2769,503 @@
     }
 
     if (graph_ret)
-        *graph_ret =
+        *graph_ret = graph;
+
+    return true;
+}
+
+bool capture_debug_dot_print(void* graph, const char *path, uint32_t flags)
+{
+    if (!check_cuda(cudaGraphDebugDotPrint((cudaGraph_t)graph, path, flags)))
+        return false;
+    return true;
+}
+
+bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret)
+{
+    ContextGuard guard(context);
+
+    cudaGraphExec_t graph_exec = NULL;
+    if (!check_cuda(cudaGraphInstantiateWithFlags(&graph_exec, (cudaGraph_t)graph, cudaGraphInstantiateFlagAutoFreeOnLaunch)))
+        return false;
+
+    if (graph_exec_ret)
+        *graph_exec_ret = graph_exec;
+
+    return true;
+}
+
+// Support for conditional graph nodes available with CUDA 12.4+.
+#if CUDA_VERSION >= 12040
+
+// CUBIN data for compiled conditional modules, loaded on demand, keyed on device architecture
+static std::map<int, void*> g_conditional_cubins;
+
+// Compile module with conditional helper kernels
+static void* compile_conditional_module(int arch)
+{
+    static const char* kernel_source = R"(
+typedef __device_builtin__ unsigned long long cudaGraphConditionalHandle;
+extern "C" __device__ __cudart_builtin__ void cudaGraphSetConditional(cudaGraphConditionalHandle handle, unsigned int value);
+
+extern "C" __global__ void set_conditional_if_handle_kernel(cudaGraphConditionalHandle handle, int* value)
+{
+    if (threadIdx.x + blockIdx.x * blockDim.x == 0)
+        cudaGraphSetConditional(handle, *value);
+}
+
+extern "C" __global__ void set_conditional_else_handle_kernel(cudaGraphConditionalHandle handle, int* value)
+{
+    if (threadIdx.x + blockIdx.x * blockDim.x == 0)
+        cudaGraphSetConditional(handle, !*value);
+}
+
+extern "C" __global__ void set_conditional_if_else_handles_kernel(cudaGraphConditionalHandle if_handle, cudaGraphConditionalHandle else_handle, int* value)
+{
+    if (threadIdx.x + blockIdx.x * blockDim.x == 0)
+    {
+        cudaGraphSetConditional(if_handle, *value);
+        cudaGraphSetConditional(else_handle, !*value);
+    }
+}
+)";
+
+    // avoid recompilation
+    auto it = g_conditional_cubins.find(arch);
+    if (it != g_conditional_cubins.end())
+        return it->second;
+
+    nvrtcProgram prog;
+    if (!check_nvrtc(nvrtcCreateProgram(&prog, kernel_source, "conditional_kernels", 0, NULL, NULL)))
+        return NULL;
+
+    char arch_opt[128];
+    snprintf(arch_opt, sizeof(arch_opt), "--gpu-architecture=sm_%d", arch);
+
+    std::vector<const char*> opts;
+    opts.push_back(arch_opt);
+
+    if (!check_nvrtc(nvrtcCompileProgram(prog, int(opts.size()), opts.data())))
+    {
+        size_t log_size;
+        if (check_nvrtc(nvrtcGetProgramLogSize(prog, &log_size)))
+        {
+            std::vector<char> log(log_size);
+            if (check_nvrtc(nvrtcGetProgramLog(prog, log.data())))
+                fprintf(stderr, "%s", log.data());
+        }
+        nvrtcDestroyProgram(&prog);
+        return NULL;
+    }
+
+    // get output
+    char* output = NULL;
+    size_t output_size = 0;
+    check_nvrtc(nvrtcGetCUBINSize(prog, &output_size));
+    if (output_size > 0)
+    {
+        output = new char[output_size];
+        if (check_nvrtc(nvrtcGetCUBIN(prog, output)))
+            g_conditional_cubins[arch] = output;
+    }
+
+    nvrtcDestroyProgram(&prog);
+
+    // return CUBIN data
+    return output;
+}
+
+
+// Load module with conditional helper kernels
+static CUmodule load_conditional_module(void* context)
+{
+    ContextInfo* context_info = get_context_info(context);
+    if (!context_info)
+        return NULL;
+
+    // check if already loaded
+    if (context_info->conditional_module)
+        return context_info->conditional_module;
+
+    int arch = context_info->device_info->arch;
+
+    // compile if needed
+    void* compiled_module = compile_conditional_module(arch);
+    if (!compiled_module)
+    {
+        fprintf(stderr, "Warp error: Failed to compile conditional kernels\n");
+        return NULL;
+    }
+
+    // load module
+    CUmodule module = NULL;
+    if (!check_cu(cuModuleLoadDataEx_f(&module, compiled_module, 0, NULL, NULL)))
+    {
+        fprintf(stderr, "Warp error: Failed to load conditional kernels module\n");
+        return NULL;
+    }
+
+    context_info->conditional_module = module;
+
+    return module;
+}
+
+static CUfunction get_conditional_kernel(void* context, const char* name)
+{
+    // load module if needed
+    CUmodule module = load_conditional_module(context);
+    if (!module)
+        return NULL;
+
+    CUfunction kernel;
+    if (!check_cu(cuModuleGetFunction_f(&kernel, module, name)))
+    {
+        fprintf(stderr, "Warp error: Failed to get kernel %s\n", name);
+        return NULL;
+    }
+
+    return kernel;
+}
+
+bool cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
+{
+    ContextGuard guard(context);
+
+    CUstream cuda_stream = static_cast<CUstream>(stream);
+    if (!check_cuda(cudaStreamEndCapture(cuda_stream, (cudaGraph_t*)graph_ret)))
+        return false;
+    return true;
+}
+
+bool cuda_graph_resume_capture(void* context, void* stream, void* graph)
+{
+    ContextGuard guard(context);
+
+    CUstream cuda_stream = static_cast<CUstream>(stream);
+    cudaGraph_t cuda_graph = static_cast<cudaGraph_t>(graph);
+
+    std::vector<cudaGraphNode_t> leaf_nodes;
+    if (!get_graph_leaf_nodes(cuda_graph, leaf_nodes))
+        return false;
+
+    if (!check_cuda(cudaStreamBeginCaptureToGraph(cuda_stream,
+                                                  cuda_graph,
+                                                  leaf_nodes.data(),
+                                                  nullptr,
+                                                  leaf_nodes.size(),
+                                                  cudaStreamCaptureModeGlobal)))
+        return false;
+
+    return true;
+}
+
+// https://developer.nvidia.com/blog/constructing-cuda-graphs-with-dynamic-parameters/#combined_approach
+// https://developer.nvidia.com/blog/dynamic-control-flow-in-cuda-graphs-with-conditional-nodes/
+// condition is a gpu pointer
+// if_graph_ret and else_graph_ret should be NULL if not needed
+bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
+{
+    bool has_if = if_graph_ret != NULL;
+    bool has_else = else_graph_ret != NULL;
+    int num_branches = int(has_if) + int(has_else);
+
+    // if neither the IF nor ELSE branches are required, it's a no-op
+    if (num_branches == 0)
+        return true;
+
+    ContextGuard guard(context);
+
+    CUstream cuda_stream = static_cast<CUstream>(stream);
+
+    // Get the current stream capturing graph
+    cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone;
+    cudaGraph_t cuda_graph = NULL;
+    const cudaGraphNode_t* capture_deps = NULL;
+    size_t dep_count = 0;
+    if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
+        return false;
+
+    // abort if not capturing
+    if (!cuda_graph || capture_status != cudaStreamCaptureStatusActive)
+    {
+        wp::set_error_string("Stream is not capturing");
+        return false;
+    }
+
+    //int driver_version = cuda_driver_version();
+
+    // IF-ELSE nodes are only supported with CUDA 12.8+
+    // Somehow child graphs produce wrong results when an else branch is used
+    // Seems to be a bug in the CUDA driver: https://nvbugs/5241330
+    if (num_branches == 1 /*|| driver_version >= 12080*/)
+    {
+        cudaGraphConditionalHandle handle;
+        cudaGraphConditionalHandleCreate(&handle, cuda_graph);
+
+        // run a kernel to set the condition handle from the condition pointer
+        // (need to negate the condition if only the else branch is used)
+        CUfunction kernel;
+        if (has_if)
+            kernel = get_conditional_kernel(context, "set_conditional_if_handle_kernel");
+        else
+            kernel = get_conditional_kernel(context, "set_conditional_else_handle_kernel");
+
+        if (!kernel)
+        {
+            wp::set_error_string("Failed to get built-in conditional kernel");
+            return false;
+        }
+
+        void* kernel_args[2];
+        kernel_args[0] = &handle;
+        kernel_args[1] = &condition;
+
+        if (!check_cuda(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
+            return false;
+
+        if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
+            return false;
+
+        // create conditional node
+        cudaGraphNode_t condition_node;
+        cudaGraphNodeParams condition_params = { cudaGraphNodeTypeConditional };
+        condition_params.conditional.handle = handle;
+        condition_params.conditional.type = cudaGraphCondTypeIf;
+        condition_params.conditional.size = num_branches;
+        if (!check_cuda(cudaGraphAddNode(&condition_node, cuda_graph, capture_deps, dep_count, &condition_params)))
+            return false;
+
+        if (!check_cuda(cudaStreamUpdateCaptureDependencies(cuda_stream, &condition_node, 1, cudaStreamSetCaptureDependencies)))
+            return false;
+
+        if (num_branches == 1)
+        {
+            if (has_if)
+                *if_graph_ret = condition_params.conditional.phGraph_out[0];
+            else
+                *else_graph_ret = condition_params.conditional.phGraph_out[0];
+        }
+        else
+        {
+            *if_graph_ret = condition_params.conditional.phGraph_out[0];
+            *else_graph_ret = condition_params.conditional.phGraph_out[1];
+        }
+    }
+    else
+    {
+        // Create IF node followed by an additional IF node with negated condition
+        cudaGraphConditionalHandle if_handle, else_handle;
+        cudaGraphConditionalHandleCreate(&if_handle, cuda_graph);
+        cudaGraphConditionalHandleCreate(&else_handle, cuda_graph);
+
+        CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_else_handles_kernel");
+        if (!kernel)
+        {
+            wp::set_error_string("Failed to get built-in conditional kernel");
+            return false;
+        }
+
+        void* kernel_args[3];
+        kernel_args[0] = &if_handle;
+        kernel_args[1] = &else_handle;
+        kernel_args[2] = &condition;
+
+        if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
+            return false;
+
+        if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
+            return false;
+
+        cudaGraphNode_t if_node;
+        cudaGraphNodeParams if_params = { cudaGraphNodeTypeConditional };
+        if_params.conditional.handle = if_handle;
+        if_params.conditional.type = cudaGraphCondTypeIf;
+        if_params.conditional.size = 1;
+        if (!check_cuda(cudaGraphAddNode(&if_node, cuda_graph, capture_deps, dep_count, &if_params)))
+            return false;
+
+        cudaGraphNode_t else_node;
+        cudaGraphNodeParams else_params = { cudaGraphNodeTypeConditional };
+        else_params.conditional.handle = else_handle;
+        else_params.conditional.type = cudaGraphCondTypeIf;
+        else_params.conditional.size = 1;
+        if (!check_cuda(cudaGraphAddNode(&else_node, cuda_graph, &if_node, 1, &else_params)))
+            return false;
+
+        if (!check_cuda(cudaStreamUpdateCaptureDependencies(cuda_stream, &else_node, 1, cudaStreamSetCaptureDependencies)))
+            return false;
+
+        *if_graph_ret = if_params.conditional.phGraph_out[0];
+        *else_graph_ret = else_params.conditional.phGraph_out[0];
+    }
+
+    return true;
+}
+
+bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
+{
+    ContextGuard guard(context);
+
+    CUstream cuda_stream = static_cast<CUstream>(stream);
+
+    // Get the current stream capturing graph
+    cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone;
+    void* cuda_graph = NULL;
+    const cudaGraphNode_t* capture_deps = NULL;
+    size_t dep_count = 0;
+    if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, (cudaGraph_t*)&cuda_graph, &capture_deps, &dep_count)))
+        return false;
+
+    if (!cuda_graph_pause_capture(context, cuda_stream, &cuda_graph))
+        return false;
+
+    cudaGraphNode_t body_node;
+    if (!check_cuda(cudaGraphAddChildGraphNode(&body_node,
+                                               static_cast<cudaGraph_t>(cuda_graph),
+                                               capture_deps, dep_count,
+                                               static_cast<cudaGraph_t>(child_graph))))
+        return false;
+
+    if (!cuda_graph_resume_capture(context, cuda_stream, cuda_graph))
+        return false;
+
+    if (!check_cuda(cudaStreamUpdateCaptureDependencies(cuda_stream, &body_node, 1, cudaStreamSetCaptureDependencies)))
+        return false;
 
     return true;
 }
 
+bool cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
+{
+    // if there's no body, it's a no-op
+    if (!body_graph_ret)
+        return true;
+
+    ContextGuard guard(context);
+
+    CUstream cuda_stream = static_cast<CUstream>(stream);
+
+    // Get the current stream capturing graph
+    cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone;
+    cudaGraph_t cuda_graph = NULL;
+    const cudaGraphNode_t* capture_deps = NULL;
+    size_t dep_count = 0;
+    if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
+        return false;
+
+    // abort if not capturing
+    if (!cuda_graph || capture_status != cudaStreamCaptureStatusActive)
+    {
+        wp::set_error_string("Stream is not capturing");
+        return false;
+    }
+
+    cudaGraphConditionalHandle handle;
+    if (!check_cuda(cudaGraphConditionalHandleCreate(&handle, cuda_graph)))
+        return false;
+
+    // launch a kernel to set the condition handle from condition pointer
+    CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_handle_kernel");
+    if (!kernel)
+    {
+        wp::set_error_string("Failed to get built-in conditional kernel");
+        return false;
+    }
+
+    void* kernel_args[2];
+    kernel_args[0] = &handle;
+    kernel_args[1] = &condition;
+
+    if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
+        return false;
+
+    if (!check_cuda(cudaStreamGetCaptureInfo(cuda_stream, &capture_status, nullptr, &cuda_graph, &capture_deps, &dep_count)))
+        return false;
+
+    // insert conditional graph node
+    cudaGraphNode_t while_node;
+    cudaGraphNodeParams while_params = { cudaGraphNodeTypeConditional };
+    while_params.conditional.handle = handle;
+    while_params.conditional.type = cudaGraphCondTypeWhile;
+    while_params.conditional.size = 1;
+    if (!check_cuda(cudaGraphAddNode(&while_node, cuda_graph, capture_deps, dep_count, &while_params)))
+        return false;
+
+    if (!check_cuda(cudaStreamUpdateCaptureDependencies(cuda_stream, &while_node, 1, cudaStreamSetCaptureDependencies)))
+        return false;
+
+    *body_graph_ret = while_params.conditional.phGraph_out[0];
+    *handle_ret = handle;
+
+    return true;
+}
+
+bool cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
+{
+    ContextGuard guard(context);
+
+    CUstream cuda_stream = static_cast<CUstream>(stream);
+
+    // launch a kernel to set the condition handle from condition pointer
+    CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_handle_kernel");
+    if (!kernel)
+    {
+        wp::set_error_string("Failed to get built-in conditional kernel");
+        return false;
+    }
+
+    void* kernel_args[2];
+    kernel_args[0] = &handle;
+    kernel_args[1] = &condition;
+
+    if (!check_cu(cuLaunchKernel_f(kernel, 1, 1, 1, 1, 1, 1, 0, cuda_stream, kernel_args, NULL)))
+        return false;
+
+    return true;
+}
+
+#else
+// stubs for conditional graph node API if CUDA toolkit is too old.
+
+bool cuda_graph_pause_capture(void* context, void* stream, void** graph_ret)
+{
+    wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
+    return false;
+}
+
+bool cuda_graph_resume_capture(void* context, void* stream, void* graph)
+{
+    wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
+    return false;
+}
+
+bool cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
+{
+    wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
+    return false;
+}
+
+bool cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
+{
+    wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
+    return false;
+}
+
+bool cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
+{
+    wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
+    return false;
+}
+
+bool cuda_graph_insert_child_graph(void* context, void* stream, void* child_graph)
+{
+    wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
+    return false;
+}
+
+#endif // support for conditional graph nodes
+
+
 bool cuda_graph_launch(void* graph_exec, void* stream)
 {
     // TODO: allow naming graphs?
```
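The block above wraps the conditional graph node API introduced in CUDA 12.4: create a `cudaGraphConditionalHandle` on the capturing graph, launch a tiny NVRTC-built kernel that publishes the runtime condition via `cudaGraphSetConditional()`, then add a `cudaGraphNodeTypeConditional` node whose `phGraph_out` body graphs the caller fills in. The same pattern in a self-contained sketch that builds the graph explicitly rather than through stream capture (CUDA 12.4+, compile with nvcc; illustrative only, not Warp code; a WHILE node differs only in `cudaGraphCondTypeWhile`, with the body re-executing until the handle is set to 0):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Device code: copy the runtime condition into the conditional handle,
// the same job as the NVRTC-compiled set_conditional_if_handle_kernel above.
__global__ void eval_condition(cudaGraphConditionalHandle handle, const int* flag)
{
    cudaGraphSetConditional(handle, *flag);
}

__global__ void if_body()
{
    printf("branch taken\n");
}

int main()
{
    int* d_flag;
    cudaMalloc(&d_flag, sizeof(int));
    const int one = 1;
    cudaMemcpy(d_flag, &one, sizeof(int), cudaMemcpyHostToDevice);

    cudaGraph_t graph;
    cudaGraphCreate(&graph, 0);

    // The handle lives on the graph; it resets to the default value 0 at each launch.
    cudaGraphConditionalHandle handle;
    cudaGraphConditionalHandleCreate(&handle, graph, 0, cudaGraphCondAssignDefault);

    // Kernel node that evaluates the condition on every launch.
    void* eval_args[] = { &handle, &d_flag };
    cudaKernelNodeParams eval_params = {};
    eval_params.func = (void*)eval_condition;
    eval_params.gridDim = dim3(1);
    eval_params.blockDim = dim3(1);
    eval_params.kernelParams = eval_args;
    cudaGraphNode_t eval_node;
    cudaGraphAddKernelNode(&eval_node, graph, nullptr, 0, &eval_params);

    // IF node that runs its body graph when the handle is non-zero.
    cudaGraphNodeParams cond_params = { cudaGraphNodeTypeConditional };
    cond_params.conditional.handle = handle;
    cond_params.conditional.type = cudaGraphCondTypeIf;
    cond_params.conditional.size = 1;
    cudaGraphNode_t cond_node;
    cudaGraphAddNode(&cond_node, graph, &eval_node, 1, &cond_params);

    // Populate the body graph owned by the conditional node.
    cudaGraph_t body = cond_params.conditional.phGraph_out[0];
    cudaKernelNodeParams body_params = {};
    body_params.func = (void*)if_body;
    body_params.gridDim = dim3(1);
    body_params.blockDim = dim3(1);
    cudaGraphNode_t body_node;
    cudaGraphAddKernelNode(&body_node, body, nullptr, 0, &body_params);

    cudaGraphExec_t exec;
    cudaGraphInstantiate(&exec, graph, 0);
    cudaGraphLaunch(exec, 0);
    cudaDeviceSynchronize();

    cudaGraphExecDestroy(exec);
    cudaGraphDestroy(graph);
    cudaFree(d_flag);
    return 0;
}
```

The Warp code performs the same insertion during stream capture, which is why it re-queries `cudaStreamGetCaptureInfo()` after launching the condition kernel and reparents the capture with `cudaStreamUpdateCaptureDependencies()`.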
```diff
@@ -2780,7 +3278,14 @@ bool cuda_graph_launch(void* graph_exec, void* stream)
     return result;
 }
 
-bool cuda_graph_destroy(void* context, void*
+bool cuda_graph_destroy(void* context, void* graph)
+{
+    ContextGuard guard(context);
+
+    return check_cuda(cudaGraphDestroy((cudaGraph_t)graph));
+}
+
+bool cuda_graph_exec_destroy(void* context, void* graph_exec)
 {
     ContextGuard guard(context);
 
```
```diff
@@ -2832,7 +3337,7 @@ bool write_file(const char* data, size_t size, std::string filename, const char*
 }
 #endif
 
-size_t cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, bool lineinfo, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes, int* ltoir_input_types)
+size_t cuda_compile_program(const char* cuda_src, const char* program_name, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, bool fuse_fp, bool lineinfo, bool compile_time_trace, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes, int* ltoir_input_types)
 {
     // use file extension to determine whether to output PTX or CUBIN
     const char* output_ext = strrchr(output_path, '.');
```
```diff
@@ -2919,11 +3424,11 @@
     else
         opts.push_back("--fmad=false");
 
-    std::vector<std::string>
+    std::vector<std::string> stored_options;
     for(int i = 0; i < num_cuda_include_dirs; i++)
     {
-
-        opts.push_back(
+        stored_options.push_back(std::string("--include-path=") + cuda_include_dirs[i]);
+        opts.push_back(stored_options.back().c_str());
     }
 
     opts.push_back("--device-as-default-execution-space");
```
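The `stored_options` change addresses a string-lifetime hazard: `opts` holds raw `const char*` pointers, so the `std::string` backing each `--include-path=` flag must outlive the eventual `nvrtcCompileProgram()` call; calling `c_str()` on a temporary would dangle. A distilled sketch of the pattern, using a hypothetical helper that is not Warp code:

```cpp
#include <string>
#include <vector>

// Hypothetical stand-in for the NVRTC option plumbing in cuda_compile_program().
std::vector<const char*> build_opts(const std::vector<std::string>& include_dirs,
                                    std::vector<std::string>& stored_options)
{
    std::vector<const char*> opts;

    // Reserve up front so later push_backs cannot reallocate and invalidate
    // c_str() pointers already handed to opts: short strings keep their bytes
    // inline, so a vector reallocation would move them.
    stored_options.reserve(include_dirs.size());

    for (const std::string& dir : include_dirs)
    {
        // BAD: ("--include-path=" + dir).c_str() points into a temporary that
        // dies at the end of the statement. Store the string first instead.
        stored_options.push_back("--include-path=" + dir);
        opts.push_back(stored_options.back().c_str());
    }
    return opts;  // valid while stored_options is alive and unmodified
}
```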
```diff
@@ -2936,6 +3441,16 @@
         opts.push_back("--relocatable-device-code=true");
     }
 
+    if (compile_time_trace)
+    {
+#if CUDA_VERSION >= 12080
+        stored_options.push_back(std::string("--fdevice-time-trace=") + std::string(output_path).append("_compile-time-trace.json"));
+        opts.push_back(stored_options.back().c_str());
+#else
+        fprintf(stderr, "Warp warning: CUDA version is less than 12.8, compile_time_trace is not supported\n");
+#endif
+    }
+
     nvrtcProgram prog;
     nvrtcResult res;
 
```
```diff
@@ -3162,11 +3677,11 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
     CHECK_ANY(num_include_dirs == 0);
 
     bool res = true;
-
-    CHECK_CUFFTDX(
+    cufftdxDescriptor h;
+    CHECK_CUFFTDX(cufftdxCreateDescriptor(&h));
 
-    //
-    CHECK_CUFFTDX(cufftdxSetOperatorInt64(h, cufftdxOperatorType::CUFFTDX_OPERATOR_API, cufftdxApi::
+    // CUFFTDX_API_LMEM means each thread starts with a subset of the data
+    CHECK_CUFFTDX(cufftdxSetOperatorInt64(h, cufftdxOperatorType::CUFFTDX_OPERATOR_API, cufftdxApi::CUFFTDX_API_LMEM));
     CHECK_CUFFTDX(cufftdxSetOperatorInt64(h, cufftdxOperatorType::CUFFTDX_OPERATOR_EXECUTION, commondxExecution::COMMONDX_EXECUTION_BLOCK));
     CHECK_CUFFTDX(cufftdxSetOperatorInt64(h, cufftdxOperatorType::CUFFTDX_OPERATOR_SIZE, (long long)size));
     CHECK_CUFFTDX(cufftdxSetOperatorInt64(h, cufftdxOperatorType::CUFFTDX_OPERATOR_DIRECTION, (cufftdxDirection)direction));
```
```diff
@@ -3191,7 +3706,7 @@
         res = false;
     }
 
-    CHECK_CUFFTDX(
+    CHECK_CUFFTDX(cufftdxDestroyDescriptor(h));
 
     return res;
 }
```
```diff
@@ -3207,22 +3722,22 @@
     CHECK_ANY(num_include_dirs == 0);
 
     bool res = true;
-
-    CHECK_CUBLASDX(
+    cublasdxDescriptor h;
+    CHECK_CUBLASDX(cublasdxCreateDescriptor(&h));
 
     CHECK_CUBLASDX(cublasdxSetOperatorInt64(h, cublasdxOperatorType::CUBLASDX_OPERATOR_FUNCTION, cublasdxFunction::CUBLASDX_FUNCTION_MM));
     CHECK_CUBLASDX(cublasdxSetOperatorInt64(h, cublasdxOperatorType::CUBLASDX_OPERATOR_EXECUTION, commondxExecution::COMMONDX_EXECUTION_BLOCK));
-    CHECK_CUBLASDX(cublasdxSetOperatorInt64(h, cublasdxOperatorType::CUBLASDX_OPERATOR_API, cublasdxApi::
+    CHECK_CUBLASDX(cublasdxSetOperatorInt64(h, cublasdxOperatorType::CUBLASDX_OPERATOR_API, cublasdxApi::CUBLASDX_API_SMEM));
     std::array<long long int, 3> precisions = {precision_A, precision_B, precision_C};
-    CHECK_CUBLASDX(
+    CHECK_CUBLASDX(cublasdxSetOperatorInt64s(h, cublasdxOperatorType::CUBLASDX_OPERATOR_PRECISION, 3, precisions.data()));
     CHECK_CUBLASDX(cublasdxSetOperatorInt64(h, cublasdxOperatorType::CUBLASDX_OPERATOR_SM, (long long)(arch * 10)));
     CHECK_CUBLASDX(cublasdxSetOperatorInt64(h, cublasdxOperatorType::CUBLASDX_OPERATOR_TYPE, (cublasdxType)type));
     std::array<long long int, 3> block_dim = {num_threads, 1, 1};
-    CHECK_CUBLASDX(
+    CHECK_CUBLASDX(cublasdxSetOperatorInt64s(h, cublasdxOperatorType::CUBLASDX_OPERATOR_BLOCK_DIM, block_dim.size(), block_dim.data()));
     std::array<long long int, 3> size = {M, N, K};
-    CHECK_CUBLASDX(
+    CHECK_CUBLASDX(cublasdxSetOperatorInt64s(h, cublasdxOperatorType::CUBLASDX_OPERATOR_SIZE, size.size(), size.data()));
     std::array<long long int, 3> arrangement = {arrangement_A, arrangement_B, arrangement_C};
-    CHECK_CUBLASDX(
+    CHECK_CUBLASDX(cublasdxSetOperatorInt64s(h, cublasdxOperatorType::CUBLASDX_OPERATOR_ARRANGEMENT, arrangement.size(), arrangement.data()));
 
     CHECK_CUBLASDX(cublasdxSetOptionStr(h, commondxOption::COMMONDX_OPTION_SYMBOL_NAME, symbol_name));
 
```
```diff
@@ -3236,12 +3751,12 @@
         res = false;
     }
 
-    CHECK_CUBLASDX(
+    CHECK_CUBLASDX(cublasdxDestroyDescriptor(h));
 
     return res;
 }
 
-bool cuda_compile_solver(const char* fatbin_output_path, const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int function, int precision, int fill_mode, int num_threads)
+bool cuda_compile_solver(const char* fatbin_output_path, const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int NRHS, int function, int side, int diag, int precision, int arrangement_A, int arrangement_B, int fill_mode, int num_threads)
 {
 
     CHECK_ANY(ltoir_output_path != nullptr);
```
```diff
@@ -3252,34 +3767,42 @@
 
     bool res = true;
 
-
-    CHECK_CUSOLVER(
-    long long int size
-
-
-    CHECK_CUSOLVER(
-    CHECK_CUSOLVER(
-    CHECK_CUSOLVER(
-    CHECK_CUSOLVER(
-
-
-
-
+    cusolverdxDescriptor h { 0 };
+    CHECK_CUSOLVER(cusolverdxCreateDescriptor(&h));
+    std::array<long long int, 3> size = {M, N, NRHS};
+    CHECK_CUSOLVER(cusolverdxSetOperatorInt64s(h, cusolverdxOperatorType::CUSOLVERDX_OPERATOR_SIZE, size.size(), size.data()));
+    std::array<long long int, 3> block_dim = {num_threads, 1, 1};
+    CHECK_CUSOLVER(cusolverdxSetOperatorInt64s(h, cusolverdxOperatorType::CUSOLVERDX_OPERATOR_BLOCK_DIM, block_dim.size(), block_dim.data()));
+    CHECK_CUSOLVER(cusolverdxSetOperatorInt64(h, cusolverdxOperatorType::CUSOLVERDX_OPERATOR_TYPE, cusolverdxType::CUSOLVERDX_TYPE_REAL));
+    CHECK_CUSOLVER(cusolverdxSetOperatorInt64(h, cusolverdxOperatorType::CUSOLVERDX_OPERATOR_API, cusolverdxApi::CUSOLVERDX_API_SMEM));
+    CHECK_CUSOLVER(cusolverdxSetOperatorInt64(h, cusolverdxOperatorType::CUSOLVERDX_OPERATOR_FUNCTION, (cusolverdxFunction)function));
+    if (side >= 0) {
+        CHECK_CUSOLVER(cusolverdxSetOperatorInt64(h, cusolverdxOperatorType::CUSOLVERDX_OPERATOR_SIDE, (cusolverdxSide)side));
+    }
+    if (diag >= 0) {
+        CHECK_CUSOLVER(cusolverdxSetOperatorInt64(h, cusolverdxOperatorType::CUSOLVERDX_OPERATOR_DIAG, (cusolverdxDiag)diag));
+    }
+    CHECK_CUSOLVER(cusolverdxSetOperatorInt64(h, cusolverdxOperatorType::CUSOLVERDX_OPERATOR_EXECUTION, commondxExecution::COMMONDX_EXECUTION_BLOCK));
+    CHECK_CUSOLVER(cusolverdxSetOperatorInt64(h, cusolverdxOperatorType::CUSOLVERDX_OPERATOR_PRECISION, (commondxPrecision)precision));
+    std::array<long long int, 2> arrangement = {arrangement_A, arrangement_B};
+    CHECK_CUSOLVER(cusolverdxSetOperatorInt64s(h, cusolverdxOperatorType::CUSOLVERDX_OPERATOR_ARRANGEMENT, arrangement.size(), arrangement.data()));
+    CHECK_CUSOLVER(cusolverdxSetOperatorInt64(h, cusolverdxOperatorType::CUSOLVERDX_OPERATOR_FILL_MODE, (cusolverdxFillMode)fill_mode));
+    CHECK_CUSOLVER(cusolverdxSetOperatorInt64(h, cusolverdxOperatorType::CUSOLVERDX_OPERATOR_SM, (long long)(arch * 10)));
 
-    CHECK_CUSOLVER(
+    CHECK_CUSOLVER(cusolverdxSetOptionStr(h, commondxOption::COMMONDX_OPTION_SYMBOL_NAME, symbol_name));
 
     size_t lto_size = 0;
-    CHECK_CUSOLVER(
+    CHECK_CUSOLVER(cusolverdxGetLTOIRSize(h, &lto_size));
 
     std::vector<char> lto(lto_size);
-    CHECK_CUSOLVER(
+    CHECK_CUSOLVER(cusolverdxGetLTOIR(h, lto.size(), lto.data()));
 
     // This fatbin is universal, ie it is the same for any instantiations of a cusolver device function
     size_t fatbin_size = 0;
-    CHECK_CUSOLVER(
+    CHECK_CUSOLVER(cusolverdxGetUniversalFATBINSize(h, &fatbin_size));
 
     std::vector<char> fatbin(fatbin_size);
-    CHECK_CUSOLVER(
+    CHECK_CUSOLVER(cusolverdxGetUniversalFATBIN(h, fatbin.size(), fatbin.data()));
 
     if(!write_file(lto.data(), lto.size(), ltoir_output_path, "wb")) {
         res = false;
```
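The solver entry point gains `NRHS`, `side`, `diag`, and per-matrix `arrangement` parameters, with negative `side`/`diag` meaning the optional operator is simply not set (see the `if (side >= 0)` guards above). A hypothetical invocation, with every numeric value a placeholder rather than a real libcusolverdx enum:

```cpp
// Illustrative only: the parameter order comes from the new signature; the
// numeric enum values and the include path are placeholders, not taken from
// this diff or from libcusolverdx headers.
bool ok = cuda_compile_solver(
    "solver.fatbin",            // fatbin_output_path
    "solver.ltoir",             // ltoir_output_path
    "solver_symbol",            // symbol_name
    0, nullptr,                 // num_include_dirs, include_dirs
    "/path/to/mathdx/include",  // mathdx_include_dir (assumed)
    80,                         // arch: sm_80, so the SM operator becomes 800
    32, 32, 1,                  // M, N, NRHS
    0,                          // function (placeholder cusolverdxFunction value)
    -1, -1,                     // side, diag: negative means operator not set
    0,                          // precision (placeholder commondxPrecision value)
    0, 0,                       // arrangement_A, arrangement_B (placeholders)
    0,                          // fill_mode (placeholder cusolverdxFillMode value)
    256);                       // num_threads: block_dim becomes {256, 1, 1}
```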
```diff
@@ -3289,7 +3812,7 @@
         res = false;
     }
 
-    CHECK_CUSOLVER(
+    CHECK_CUSOLVER(cusolverdxDestroyDescriptor(h));
 
     return res;
 }
```