warp-lang 1.8.0-py3-none-manylinux_2_34_aarch64.whl → 1.9.0-py3-none-manylinux_2_34_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic.
- warp/__init__.py +282 -103
- warp/__init__.pyi +482 -110
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +93 -30
- warp/build_dll.py +48 -63
- warp/builtins.py +955 -137
- warp/codegen.py +327 -209
- warp/config.py +1 -1
- warp/context.py +1363 -800
- warp/examples/core/example_marching_cubes.py +1 -0
- warp/examples/core/example_render_opengl.py +100 -3
- warp/examples/fem/example_apic_fluid.py +98 -52
- warp/examples/fem/example_convection_diffusion_dg.py +25 -4
- warp/examples/fem/example_diffusion_mgpu.py +8 -3
- warp/examples/fem/utils.py +68 -22
- warp/examples/interop/example_jax_callable.py +34 -4
- warp/examples/interop/example_jax_kernel.py +27 -1
- warp/fabric.py +1 -1
- warp/fem/cache.py +27 -19
- warp/fem/domain.py +2 -2
- warp/fem/field/nodal_field.py +2 -2
- warp/fem/field/virtual.py +266 -166
- warp/fem/geometry/geometry.py +5 -5
- warp/fem/integrate.py +200 -91
- warp/fem/space/restriction.py +4 -0
- warp/fem/space/shape/tet_shape_function.py +3 -10
- warp/jax_experimental/custom_call.py +1 -1
- warp/jax_experimental/ffi.py +203 -54
- warp/marching_cubes.py +708 -0
- warp/native/array.h +103 -8
- warp/native/builtin.h +90 -9
- warp/native/bvh.cpp +64 -28
- warp/native/bvh.cu +58 -58
- warp/native/bvh.h +2 -2
- warp/native/clang/clang.cpp +7 -7
- warp/native/coloring.cpp +13 -3
- warp/native/crt.cpp +2 -2
- warp/native/crt.h +3 -5
- warp/native/cuda_util.cpp +42 -11
- warp/native/cuda_util.h +10 -4
- warp/native/exports.h +1842 -1908
- warp/native/fabric.h +2 -1
- warp/native/hashgrid.cpp +37 -37
- warp/native/hashgrid.cu +2 -2
- warp/native/initializer_array.h +1 -1
- warp/native/intersect.h +4 -4
- warp/native/mat.h +1913 -119
- warp/native/mathdx.cpp +43 -43
- warp/native/mesh.cpp +24 -24
- warp/native/mesh.cu +26 -26
- warp/native/mesh.h +5 -3
- warp/native/nanovdb/GridHandle.h +179 -12
- warp/native/nanovdb/HostBuffer.h +8 -7
- warp/native/nanovdb/NanoVDB.h +517 -895
- warp/native/nanovdb/NodeManager.h +323 -0
- warp/native/nanovdb/PNanoVDB.h +2 -2
- warp/native/quat.h +337 -16
- warp/native/rand.h +7 -7
- warp/native/range.h +7 -1
- warp/native/reduce.cpp +10 -10
- warp/native/reduce.cu +13 -14
- warp/native/runlength_encode.cpp +2 -2
- warp/native/runlength_encode.cu +5 -5
- warp/native/scan.cpp +3 -3
- warp/native/scan.cu +4 -4
- warp/native/sort.cpp +10 -10
- warp/native/sort.cu +22 -22
- warp/native/sparse.cpp +8 -8
- warp/native/sparse.cu +14 -14
- warp/native/spatial.h +366 -17
- warp/native/svd.h +23 -8
- warp/native/temp_buffer.h +2 -2
- warp/native/tile.h +303 -70
- warp/native/tile_radix_sort.h +5 -1
- warp/native/tile_reduce.h +16 -25
- warp/native/tuple.h +2 -2
- warp/native/vec.h +385 -18
- warp/native/volume.cpp +54 -54
- warp/native/volume.cu +1 -1
- warp/native/volume.h +2 -1
- warp/native/volume_builder.cu +30 -37
- warp/native/warp.cpp +150 -149
- warp/native/warp.cu +337 -193
- warp/native/warp.h +227 -226
- warp/optim/linear.py +736 -271
- warp/render/imgui_manager.py +289 -0
- warp/render/render_opengl.py +137 -57
- warp/render/render_usd.py +0 -1
- warp/sim/collide.py +1 -2
- warp/sim/graph_coloring.py +2 -2
- warp/sim/integrator_vbd.py +10 -2
- warp/sparse.py +559 -176
- warp/tape.py +2 -0
- warp/tests/aux_test_module_aot.py +7 -0
- warp/tests/cuda/test_async.py +3 -3
- warp/tests/cuda/test_conditional_captures.py +101 -0
- warp/tests/geometry/test_marching_cubes.py +233 -12
- warp/tests/sim/test_cloth.py +89 -6
- warp/tests/sim/test_coloring.py +82 -7
- warp/tests/test_array.py +56 -5
- warp/tests/test_assert.py +53 -0
- warp/tests/test_atomic_cas.py +127 -114
- warp/tests/test_codegen.py +3 -2
- warp/tests/test_context.py +8 -15
- warp/tests/test_enum.py +136 -0
- warp/tests/test_examples.py +2 -2
- warp/tests/test_fem.py +45 -2
- warp/tests/test_fixedarray.py +229 -0
- warp/tests/test_func.py +18 -15
- warp/tests/test_future_annotations.py +7 -5
- warp/tests/test_linear_solvers.py +30 -0
- warp/tests/test_map.py +1 -1
- warp/tests/test_mat.py +1540 -378
- warp/tests/test_mat_assign_copy.py +178 -0
- warp/tests/test_mat_constructors.py +574 -0
- warp/tests/test_module_aot.py +287 -0
- warp/tests/test_print.py +69 -0
- warp/tests/test_quat.py +162 -34
- warp/tests/test_quat_assign_copy.py +145 -0
- warp/tests/test_reload.py +2 -1
- warp/tests/test_sparse.py +103 -0
- warp/tests/test_spatial.py +140 -34
- warp/tests/test_spatial_assign_copy.py +160 -0
- warp/tests/test_static.py +48 -0
- warp/tests/test_struct.py +43 -3
- warp/tests/test_tape.py +38 -0
- warp/tests/test_types.py +0 -20
- warp/tests/test_vec.py +216 -441
- warp/tests/test_vec_assign_copy.py +143 -0
- warp/tests/test_vec_constructors.py +325 -0
- warp/tests/tile/test_tile.py +206 -152
- warp/tests/tile/test_tile_cholesky.py +605 -0
- warp/tests/tile/test_tile_load.py +169 -0
- warp/tests/tile/test_tile_mathdx.py +2 -558
- warp/tests/tile/test_tile_matmul.py +179 -0
- warp/tests/tile/test_tile_mlp.py +1 -1
- warp/tests/tile/test_tile_reduce.py +100 -11
- warp/tests/tile/test_tile_shared_memory.py +16 -16
- warp/tests/tile/test_tile_sort.py +59 -55
- warp/tests/unittest_suites.py +16 -0
- warp/tests/walkthrough_debug.py +1 -1
- warp/thirdparty/unittest_parallel.py +108 -9
- warp/types.py +554 -264
- warp/utils.py +68 -86
- {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
- {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/RECORD +150 -138
- warp/native/marching.cpp +0 -19
- warp/native/marching.cu +0 -514
- warp/native/marching.h +0 -19
- {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
warp/jax_experimental/ffi.py
CHANGED
(Diff lines ending in "…" are truncated in the source view.)

@@ -16,7 +16,8 @@
 import ctypes
 import threading
 import traceback
-from …
+from enum import IntEnum
+from typing import Callable, Optional

 import jax

@@ -28,10 +29,17 @@ from warp.types import array_t, launch_bounds_t, strides_from_shape, type_to_war…
 from .xla_ffi import *


+class GraphMode(IntEnum):
+    NONE = 0  # don't capture a graph
+    JAX = 1  # let JAX capture a graph
+    WARP = 2  # let Warp capture a graph
+
+
 class FfiArg:
-    def __init__(self, name, type):
+    def __init__(self, name, type, in_out=False):
         self.name = name
         self.type = type
+        self.in_out = in_out
         self.is_array = isinstance(type, wp.array)

         if self.is_array:
@@ -65,7 +73,7 @@ class FfiLaunchDesc:


 class FfiKernel:
-    def __init__(self, kernel, num_outputs, vmap_method, launch_dims, output_dims):
+    def __init__(self, kernel, num_outputs, vmap_method, launch_dims, output_dims, in_out_argnames):
         self.kernel = kernel
         self.name = generate_unique_name(kernel.func)
         self.num_outputs = num_outputs
@@ -76,17 +84,28 @@ class FfiKernel:
         self.launch_id = 0
         self.launch_descriptors = {}

+        in_out_argnames_list = in_out_argnames or []
+        in_out_argnames = set(in_out_argnames_list)
+        if len(in_out_argnames_list) != len(in_out_argnames):
+            raise AssertionError("in_out_argnames must not contain duplicate names")
+
         self.num_kernel_args = len(kernel.adj.args)
-        self.…
+        self.num_in_out = len(in_out_argnames)
+        self.num_inputs = self.num_kernel_args - num_outputs + self.num_in_out
         if self.num_outputs < 1:
             raise ValueError("At least one output is required")
         if self.num_outputs > self.num_kernel_args:
             raise ValueError("Number of outputs cannot be greater than the number of kernel arguments")
+        if self.num_outputs < self.num_in_out:
+            raise ValueError("Number of outputs cannot be smaller than the number of in_out_argnames")

         # process input args
         self.input_args = []
         for i in range(self.num_inputs):
-            …
+            arg_name = kernel.adj.args[i].label
+            arg = FfiArg(arg_name, kernel.adj.args[i].type, arg_name in in_out_argnames)
+            if arg_name in in_out_argnames:
+                in_out_argnames.remove(arg_name)
             if arg.is_array:
                 # keep track of the first input array argument
                 if self.first_array_arg is None:
@@ -96,11 +115,30 @@ class FfiKernel:
         # process output args
         self.output_args = []
         for i in range(self.num_inputs, self.num_kernel_args):
-            …
+            arg_name = kernel.adj.args[i].label
+            if arg_name in in_out_argnames:
+                raise AssertionError(
+                    f"Expected an output-only argument for argument {arg_name}."
+                    " in_out arguments should be placed before output-only arguments."
+                )
+            arg = FfiArg(arg_name, kernel.adj.args[i].type, False)
             if not arg.is_array:
                 raise TypeError("All output arguments must be arrays")
             self.output_args.append(arg)

+        if in_out_argnames:
+            raise ValueError(f"in_out_argnames: '{in_out_argnames}' did not match any function argument names.")
+
+        # Build input output aliases.
+        out_id = 0
+        input_output_aliases = {}
+        for in_id, arg in enumerate(self.input_args):
+            if not arg.in_out:
+                continue
+            input_output_aliases[in_id] = out_id
+            out_id += 1
+        self.input_output_aliases = input_output_aliases
+
         # register the callback
         FFI_CCALLFUNC = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.POINTER(XLA_FFI_CallFrame))
         self.callback_func = FFI_CCALLFUNC(lambda call_frame: self.ffi_callback(call_frame))
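The input_output_aliases map built above tells JAX which input buffer each in-out argument shares with an output, so XLA can reuse the donated input buffer instead of allocating a new one. A standalone sketch of how the alias map follows from argument order (build_input_output_aliases is a hypothetical helper for illustration, not part of the package):

# Hypothetical illustration: in_out_flags[i] is True when input i is also written in place.
def build_input_output_aliases(in_out_flags):
    aliases = {}
    out_id = 0
    for in_id, is_in_out in enumerate(in_out_flags):
        if is_in_out:
            aliases[in_id] = out_id  # input in_id aliases output out_id
            out_id += 1
    return aliases

# Two in-out inputs among three inputs map to outputs 0 and 1.
assert build_input_output_aliases([True, False, True]) == {0: 0, 2: 1}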
@@ -121,6 +159,9 @@ class FfiKernel:
         if vmap_method is None:
             vmap_method = self.vmap_method

+        # output types
+        out_types = []
+
         # process inputs
         static_inputs = {}
         for i in range(num_inputs):
@@ -150,6 +191,10 @@ class FfiKernel:
                 # stash the value to be retrieved by callback
                 static_inputs[input_arg.name] = input_arg.type(input_value)

+            # append in-out arg to output types
+            if input_arg.in_out:
+                out_types.append(get_jax_output_type(input_arg, input_value.shape))
+
         # launch dimensions
         if launch_dims is None:
             # use the shape of the first input array
@@ -162,8 +207,7 @@ class FfiKernel:
         else:
             launch_dims = tuple(launch_dims)

-        # output
-        out_types = []
+        # output shapes
         if isinstance(output_dims, dict):
             # assume a dictionary of shapes keyed on argument name
             for output_arg in self.output_args:
@@ -185,6 +229,7 @@ class FfiKernel:
             self.name,
             out_types,
             vmap_method=vmap_method,
+            input_output_aliases=self.input_output_aliases,
         )

         # ensure the kernel module is loaded before the callback, otherwise graph capture may fail
@@ -238,9 +283,8 @@ class FfiKernel:

         arg_refs = []

-        # …
-        for i in …
-            input_arg = self.input_args[i]
+        # input and in-out args
+        for i, input_arg in enumerate(self.input_args):
             if input_arg.is_array:
                 buffer = inputs[i].contents
                 shape = buffer.dims[: input_arg.type.ndim]
@@ -255,10 +299,9 @@ class FfiKernel:
             kernel_params[i + 1] = ctypes.addressof(arg)
             arg_refs.append(arg)  # keep a reference

-        # …
-        for i in …
-            …
-            buffer = outputs[i].contents
+        # pure output args (skip in-out FFI buffers)
+        for i, output_arg in enumerate(self.output_args):
+            buffer = outputs[i + self.num_in_out].contents
             shape = buffer.dims[: output_arg.type.ndim]
             strides = strides_from_shape(shape, output_arg.type.dtype)
             arg = array_t(buffer.data, 0, output_arg.type.ndim, shape, strides)
@@ -274,7 +317,7 @@ class FfiKernel:
         assert hooks.forward, "Failed to find kernel entry point"

         # launch the kernel
-        wp.context.runtime.core.…
+        wp.context.runtime.core.wp_cuda_launch_kernel(
             device.context,
             hooks.forward,
             launch_bounds.size,
@@ -295,29 +338,38 @@ class FfiKernel:
 class FfiCallDesc:
     def __init__(self, static_inputs):
         self.static_inputs = static_inputs
+        self.captures = {}


 class FfiCallable:
-    def __init__(self, func, num_outputs, …
+    def __init__(self, func, num_outputs, graph_mode, vmap_method, output_dims, in_out_argnames):
         self.func = func
         self.name = generate_unique_name(func)
         self.num_outputs = num_outputs
         self.vmap_method = vmap_method
-        self.…
+        self.graph_mode = graph_mode
         self.output_dims = output_dims
         self.first_array_arg = None
         self.call_id = 0
         self.call_descriptors = {}

+        in_out_argnames_list = in_out_argnames or []
+        in_out_argnames = set(in_out_argnames_list)
+        if len(in_out_argnames_list) != len(in_out_argnames):
+            raise AssertionError("in_out_argnames must not contain duplicate names")
+
         # get arguments and annotations
         argspec = get_full_arg_spec(func)

         num_args = len(argspec.args)
-        self.…
+        self.num_in_out = len(in_out_argnames)
+        self.num_inputs = num_args - num_outputs + self.num_in_out
         if self.num_outputs < 1:
             raise ValueError("At least one output is required")
         if self.num_outputs > num_args:
             raise ValueError("Number of outputs cannot be greater than the number of kernel arguments")
+        if self.num_outputs < self.num_in_out:
+            raise ValueError("Number of outputs cannot be smaller than the number of in_out_argnames")

         if len(argspec.annotations) < num_args:
             raise RuntimeError(f"Incomplete argument annotations on function {self.name}")
@@ -329,16 +381,45 @@ class FfiCallable:
             if arg_name == "return":
                 if arg_type is not None:
                     raise TypeError("Function must not return a value")
+                continue
             else:
-                arg = FfiArg(arg_name, arg_type)
+                arg = FfiArg(arg_name, arg_type, arg_name in in_out_argnames)
+                if arg_name in in_out_argnames:
+                    in_out_argnames.remove(arg_name)
                 if arg.is_array:
                     if arg_idx < self.num_inputs and self.first_array_arg is None:
                         self.first_array_arg = arg_idx
                 self.args.append(arg)
+
+                if arg.in_out and arg_idx >= self.num_inputs:
+                    raise AssertionError(
+                        f"Expected an output-only argument for argument {arg_name}."
+                        " in_out arguments should be placed before output-only arguments."
+                    )
+
                 arg_idx += 1

-        …
-        …
+        if in_out_argnames:
+            raise ValueError(f"in_out_argnames: '{in_out_argnames}' did not match any function argument names.")
+
+        self.input_args = self.args[: self.num_inputs]  # includes in-out args
+        self.output_args = self.args[self.num_inputs :]  # pure output args
+
+        # Buffer indices for array arguments in callback.
+        # In-out buffers are the same pointers in the XLA call frame,
+        # so we only include them for inputs and skip them for outputs.
+        self.array_input_indices = [i for i, arg in enumerate(self.input_args) if arg.is_array]
+        self.array_output_indices = list(range(self.num_in_out, self.num_outputs))
+
+        # Build input output aliases.
+        out_id = 0
+        input_output_aliases = {}
+        for in_id, arg in enumerate(self.input_args):
+            if not arg.in_out:
+                continue
+            input_output_aliases[in_id] = out_id
+            out_id += 1
+        self.input_output_aliases = input_output_aliases

         # register the callback
         FFI_CCALLFUNC = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.POINTER(XLA_FFI_CallFrame))
@@ -350,7 +431,9 @@ class FfiCallable:
     def __call__(self, *args, output_dims=None, vmap_method=None):
         num_inputs = len(args)
         if num_inputs != self.num_inputs:
-            …
+            input_names = ", ".join(arg.name for arg in self.input_args)
+            s = "" if self.num_inputs == 1 else "s"
+            raise ValueError(f"Expected {self.num_inputs} input{s} ({input_names}), but got {num_inputs}")

         # default argument fallback
         if vmap_method is None:
@@ -358,6 +441,9 @@ class FfiCallable:
         if output_dims is None:
             output_dims = self.output_dims

+        # output types
+        out_types = []
+
         # process inputs
         static_inputs = {}
         for i in range(num_inputs):
@@ -387,12 +473,11 @@ class FfiCallable:
                 # stash the value to be retrieved by callback
                 static_inputs[input_arg.name] = input_arg.type(input_value)

-            …
-            …
-            …
+            # append in-out arg to output types
+            if input_arg.in_out:
+                out_types.append(get_jax_output_type(input_arg, input_value.shape))

-        # output
-        out_types = []
+        # output shapes
         if isinstance(output_dims, dict):
             # assume a dictionary of shapes keyed on argument name
             for output_arg in self.output_args:
@@ -402,7 +487,9 @@ class FfiCallable:
                 out_types.append(get_jax_output_type(output_arg, dims))
         else:
             if output_dims is None:
-                …
+                if self.first_array_arg is None:
+                    raise ValueError("Unable to determine output dimensions")
+                output_dims = get_warp_shape(self.input_args[self.first_array_arg], args[self.first_array_arg].shape)
             elif isinstance(output_dims, int):
                 output_dims = (output_dims,)
             # assume same dimensions for all outputs
@@ -413,6 +500,7 @@ class FfiCallable:
             self.name,
             out_types,
             vmap_method=vmap_method,
+            input_output_aliases=self.input_output_aliases,
             # has_side_effect=True,  # force this function to execute even if outputs aren't used
         )

@@ -430,11 +518,10 @@ class FfiCallable:

     def ffi_callback(self, call_frame):
         try:
-            # TODO Try-catch around the body and return XLA_FFI_Error on error.
-            extension = call_frame.contents.extension_start
             # On the first call, XLA runtime will query the API version and traits
             # metadata using the |extension| field. Let us respond to that query
             # if the metadata extension is present.
+            extension = call_frame.contents.extension_start
             if extension:
                 # Try to set the version metadata.
                 if extension.contents.type == XLA_FFI_Extension_Type.Metadata:
@@ -442,15 +529,19 @@ class FfiCallable:
                     metadata_ext.contents.metadata.contents.api_version.major_version = 0
                     metadata_ext.contents.metadata.contents.api_version.minor_version = 1
                     # Turn on CUDA graphs for this handler.
-                    if self.…
+                    if self.graph_mode is GraphMode.JAX:
                         metadata_ext.contents.metadata.contents.traits = (
                             XLA_FFI_Handler_TraitsBits.COMMAND_BUFFER_COMPATIBLE
                         )
                     return None

             # retrieve call info
-            …
-            …
+            # NOTE: this assumes that there's only one attribute - call_id (int64).
+            # A more general but slower approach is this:
+            #     attrs = decode_attrs(call_frame.contents.attrs)
+            #     call_id = int(attrs["call_id"])
+            attr = ctypes.cast(call_frame.contents.attrs.attrs[0], ctypes.POINTER(XLA_FFI_Scalar)).contents
+            call_id = ctypes.cast(attr.value, ctypes.POINTER(ctypes.c_int64)).contents.value
             call_desc = self.call_descriptors[call_id]

             num_inputs = call_frame.contents.args.size
@@ -462,16 +553,42 @@ class FfiCallable:
             assert num_inputs == self.num_inputs
             assert num_outputs == self.num_outputs

-            device = wp.device_from_jax(get_jax_device())
             cuda_stream = get_stream_from_callframe(call_frame.contents)
+
+            if self.graph_mode == GraphMode.WARP:
+                # check if we already captured an identical call
+                ip = [inputs[i].contents.data for i in self.array_input_indices]
+                op = [outputs[i].contents.data for i in self.array_output_indices]
+                buffer_hash = hash((*ip, *op))
+                capture = call_desc.captures.get(buffer_hash)
+
+                # launch existing graph
+                if capture is not None:
+                    # NOTE: We use the native graph API to avoid overhead with obtaining Stream and Device objects in Python.
+                    # This code should match wp.capture_launch().
+                    graph = capture.graph
+                    if graph.graph_exec is None:
+                        g = ctypes.c_void_p()
+                        if not wp.context.runtime.core.wp_cuda_graph_create_exec(
+                            graph.device.context, cuda_stream, graph.graph, ctypes.byref(g)
+                        ):
+                            raise RuntimeError(f"Graph creation error: {wp.context.runtime.get_error_string()}")
+                        graph.graph_exec = g
+
+                    if not wp.context.runtime.core.wp_cuda_graph_launch(graph.graph_exec, cuda_stream):
+                        raise RuntimeError(f"Graph launch error: {wp.context.runtime.get_error_string()}")
+
+                    # early out
+                    return
+
+            device = wp.device_from_jax(get_jax_device())
             stream = wp.Stream(device, cuda_stream=cuda_stream)

             # reconstruct the argument list
             arg_list = []

-            # …
-            for i in …
-                arg = self.input_args[i]
+            # input and in-out args
+            for i, arg in enumerate(self.input_args):
                 if arg.is_array:
                     buffer = inputs[i].contents
                     shape = buffer.dims[: buffer.rank - arg.dtype_ndim]
@@ -482,10 +599,9 @@ class FfiCallable:
                     value = call_desc.static_inputs[arg.name]
                     arg_list.append(value)

-            # …
-            for i in …
-                …
-                buffer = outputs[i].contents
+            # pure output args (skip in-out FFI buffers)
+            for i, arg in enumerate(self.output_args):
+                buffer = outputs[i + self.num_in_out].contents
                 shape = buffer.dims[: buffer.rank - arg.dtype_ndim]
                 arr = wp.array(ptr=buffer.data, dtype=arg.type.dtype, shape=shape, device=device)
                 arg_list.append(arr)
@@ -493,11 +609,20 @@ class FfiCallable:
             # call the Python function with reconstructed arguments
             with wp.ScopedStream(stream, sync_enter=False):
                 if stream.is_capturing:
-                    …
+                    # capturing with JAX
+                    with wp.ScopedCapture(external=True) as capture:
                         self.func(*arg_list)
                     # keep a reference to the capture object to prevent required modules getting unloaded
                     call_desc.capture = capture
+                elif self.graph_mode == GraphMode.WARP:
+                    # capturing with WARP
+                    with wp.ScopedCapture() as capture:
+                        self.func(*arg_list)
+                    wp.capture_launch(capture.graph)
+                    # keep a reference to the capture object and reuse it with same buffers
+                    call_desc.captures[buffer_hash] = capture
                 else:
+                    # not capturing
                     self.func(*arg_list)

         except Exception as e:
@@ -515,7 +640,9 @@ _FFI_KERNEL_REGISTRY: dict[str, FfiKernel] = {}
 _FFI_REGISTRY_LOCK = threading.Lock()


-def jax_kernel(…
+def jax_kernel(
+    kernel, num_outputs=1, vmap_method="broadcast_all", launch_dims=None, output_dims=None, in_out_argnames=None
+):
     """Create a JAX callback from a Warp kernel.

     NOTE: This is an experimental feature under development.
@@ -523,6 +650,7 @@ def jax_kernel(kernel, num_outputs=1, vmap_method="broadcast_all", launch_dims=N…
     Args:
         kernel: The Warp kernel to launch.
         num_outputs: Optional. Specify the number of output arguments if greater than 1.
+            This must include the number of ``in_out_arguments``.
         vmap_method: Optional. String specifying how the callback transforms under ``vmap()``.
             This argument can also be specified for individual calls.
         launch_dims: Optional. Specify the default kernel launch dimensions. If None, launch
@@ -531,12 +659,13 @@ def jax_kernel(kernel, num_outputs=1, vmap_method="broadcast_all", launch_dims=N…
         output_dims: Optional. Specify the default dimensions of output arrays. If None, output
             dimensions are inferred from the launch dimensions.
             This argument can also be specified for individual calls.
+        in_out_argnames: Optional. Names of input-output arguments.

     Limitations:
         - All kernel arguments must be contiguous arrays or scalars.
         - Scalars must be static arguments in JAX.
-        - Input arguments …
-        - There must be at least one output argument.
+        - Input and input-output arguments must precede the output arguments in the ``kernel`` definition.
+        - There must be at least one output or input-output argument.
         - Only the CUDA backend is supported.
     """
     key = (
@@ -549,7 +678,7 @@ def jax_kernel(kernel, num_outputs=1, vmap_method="broadcast_all", launch_dims=N…

     with _FFI_REGISTRY_LOCK:
         if key not in _FFI_KERNEL_REGISTRY:
-            new_kernel = FfiKernel(kernel, num_outputs, vmap_method, launch_dims, output_dims)
+            new_kernel = FfiKernel(kernel, num_outputs, vmap_method, launch_dims, output_dims, in_out_argnames)
             _FFI_KERNEL_REGISTRY[key] = new_kernel

     return _FFI_KERNEL_REGISTRY[key]
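A minimal usage sketch of the new in_out_argnames option on jax_kernel (the kernel and data below are illustrative, not taken from the package's examples). The argument named in in_out_argnames is passed as an input, counts toward num_outputs, and its updated buffer is returned as an output:

import jax.numpy as jnp
import warp as wp
from warp.jax_experimental.ffi import jax_kernel

@wp.kernel
def scale_inplace(state: wp.array(dtype=float), factor: wp.array(dtype=float)):
    i = wp.tid()
    state[i] = state[i] * factor[i]

# "state" is both read and written; it is an input and also the single output.
jax_scale = jax_kernel(scale_inplace, num_outputs=1, in_out_argnames=["state"])

# The call can also appear inside a jax.jit-compiled function; the result is the updated "state" buffer.
result = jax_scale(jnp.ones(16, dtype=jnp.float32), jnp.full(16, 2.0, dtype=jnp.float32))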
@@ -558,9 +687,11 @@
 def jax_callable(
     func: Callable,
     num_outputs: int = 1,
-    graph_compatible: bool = …
-    …
+    graph_compatible: Optional[bool] = None,  # deprecated
+    graph_mode: GraphMode = GraphMode.JAX,
+    vmap_method: Optional[str] = "broadcast_all",
     output_dims=None,
+    in_out_argnames=None,
 ):
     """Create a JAX callback from an annotated Python function.

@@ -571,31 +702,50 @@ def jax_callable(
     Args:
         func: The Python function to call.
         num_outputs: Optional. Specify the number of output arguments if greater than 1.
+            This must include the number of ``in_out_arguments``.
         graph_compatible: Optional. Whether the function can be called during CUDA graph capture.
+            This argument is deprecated, use ``graph_mode`` instead.
+        graph_mode: Optional. CUDA graph capture mode.
+            ``GraphMode.JAX`` (default): Let JAX capture the graph, which may be used as a subgraph in an enclosing capture.
+            ``GraphMode.WARP``: Let Warp capture the graph. Use this mode when the callable cannot be used as a subraph,
+                such as when the callable uses conditional graph nodes.
+            ``GraphMode.NONE``: Disable graph capture. Use when the callable performs operations that are not legal in a graph,
+                such as host synchronization.
         vmap_method: Optional. String specifying how the callback transforms under ``vmap()``.
             This argument can also be specified for individual calls.
         output_dims: Optional. Specify the default dimensions of output arrays.
             If ``None``, output dimensions are inferred from the launch dimensions.
             This argument can also be specified for individual calls.
+        in_out_argnames: Optional. Names of input-output arguments.

     Limitations:
         - All kernel arguments must be contiguous arrays or scalars.
         - Scalars must be static arguments in JAX.
-        - Input arguments …
-        - There must be at least one output argument.
+        - Input and input-output arguments must precede the output arguments in the ``func`` definition.
+        - There must be at least one output or input-output argument.
         - Only the CUDA backend is supported.
     """
+
+    if graph_compatible is not None:
+        wp.utils.warn(
+            "The `graph_compatible` argument is deprecated, use `graph_mode` instead.",
+            DeprecationWarning,
+            stacklevel=3,
+        )
+        if graph_compatible is False:
+            graph_mode = GraphMode.NONE
+
     key = (
         func,
         num_outputs,
-        …
+        graph_mode,
         vmap_method,
         tuple(sorted(output_dims.items())) if output_dims else output_dims,
     )

     with _FFI_REGISTRY_LOCK:
         if key not in _FFI_CALLABLE_REGISTRY:
-            new_callable = FfiCallable(func, num_outputs, …
+            new_callable = FfiCallable(func, num_outputs, graph_mode, vmap_method, output_dims, in_out_argnames)
             _FFI_CALLABLE_REGISTRY[key] = new_callable

     return _FFI_CALLABLE_REGISTRY[key]
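A hedged sketch of the new graph_mode option on jax_callable (the function, kernel, and argument names are illustrative, not from the package). GraphMode.WARP asks Warp to capture the callable into its own CUDA graph, keyed on the buffer addresses, and replay that graph on later calls with the same buffers:

import warp as wp
from warp.jax_experimental.ffi import GraphMode, jax_callable

@wp.kernel
def axpy(x: wp.array(dtype=float), y: wp.array(dtype=float)):
    i = wp.tid()
    y[i] = 2.0 * x[i] + y[i]

def step(x: wp.array(dtype=float), y: wp.array(dtype=float)):
    # any number of Warp launches; "y" is updated in place
    wp.launch(axpy, dim=x.shape, inputs=[x, y])

# "y" is an in-out argument; Warp captures and replays the CUDA graph.
jax_step = jax_callable(step, num_outputs=1, graph_mode=GraphMode.WARP, in_out_argnames=["y"])

Passing graph_compatible=False still works, but it now emits a deprecation warning and is mapped to GraphMode.NONE.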
@@ -626,7 +776,6 @@ def register_ffi_callback(name: str, func: Callable, graph_compatible: bool = Tr…

     def ffi_callback(call_frame):
         try:
-            # TODO Try-catch around the body and return XLA_FFI_Error on error.
             extension = call_frame.contents.extension_start
             # On the first call, XLA runtime will query the API version and traits
             # metadata using the |extension| field. Let us respond to that query