warp-lang 1.6.2-py3-none-win_amd64.whl → 1.7.0-py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warp/__init__.py +7 -1
- warp/bin/warp-clang.dll +0 -0
- warp/bin/warp.dll +0 -0
- warp/build.py +410 -0
- warp/build_dll.py +6 -14
- warp/builtins.py +452 -362
- warp/codegen.py +179 -119
- warp/config.py +42 -6
- warp/context.py +490 -271
- warp/dlpack.py +8 -6
- warp/examples/assets/nonuniform.usd +0 -0
- warp/examples/assets/nvidia_logo.png +0 -0
- warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
- warp/examples/core/example_sample_mesh.py +300 -0
- warp/examples/fem/example_apic_fluid.py +1 -1
- warp/examples/fem/example_burgers.py +2 -2
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_distortion_energy.py +1 -1
- warp/examples/fem/example_magnetostatics.py +6 -6
- warp/examples/fem/utils.py +9 -3
- warp/examples/interop/example_jax_callable.py +116 -0
- warp/examples/interop/example_jax_ffi_callback.py +132 -0
- warp/examples/interop/example_jax_kernel.py +205 -0
- warp/examples/optim/example_fluid_checkpoint.py +497 -0
- warp/examples/tile/example_tile_matmul.py +2 -4
- warp/fem/__init__.py +11 -1
- warp/fem/adaptivity.py +4 -4
- warp/fem/field/nodal_field.py +22 -68
- warp/fem/field/virtual.py +62 -23
- warp/fem/geometry/adaptive_nanogrid.py +9 -10
- warp/fem/geometry/closest_point.py +1 -1
- warp/fem/geometry/deformed_geometry.py +5 -2
- warp/fem/geometry/geometry.py +5 -0
- warp/fem/geometry/grid_2d.py +12 -12
- warp/fem/geometry/grid_3d.py +12 -15
- warp/fem/geometry/hexmesh.py +5 -7
- warp/fem/geometry/nanogrid.py +9 -11
- warp/fem/geometry/quadmesh.py +13 -13
- warp/fem/geometry/tetmesh.py +3 -4
- warp/fem/geometry/trimesh.py +3 -8
- warp/fem/integrate.py +262 -93
- warp/fem/linalg.py +5 -5
- warp/fem/quadrature/pic_quadrature.py +37 -22
- warp/fem/quadrature/quadrature.py +194 -25
- warp/fem/space/__init__.py +1 -1
- warp/fem/space/basis_function_space.py +4 -2
- warp/fem/space/basis_space.py +25 -18
- warp/fem/space/hexmesh_function_space.py +2 -2
- warp/fem/space/partition.py +6 -2
- warp/fem/space/quadmesh_function_space.py +8 -8
- warp/fem/space/shape/cube_shape_function.py +23 -23
- warp/fem/space/shape/square_shape_function.py +12 -12
- warp/fem/space/shape/triangle_shape_function.py +1 -1
- warp/fem/space/tetmesh_function_space.py +3 -3
- warp/fem/space/trimesh_function_space.py +2 -2
- warp/fem/utils.py +12 -6
- warp/jax.py +14 -1
- warp/jax_experimental/__init__.py +16 -0
- warp/{jax_experimental.py → jax_experimental/custom_call.py} +14 -27
- warp/jax_experimental/ffi.py +698 -0
- warp/jax_experimental/xla_ffi.py +602 -0
- warp/math.py +89 -0
- warp/native/array.h +13 -0
- warp/native/builtin.h +29 -3
- warp/native/bvh.cpp +3 -1
- warp/native/bvh.cu +42 -14
- warp/native/bvh.h +2 -1
- warp/native/clang/clang.cpp +30 -3
- warp/native/cuda_util.cpp +14 -0
- warp/native/cuda_util.h +2 -0
- warp/native/exports.h +68 -63
- warp/native/intersect.h +26 -26
- warp/native/intersect_adj.h +33 -33
- warp/native/marching.cu +1 -1
- warp/native/mat.h +513 -9
- warp/native/mesh.h +10 -10
- warp/native/quat.h +99 -11
- warp/native/rand.h +6 -0
- warp/native/sort.cpp +122 -59
- warp/native/sort.cu +152 -15
- warp/native/sort.h +8 -1
- warp/native/sparse.cpp +43 -22
- warp/native/sparse.cu +52 -17
- warp/native/svd.h +116 -0
- warp/native/tile.h +301 -105
- warp/native/tile_reduce.h +46 -3
- warp/native/vec.h +68 -7
- warp/native/volume.cpp +85 -113
- warp/native/volume_builder.cu +25 -10
- warp/native/volume_builder.h +6 -0
- warp/native/warp.cpp +5 -6
- warp/native/warp.cu +99 -10
- warp/native/warp.h +19 -10
- warp/optim/linear.py +10 -10
- warp/sim/articulation.py +4 -4
- warp/sim/collide.py +21 -10
- warp/sim/import_mjcf.py +449 -155
- warp/sim/import_urdf.py +32 -12
- warp/sim/integrator_euler.py +5 -5
- warp/sim/integrator_featherstone.py +3 -10
- warp/sim/integrator_vbd.py +207 -2
- warp/sim/integrator_xpbd.py +5 -5
- warp/sim/model.py +42 -13
- warp/sim/utils.py +2 -2
- warp/sparse.py +642 -555
- warp/stubs.py +216 -19
- warp/tests/__main__.py +0 -15
- warp/tests/cuda/__init__.py +0 -0
- warp/tests/{test_mempool.py → cuda/test_mempool.py} +39 -0
- warp/tests/{test_streams.py → cuda/test_streams.py} +71 -0
- warp/tests/geometry/__init__.py +0 -0
- warp/tests/{test_mesh_query_point.py → geometry/test_mesh_query_point.py} +66 -63
- warp/tests/{test_mesh_query_ray.py → geometry/test_mesh_query_ray.py} +1 -1
- warp/tests/{test_volume.py → geometry/test_volume.py} +41 -6
- warp/tests/interop/__init__.py +0 -0
- warp/tests/{test_dlpack.py → interop/test_dlpack.py} +28 -5
- warp/tests/sim/__init__.py +0 -0
- warp/tests/{disabled_kinematics.py → sim/disabled_kinematics.py} +9 -10
- warp/tests/{test_collision.py → sim/test_collision.py} +2 -2
- warp/tests/{test_model.py → sim/test_model.py} +40 -0
- warp/tests/{test_sim_kinematics.py → sim/test_sim_kinematics.py} +2 -1
- warp/tests/sim/test_vbd.py +597 -0
- warp/tests/test_bool.py +1 -1
- warp/tests/test_examples.py +28 -36
- warp/tests/test_fem.py +23 -4
- warp/tests/test_linear_solvers.py +0 -11
- warp/tests/test_mat.py +233 -79
- warp/tests/test_mat_scalar_ops.py +4 -4
- warp/tests/test_overwrite.py +0 -60
- warp/tests/test_quat.py +67 -46
- warp/tests/test_rand.py +44 -37
- warp/tests/test_sparse.py +47 -6
- warp/tests/test_spatial.py +75 -0
- warp/tests/test_static.py +1 -1
- warp/tests/test_utils.py +84 -4
- warp/tests/test_vec.py +46 -34
- warp/tests/tile/__init__.py +0 -0
- warp/tests/{test_tile.py → tile/test_tile.py} +136 -51
- warp/tests/{test_tile_load.py → tile/test_tile_load.py} +1 -1
- warp/tests/{test_tile_mathdx.py → tile/test_tile_mathdx.py} +9 -6
- warp/tests/{test_tile_mlp.py → tile/test_tile_mlp.py} +25 -14
- warp/tests/{test_tile_reduce.py → tile/test_tile_reduce.py} +60 -1
- warp/tests/{test_tile_view.py → tile/test_tile_view.py} +1 -1
- warp/tests/unittest_serial.py +1 -0
- warp/tests/unittest_suites.py +45 -59
- warp/tests/unittest_utils.py +2 -1
- warp/thirdparty/unittest_parallel.py +3 -1
- warp/types.py +110 -658
- warp/utils.py +137 -72
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/METADATA +29 -7
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/RECORD +172 -162
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/WHEEL +1 -1
- warp/examples/optim/example_walker.py +0 -317
- warp/native/cutlass_gemm.cpp +0 -43
- warp/native/cutlass_gemm.cu +0 -382
- warp/tests/test_matmul.py +0 -511
- warp/tests/test_matmul_lite.py +0 -411
- warp/tests/test_vbd.py +0 -386
- warp/tests/unused_test_misc.py +0 -77
- /warp/tests/{test_async.py → cuda/test_async.py} +0 -0
- /warp/tests/{test_ipc.py → cuda/test_ipc.py} +0 -0
- /warp/tests/{test_multigpu.py → cuda/test_multigpu.py} +0 -0
- /warp/tests/{test_peer.py → cuda/test_peer.py} +0 -0
- /warp/tests/{test_pinned.py → cuda/test_pinned.py} +0 -0
- /warp/tests/{test_bvh.py → geometry/test_bvh.py} +0 -0
- /warp/tests/{test_hash_grid.py → geometry/test_hash_grid.py} +0 -0
- /warp/tests/{test_marching_cubes.py → geometry/test_marching_cubes.py} +0 -0
- /warp/tests/{test_mesh.py → geometry/test_mesh.py} +0 -0
- /warp/tests/{test_mesh_query_aabb.py → geometry/test_mesh_query_aabb.py} +0 -0
- /warp/tests/{test_volume_write.py → geometry/test_volume_write.py} +0 -0
- /warp/tests/{test_jax.py → interop/test_jax.py} +0 -0
- /warp/tests/{test_paddle.py → interop/test_paddle.py} +0 -0
- /warp/tests/{test_torch.py → interop/test_torch.py} +0 -0
- /warp/tests/{flaky_test_sim_grad.py → sim/flaky_test_sim_grad.py} +0 -0
- /warp/tests/{test_coloring.py → sim/test_coloring.py} +0 -0
- /warp/tests/{test_sim_grad_bounce_linear.py → sim/test_sim_grad_bounce_linear.py} +0 -0
- /warp/tests/{test_tile_shared_memory.py → tile/test_tile_shared_memory.py} +0 -0
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info/licenses}/LICENSE.md +0 -0
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/top_level.txt +0 -0
warp/jax_experimental/ffi.py (new file)

@@ -0,0 +1,698 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ctypes
+import traceback
+from typing import Callable
+
+import jax
+
+import warp as wp
+from warp.codegen import get_full_arg_spec, make_full_qualified_name
+from warp.jax import get_jax_device
+from warp.types import array_t, launch_bounds_t, strides_from_shape, type_to_warp
+
+from .xla_ffi import *
+
+
+def jax_kernel(kernel, num_outputs=1, vmap_method="broadcast_all", launch_dims=None, output_dims=None):
+    """Create a JAX callback from a Warp kernel.
+
+    NOTE: This is an experimental feature under development.
+
+    Args:
+        kernel: The Warp kernel to launch.
+        num_outputs: Optional. Specify the number of output arguments if greater than 1.
+        vmap_method: Optional. String specifying how the callback transforms under ``vmap()``.
+            This argument can also be specified for individual calls.
+        launch_dims: Optional. Specify the default kernel launch dimensions. If None, launch
+            dimensions are inferred from the shape of the first array argument.
+            This argument can also be specified for individual calls.
+        output_dims: Optional. Specify the default dimensions of output arrays. If None, output
+            dimensions are inferred from the launch dimensions.
+            This argument can also be specified for individual calls.
+
+    Limitations:
+        - All kernel arguments must be contiguous arrays or scalars.
+        - Scalars must be static arguments in JAX.
+        - Input arguments are followed by output arguments in the Warp kernel definition.
+        - There must be at least one output argument.
+        - Only the CUDA backend is supported.
+    """
+
+    return FfiKernel(kernel, num_outputs, vmap_method, launch_dims, output_dims)
+
+
+def jax_callable(
+    func: Callable,
+    num_outputs: int = 1,
+    graph_compatible: bool = True,
+    vmap_method: str = "broadcast_all",
+    output_dims=None,
+):
+    """Create a JAX callback from an annotated Python function.
+
+    The Python function arguments must have type annotations like Warp kernels.
+
+    NOTE: This is an experimental feature under development.
+
+    Args:
+        func: The Python function to call.
+        num_outputs: Optional. Specify the number of output arguments if greater than 1.
+        graph_compatible: Optional. Whether the function can be called during CUDA graph capture.
+        vmap_method: Optional. String specifying how the callback transforms under ``vmap()``.
+            This argument can also be specified for individual calls.
+        output_dims: Optional. Specify the default dimensions of output arrays.
+            If ``None``, output dimensions are inferred from the launch dimensions.
+            This argument can also be specified for individual calls.
+
+    Limitations:
+        - All kernel arguments must be contiguous arrays or scalars.
+        - Scalars must be static arguments in JAX.
+        - Input arguments are followed by output arguments in the Warp kernel definition.
+        - There must be at least one output argument.
+        - Only the CUDA backend is supported.
+    """
+
+    return FfiCallable(func, num_outputs, graph_compatible, vmap_method, output_dims)
+
+
+class FfiArg:
+    def __init__(self, name, type):
+        self.name = name
+        self.type = type
+        self.is_array = isinstance(type, wp.array)
+
+        if self.is_array:
+            if hasattr(type.dtype, "_wp_scalar_type_"):
+                self.dtype_shape = type.dtype._shape_
+                self.dtype_ndim = len(self.dtype_shape)
+                self.jax_scalar_type = wp.dtype_to_jax(type.dtype._wp_scalar_type_)
+                self.jax_ndim = type.ndim + self.dtype_ndim
+            elif type.dtype in wp.types.value_types:
+                self.dtype_ndim = 0
+                self.dtype_shape = ()
+                self.jax_scalar_type = wp.dtype_to_jax(type.dtype)
+                self.jax_ndim = type.ndim
+            else:
+                raise TypeError(f"Invalid data type for array argument '{name}', expected scalar, vector, or matrix")
+            self.warp_ndim = type.ndim
+        elif type in wp.types.value_types:
+            self.dtype_ndim = 0
+            self.dtype_shape = ()
+            self.jax_scalar_type = wp.dtype_to_jax(type_to_warp(type))
+            self.jax_ndim = 0
+            self.warp_ndim = 0
+        else:
+            raise TypeError(f"Invalid type for argument '{name}', expected array or scalar, got {type}")
+
+
+class FfiLaunchDesc:
+    def __init__(self, static_inputs, launch_dims):
+        self.static_inputs = static_inputs
+        self.launch_dims = launch_dims
+
+
+class FfiKernel:
+    def __init__(self, kernel, num_outputs, vmap_method, launch_dims, output_dims):
+        self.kernel = kernel
+        self.name = generate_unique_name(kernel.func)
+        self.num_outputs = num_outputs
+        self.vmap_method = vmap_method
+        self.launch_dims = launch_dims
+        self.output_dims = output_dims
+        self.first_array_arg = None
+        self.launch_id = 0
+        self.launch_descriptors = {}
+
+        self.num_kernel_args = len(kernel.adj.args)
+        self.num_inputs = self.num_kernel_args - num_outputs
+        if self.num_outputs < 1:
+            raise ValueError("At least one output is required")
+        if self.num_outputs > self.num_kernel_args:
+            raise ValueError("Number of outputs cannot be greater than the number of kernel arguments")
+
+        # process input args
+        self.input_args = []
+        for i in range(self.num_inputs):
+            arg = FfiArg(kernel.adj.args[i].label, kernel.adj.args[i].type)
+            if arg.is_array:
+                # keep track of the first input array argument
+                if self.first_array_arg is None:
+                    self.first_array_arg = i
+            self.input_args.append(arg)
+
+        # process output args
+        self.output_args = []
+        for i in range(self.num_inputs, self.num_kernel_args):
+            arg = FfiArg(kernel.adj.args[i].label, kernel.adj.args[i].type)
+            if not arg.is_array:
+                raise TypeError("All output arguments must be arrays")
+            self.output_args.append(arg)
+
+        # register the callback
+        FFI_CCALLFUNC = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.POINTER(XLA_FFI_CallFrame))
+        self.callback_func = FFI_CCALLFUNC(lambda call_frame: self.ffi_callback(call_frame))
+        ffi_ccall_address = ctypes.cast(self.callback_func, ctypes.c_void_p)
+        ffi_capsule = jax.ffi.pycapsule(ffi_ccall_address.value)
+        jax.ffi.register_ffi_target(self.name, ffi_capsule, platform="CUDA")
+
+    def __call__(self, *args, output_dims=None, launch_dims=None, vmap_method=None):
+        num_inputs = len(args)
+        if num_inputs != self.num_inputs:
+            raise ValueError(f"Expected {self.num_inputs} inputs, but got {num_inputs}")
+
+        # default argument fallback
+        if launch_dims is None:
+            launch_dims = self.launch_dims
+        if output_dims is None:
+            output_dims = self.output_dims
+        if vmap_method is None:
+            vmap_method = self.vmap_method
+
+        # process inputs
+        static_inputs = {}
+        for i in range(num_inputs):
+            input_arg = self.input_args[i]
+            input_value = args[i]
+            if input_arg.is_array:
+                # check dtype
+                if input_value.dtype != input_arg.jax_scalar_type:
+                    raise TypeError(
+                        f"Invalid data type for array argument '{input_arg.name}', expected {input_arg.jax_scalar_type}, got {input_value.dtype}"
+                    )
+                # check ndim
+                if input_value.ndim != input_arg.jax_ndim:
+                    raise TypeError(
+                        f"Invalid dimensionality for array argument '{input_arg.name}', expected {input_arg.jax_ndim} dimensions, got {input_value.ndim}"
+                    )
+                # check inner dims
+                for d in range(input_arg.dtype_ndim):
+                    if input_value.shape[input_arg.type.ndim + d] != input_arg.dtype_shape[d]:
+                        raise TypeError(
+                            f"Invalid inner dimensions for array argument '{input_arg.name}', expected {input_arg.dtype_shape}, got {input_value.shape[-input_arg.dtype_ndim :]}"
+                        )
+            else:
+                # make sure scalar is not a traced variable, should be static
+                if isinstance(input_value, jax.core.Tracer):
+                    raise ValueError(f"Argument '{input_arg.name}' must be a static value")
+                # stash the value to be retrieved by callback
+                static_inputs[input_arg.name] = input_arg.type(input_value)
+
+        # launch dimensions
+        if launch_dims is None:
+            # use the shape of the first input array
+            if self.first_array_arg is not None:
+                launch_dims = get_warp_shape(self.input_args[self.first_array_arg], args[self.first_array_arg].shape)
+            else:
+                raise RuntimeError("Failed to determine launch dimensions")
+        elif isinstance(launch_dims, int):
+            launch_dims = (launch_dims,)
+        else:
+            launch_dims = tuple(launch_dims)
+
+        # output types
+        out_types = []
+        if isinstance(output_dims, dict):
+            # assume a dictionary of shapes keyed on argument name
+            for output_arg in self.output_args:
+                dims = output_dims.get(output_arg.name)
+                if dims is None:
+                    raise ValueError(f"Missing output dimensions for argument '{output_arg.name}'")
+                out_types.append(get_jax_output_type(output_arg, dims))
+        else:
+            if output_dims is None:
+                # use launch dimensions
+                output_dims = launch_dims
+            elif isinstance(output_dims, int):
+                output_dims = (output_dims,)
+            # assume same dimensions for all outputs
+            for output_arg in self.output_args:
+                out_types.append(get_jax_output_type(output_arg, output_dims))
+
+        call = jax.ffi.ffi_call(
+            self.name,
+            out_types,
+            vmap_method=vmap_method,
+        )
+
+        # ensure the kernel module is loaded before the callback, otherwise graph capture may fail
+        device = wp.device_from_jax(get_jax_device())
+        self.kernel.module.load(device)
+
+        # save launch data to be retrieved by callback
+        launch_id = self.launch_id
+        self.launch_descriptors[launch_id] = FfiLaunchDesc(static_inputs, launch_dims)
+        self.launch_id += 1
+
+        return call(*args, launch_id=launch_id)
+
+    def ffi_callback(self, call_frame):
+        try:
+            # On the first call, XLA runtime will query the API version and traits
+            # metadata using the |extension| field. Let us respond to that query
+            # if the metadata extension is present.
+            extension = call_frame.contents.extension_start
+            if extension:
+                # Try to set the version metadata.
+                if extension.contents.type == XLA_FFI_Extension_Type.Metadata:
+                    metadata_ext = ctypes.cast(extension, ctypes.POINTER(XLA_FFI_Metadata_Extension))
+                    metadata_ext.contents.metadata.contents.api_version.major_version = 0
+                    metadata_ext.contents.metadata.contents.api_version.minor_version = 1
+                    # Turn on CUDA graphs for this handler.
+                    metadata_ext.contents.metadata.contents.traits = (
+                        XLA_FFI_Handler_TraitsBits.COMMAND_BUFFER_COMPATIBLE
+                    )
+                return None
+
+            # retrieve call info
+            attrs = decode_attrs(call_frame.contents.attrs)
+            launch_id = int(attrs["launch_id"])
+            launch_desc = self.launch_descriptors[launch_id]
+
+            num_inputs = call_frame.contents.args.size
+            inputs = ctypes.cast(call_frame.contents.args.args, ctypes.POINTER(ctypes.POINTER(XLA_FFI_Buffer)))
+
+            num_outputs = call_frame.contents.rets.size
+            outputs = ctypes.cast(call_frame.contents.rets.rets, ctypes.POINTER(ctypes.POINTER(XLA_FFI_Buffer)))
+
+            assert num_inputs == self.num_inputs
+            assert num_outputs == self.num_outputs
+
+            launch_bounds = launch_bounds_t(launch_desc.launch_dims)
+
+            # first kernel param is the launch bounds
+            kernel_params = (ctypes.c_void_p * (1 + self.num_kernel_args))()
+            kernel_params[0] = ctypes.addressof(launch_bounds)
+
+            arg_refs = []
+
+            # inputs
+            for i in range(num_inputs):
+                input_arg = self.input_args[i]
+                if input_arg.is_array:
+                    buffer = inputs[i].contents
+                    shape = buffer.dims[: input_arg.type.ndim]
+                    strides = strides_from_shape(shape, input_arg.type.dtype)
+                    arg = array_t(buffer.data, 0, input_arg.type.ndim, shape, strides)
+                    kernel_params[i + 1] = ctypes.addressof(arg)
+                    arg_refs.append(arg)  # keep a reference
+                else:
+                    # scalar argument, get stashed value
+                    value = launch_desc.static_inputs[input_arg.name]
+                    arg = input_arg.type._type_(value)
+                    kernel_params[i + 1] = ctypes.addressof(arg)
+                    arg_refs.append(arg)  # keep a reference
+
+            # outputs
+            for i in range(num_outputs):
+                output_arg = self.output_args[i]
+                buffer = outputs[i].contents
+                shape = buffer.dims[: output_arg.type.ndim]
+                strides = strides_from_shape(shape, output_arg.type.dtype)
+                arg = array_t(buffer.data, 0, output_arg.type.ndim, shape, strides)
+                kernel_params[num_inputs + i + 1] = ctypes.addressof(arg)
+                arg_refs.append(arg)  # keep a reference
+
+            # get device and stream
+            device = wp.device_from_jax(get_jax_device())
+            stream = get_stream_from_callframe(call_frame.contents)
+
+            # get kernel hooks
+            hooks = self.kernel.module.get_kernel_hooks(self.kernel, device)
+            assert hooks.forward, "Failed to find kernel entry point"
+
+            # launch the kernel
+            wp.context.runtime.core.cuda_launch_kernel(
+                device.context,
+                hooks.forward,
+                launch_bounds.size,
+                0,
+                256,
+                hooks.forward_smem_bytes,
+                kernel_params,
+                stream,
+            )
+
+        except Exception as e:
+            print(traceback.format_exc())
+            return create_ffi_error(
+                call_frame.contents.api, XLA_FFI_Error_Code.UNKNOWN, f"FFI callback error: {type(e).__name__}: {e}"
+            )
+
+
+class FfiCallDesc:
+    def __init__(self, static_inputs):
+        self.static_inputs = static_inputs
+
+
+class FfiCallable:
+    def __init__(self, func, num_outputs, graph_compatible, vmap_method, output_dims):
+        self.func = func
+        self.name = generate_unique_name(func)
+        self.num_outputs = num_outputs
+        self.vmap_method = vmap_method
+        self.graph_compatible = graph_compatible
+        self.output_dims = output_dims
+        self.first_array_arg = None
+        self.has_static_args = False
+        self.call_id = 0
+        self.call_descriptors = {}
+
+        # get arguments and annotations
+        argspec = get_full_arg_spec(func)
+
+        num_args = len(argspec.args)
+        self.num_inputs = num_args - num_outputs
+        if self.num_outputs < 1:
+            raise ValueError("At least one output is required")
+        if self.num_outputs > num_args:
+            raise ValueError("Number of outputs cannot be greater than the number of kernel arguments")
+
+        if len(argspec.annotations) < num_args:
+            raise RuntimeError(f"Incomplete argument annotations on function {self.name}")
+
+        # parse type annotations
+        self.args = []
+        arg_idx = 0
+        for arg_name, arg_type in argspec.annotations.items():
+            if arg_name == "return":
+                if arg_type is not None:
+                    raise TypeError("Function must not return a value")
+            else:
+                arg = FfiArg(arg_name, arg_type)
+                if arg.is_array:
+                    if arg_idx < self.num_inputs and self.first_array_arg is None:
+                        self.first_array_arg = arg_idx
+                else:
+                    self.has_static_args = True
+                self.args.append(arg)
+                arg_idx += 1
+
+        self.input_args = self.args[: self.num_inputs]
+        self.output_args = self.args[self.num_inputs :]
+
+        # register the callback
+        FFI_CCALLFUNC = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.POINTER(XLA_FFI_CallFrame))
+        self.callback_func = FFI_CCALLFUNC(lambda call_frame: self.ffi_callback(call_frame))
+        ffi_ccall_address = ctypes.cast(self.callback_func, ctypes.c_void_p)
+        ffi_capsule = jax.ffi.pycapsule(ffi_ccall_address.value)
+        jax.ffi.register_ffi_target(self.name, ffi_capsule, platform="CUDA")

    def __call__(self, *args, output_dims=None, vmap_method=None):
        (continued below)
+    def __call__(self, *args, output_dims=None, vmap_method=None):
+        num_inputs = len(args)
+        if num_inputs != self.num_inputs:
+            raise ValueError(f"Expected {self.num_inputs} inputs, but got {num_inputs}")
+
+        # default argument fallback
+        if vmap_method is None:
+            vmap_method = self.vmap_method
+        if output_dims is None:
+            output_dims = self.output_dims
+
+        # process inputs
+        static_inputs = {}
+        for i in range(num_inputs):
+            input_arg = self.input_args[i]
+            input_value = args[i]
+            if input_arg.is_array:
+                # check dtype
+                if input_value.dtype != input_arg.jax_scalar_type:
+                    raise TypeError(
+                        f"Invalid data type for array argument '{input_arg.name}', expected {input_arg.jax_scalar_type}, got {input_value.dtype}"
+                    )
+                # check ndim
+                if input_value.ndim != input_arg.jax_ndim:
+                    raise TypeError(
+                        f"Invalid dimensionality for array argument '{input_arg.name}', expected {input_arg.jax_ndim} dimensions, got {input_value.ndim}"
+                    )
+                # check inner dims
+                for d in range(input_arg.dtype_ndim):
+                    if input_value.shape[input_arg.type.ndim + d] != input_arg.dtype_shape[d]:
+                        raise TypeError(
+                            f"Invalid inner dimensions for array argument '{input_arg.name}', expected {input_arg.dtype_shape}, got {input_value.shape[-input_arg.dtype_ndim :]}"
+                        )
+            else:
+                # make sure scalar is not a traced variable, should be static
+                if isinstance(input_value, jax.core.Tracer):
+                    raise ValueError(f"Argument '{input_arg.name}' must be a static value")
+                # stash the value to be retrieved by callback
+                static_inputs[input_arg.name] = input_arg.type(input_value)
+
+        if output_dims is None and self.first_array_arg is not None:
+            # use the shape of the first input array
+            output_dims = get_warp_shape(self.input_args[self.first_array_arg], args[self.first_array_arg].shape)
+
+        # output types
+        out_types = []
+        if isinstance(output_dims, dict):
+            # assume a dictionary of shapes keyed on argument name
+            for output_arg in self.output_args:
+                dims = output_dims.get(output_arg.name)
+                if dims is None:
+                    raise ValueError(f"Missing output dimensions for argument '{output_arg.name}'")
+                out_types.append(get_jax_output_type(output_arg, dims))
+        else:
+            if output_dims is None:
+                raise ValueError("Unable to determine output dimensions")
+            elif isinstance(output_dims, int):
+                output_dims = (output_dims,)
+            # assume same dimensions for all outputs
+            for output_arg in self.output_args:
+                out_types.append(get_jax_output_type(output_arg, output_dims))
+
+        call = jax.ffi.ffi_call(
+            self.name,
+            out_types,
+            vmap_method=vmap_method,
+            # has_side_effect=True,  # force this function to execute even if outputs aren't used
+        )
+
+        # load the module
+        # NOTE: if the target function uses kernels from different modules, they will not be loaded here
+        device = wp.device_from_jax(get_jax_device())
+        module = wp.get_module(self.func.__module__)
+        module.load(device)
+
+        if self.has_static_args:
+            # save call data to be retrieved by callback
+            call_id = self.call_id
+            self.call_descriptors[call_id] = FfiCallDesc(static_inputs)
+            self.call_id += 1
+            return call(*args, call_id=call_id)
+        else:
+            return call(*args)
+
+    def ffi_callback(self, call_frame):
+        try:
+            # TODO Try-catch around the body and return XLA_FFI_Error on error.
+            extension = call_frame.contents.extension_start
+            # On the first call, XLA runtime will query the API version and traits
+            # metadata using the |extension| field. Let us respond to that query
+            # if the metadata extension is present.
+            if extension:
+                # Try to set the version metadata.
+                if extension.contents.type == XLA_FFI_Extension_Type.Metadata:
+                    metadata_ext = ctypes.cast(extension, ctypes.POINTER(XLA_FFI_Metadata_Extension))
+                    metadata_ext.contents.metadata.contents.api_version.major_version = 0
+                    metadata_ext.contents.metadata.contents.api_version.minor_version = 1
+                    # Turn on CUDA graphs for this handler.
+                    if self.graph_compatible:
+                        metadata_ext.contents.metadata.contents.traits = (
+                            XLA_FFI_Handler_TraitsBits.COMMAND_BUFFER_COMPATIBLE
+                        )
+                return None
+
+            if self.has_static_args:
+                # retrieve call info
+                attrs = decode_attrs(call_frame.contents.attrs)
+                call_id = int(attrs["call_id"])
+                call_desc = self.call_descriptors[call_id]
+
+            num_inputs = call_frame.contents.args.size
+            inputs = ctypes.cast(call_frame.contents.args.args, ctypes.POINTER(ctypes.POINTER(XLA_FFI_Buffer)))
+
+            num_outputs = call_frame.contents.rets.size
+            outputs = ctypes.cast(call_frame.contents.rets.rets, ctypes.POINTER(ctypes.POINTER(XLA_FFI_Buffer)))
+
+            assert num_inputs == self.num_inputs
+            assert num_outputs == self.num_outputs
+
+            device = wp.device_from_jax(get_jax_device())
+            cuda_stream = get_stream_from_callframe(call_frame.contents)
+            stream = wp.Stream(device, cuda_stream=cuda_stream)
+
+            # reconstruct the argument list
+            arg_list = []
+
+            # inputs
+            for i in range(num_inputs):
+                arg = self.input_args[i]
+                if arg.is_array:
+                    buffer = inputs[i].contents
+                    shape = buffer.dims[: buffer.rank - arg.dtype_ndim]
+                    arr = wp.array(ptr=buffer.data, dtype=arg.type.dtype, shape=shape, device=device)
+                    arg_list.append(arr)
+                else:
+                    # scalar argument, get stashed value
+                    value = call_desc.static_inputs[arg.name]
+                    arg_list.append(value)
+
+            # outputs
+            for i in range(num_outputs):
+                arg = self.output_args[i]
+                buffer = outputs[i].contents
+                shape = buffer.dims[: buffer.rank - arg.dtype_ndim]
+                arr = wp.array(ptr=buffer.data, dtype=arg.type.dtype, shape=shape, device=device)
+                arg_list.append(arr)
+
+            # call the Python function with reconstructed arguments
+            with wp.ScopedStream(stream, sync_enter=False):
+                self.func(*arg_list)
+
+        except Exception as e:
+            print(traceback.format_exc())
+            return create_ffi_error(
+                call_frame.contents.api, XLA_FFI_Error_Code.UNKNOWN, f"FFI callback error: {type(e).__name__}: {e}"
+            )
+
+        return None
+
+
+###############################################################################
+#
+# Generic FFI callbacks for Python functions of the form
+# func(inputs, outputs, attrs, ctx)
+#
+###############################################################################
+
+# Holder for the custom callbacks to keep them alive.
+ffi_callbacks = {}
+
+
+def register_ffi_callback(name: str, func: Callable, graph_compatible: bool = True) -> None:
+    """Create a JAX callback from a Python function.
+
+    The Python function must have the form ``func(inputs, outputs, attrs, ctx)``.
+
+    NOTE: This is an experimental feature under development.
+
+    Args:
+        name: A unique FFI callback name.
+        func: The Python function to call.
+        graph_compatible: Optional. Whether the function can be called during CUDA graph capture.
+    """
+
+    # TODO check that the name is not already registered
+
+    def ffi_callback(call_frame):
+        try:
+            # TODO Try-catch around the body and return XLA_FFI_Error on error.
+            extension = call_frame.contents.extension_start
+            # On the first call, XLA runtime will query the API version and traits
+            # metadata using the |extension| field. Let us respond to that query
+            # if the metadata extension is present.
+            if extension:
+                # Try to set the version metadata.
+                if extension.contents.type == XLA_FFI_Extension_Type.Metadata:
+                    metadata_ext = ctypes.cast(extension, ctypes.POINTER(XLA_FFI_Metadata_Extension))
+                    metadata_ext.contents.metadata.contents.api_version.major_version = 0
+                    metadata_ext.contents.metadata.contents.api_version.minor_version = 1
+                    if graph_compatible:
+                        # Turn on CUDA graphs for this handler.
+                        metadata_ext.contents.metadata.contents.traits = (
+                            XLA_FFI_Handler_TraitsBits.COMMAND_BUFFER_COMPATIBLE
+                        )
+                return None
+
+            attrs = decode_attrs(call_frame.contents.attrs)
+
+            input_count = call_frame.contents.args.size
+            inputs = ctypes.cast(call_frame.contents.args.args, ctypes.POINTER(ctypes.POINTER(XLA_FFI_Buffer)))
+            inputs = [FfiBuffer(inputs[i].contents) for i in range(input_count)]
+
+            output_count = call_frame.contents.rets.size
+            outputs = ctypes.cast(call_frame.contents.rets.rets, ctypes.POINTER(ctypes.POINTER(XLA_FFI_Buffer)))
+            outputs = [FfiBuffer(outputs[i].contents) for i in range(output_count)]
+
+            ctx = ExecutionContext(call_frame.contents)
+
+            func(inputs, outputs, attrs, ctx)
+        except Exception as e:
+            print(traceback.format_exc())
+            return create_ffi_error(
+                call_frame.contents.api, XLA_FFI_Error_Code.UNKNOWN, f"FFI callback error: {type(e).__name__}: {e}"
+            )
+
+        return None
+
+    FFI_CCALLFUNC = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.POINTER(XLA_FFI_CallFrame))
+    callback_func = FFI_CCALLFUNC(ffi_callback)
+    ffi_callbacks[name] = callback_func
+    ffi_ccall_address = ctypes.cast(callback_func, ctypes.c_void_p)
+    ffi_capsule = jax.ffi.pycapsule(ffi_ccall_address.value)
+    jax.ffi.register_ffi_target(name, ffi_capsule, platform="CUDA")
+
+
+###############################################################################
+#
+# Utilities
+#
+###############################################################################
+
+# ensure unique FFI callback names
+ffi_name_counts = {}
+
+
+def generate_unique_name(func) -> str:
+    key = make_full_qualified_name(func)
+    unique_id = ffi_name_counts.get(key, 0)
+    ffi_name_counts[key] = unique_id + 1
+    return f"{key}_{unique_id}"
+
+
+def get_warp_shape(arg, dims):
+    if arg.dtype_ndim > 0:
+        # vector/matrix array
+        return dims[: arg.warp_ndim]
+    else:
+        # scalar array
+        return dims
+
+
+def get_jax_output_type(arg, dims):
+    if isinstance(dims, int):
+        dims = (dims,)
+
+    ndim = len(dims)
+
+    if arg.dtype_ndim > 0:
+        # vector/matrix array
+        if ndim == arg.warp_ndim:
+            return jax.ShapeDtypeStruct((*dims, *arg.dtype_shape), arg.jax_scalar_type)
+        elif ndim == arg.jax_ndim:
+            # make sure inner dimensions match
+            inner_dims = dims[-arg.dtype_ndim :]
+            for i in range(arg.dtype_ndim):
+                if inner_dims[i] != arg.dtype_shape[i]:
+                    raise ValueError(f"Invalid output dimensions for argument '{arg.name}': {dims}")
+            return jax.ShapeDtypeStruct(dims, arg.jax_scalar_type)
+        else:
+            raise ValueError(f"Invalid output dimensions for argument '{arg.name}': {dims}")
+    else:
+        # scalar array
+        if ndim != arg.warp_ndim:
+            raise ValueError(f"Invalid output dimensions for argument '{arg.name}': {dims}")
+        return jax.ShapeDtypeStruct(dims, arg.jax_scalar_type)