warp-lang 1.5.1-py3-none-manylinux2014_x86_64.whl → 1.6.0-py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of warp-lang might be problematic.

Files changed (123)
  1. warp/__init__.py +5 -0
  2. warp/autograd.py +414 -191
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +40 -12
  6. warp/build_dll.py +13 -6
  7. warp/builtins.py +1076 -480
  8. warp/codegen.py +240 -119
  9. warp/config.py +1 -1
  10. warp/context.py +298 -84
  11. warp/examples/assets/square_cloth.usd +0 -0
  12. warp/examples/benchmarks/benchmark_gemm.py +27 -18
  13. warp/examples/benchmarks/benchmark_interop_paddle.py +3 -3
  14. warp/examples/benchmarks/benchmark_interop_torch.py +3 -3
  15. warp/examples/core/example_torch.py +18 -34
  16. warp/examples/fem/example_apic_fluid.py +1 -0
  17. warp/examples/fem/example_mixed_elasticity.py +1 -1
  18. warp/examples/optim/example_bounce.py +1 -1
  19. warp/examples/optim/example_cloth_throw.py +1 -1
  20. warp/examples/optim/example_diffray.py +4 -15
  21. warp/examples/optim/example_drone.py +1 -1
  22. warp/examples/optim/example_softbody_properties.py +392 -0
  23. warp/examples/optim/example_trajectory.py +1 -3
  24. warp/examples/optim/example_walker.py +5 -0
  25. warp/examples/sim/example_cartpole.py +0 -2
  26. warp/examples/sim/example_cloth_self_contact.py +260 -0
  27. warp/examples/sim/example_granular_collision_sdf.py +4 -5
  28. warp/examples/sim/example_jacobian_ik.py +0 -2
  29. warp/examples/sim/example_quadruped.py +5 -2
  30. warp/examples/tile/example_tile_cholesky.py +79 -0
  31. warp/examples/tile/example_tile_convolution.py +2 -2
  32. warp/examples/tile/example_tile_fft.py +2 -2
  33. warp/examples/tile/example_tile_filtering.py +3 -3
  34. warp/examples/tile/example_tile_matmul.py +4 -4
  35. warp/examples/tile/example_tile_mlp.py +12 -12
  36. warp/examples/tile/example_tile_nbody.py +180 -0
  37. warp/examples/tile/example_tile_walker.py +319 -0
  38. warp/math.py +147 -0
  39. warp/native/array.h +12 -0
  40. warp/native/builtin.h +0 -1
  41. warp/native/bvh.cpp +149 -70
  42. warp/native/bvh.cu +287 -68
  43. warp/native/bvh.h +195 -85
  44. warp/native/clang/clang.cpp +5 -1
  45. warp/native/cuda_util.cpp +35 -0
  46. warp/native/cuda_util.h +5 -0
  47. warp/native/exports.h +40 -40
  48. warp/native/intersect.h +17 -0
  49. warp/native/mat.h +41 -0
  50. warp/native/mathdx.cpp +19 -0
  51. warp/native/mesh.cpp +25 -8
  52. warp/native/mesh.cu +153 -101
  53. warp/native/mesh.h +482 -403
  54. warp/native/quat.h +40 -0
  55. warp/native/solid_angle.h +7 -0
  56. warp/native/sort.cpp +85 -0
  57. warp/native/sort.cu +34 -0
  58. warp/native/sort.h +3 -1
  59. warp/native/spatial.h +11 -0
  60. warp/native/tile.h +1185 -664
  61. warp/native/tile_reduce.h +8 -6
  62. warp/native/vec.h +41 -0
  63. warp/native/warp.cpp +8 -1
  64. warp/native/warp.cu +263 -40
  65. warp/native/warp.h +19 -5
  66. warp/optim/linear.py +22 -4
  67. warp/render/render_opengl.py +124 -59
  68. warp/sim/__init__.py +6 -1
  69. warp/sim/collide.py +270 -26
  70. warp/sim/integrator_euler.py +25 -7
  71. warp/sim/integrator_featherstone.py +154 -35
  72. warp/sim/integrator_vbd.py +842 -40
  73. warp/sim/model.py +111 -53
  74. warp/stubs.py +248 -115
  75. warp/tape.py +28 -30
  76. warp/tests/aux_test_module_unload.py +15 -0
  77. warp/tests/{test_sim_grad.py → flaky_test_sim_grad.py} +104 -63
  78. warp/tests/test_array.py +74 -0
  79. warp/tests/test_assert.py +242 -0
  80. warp/tests/test_codegen.py +14 -61
  81. warp/tests/test_collision.py +2 -2
  82. warp/tests/test_examples.py +9 -0
  83. warp/tests/test_grad_debug.py +87 -2
  84. warp/tests/test_hash_grid.py +1 -1
  85. warp/tests/test_ipc.py +116 -0
  86. warp/tests/test_mat.py +138 -167
  87. warp/tests/test_math.py +47 -1
  88. warp/tests/test_matmul.py +11 -7
  89. warp/tests/test_matmul_lite.py +4 -4
  90. warp/tests/test_mesh.py +84 -60
  91. warp/tests/test_mesh_query_aabb.py +165 -0
  92. warp/tests/test_mesh_query_point.py +328 -286
  93. warp/tests/test_mesh_query_ray.py +134 -121
  94. warp/tests/test_mlp.py +2 -2
  95. warp/tests/test_operators.py +43 -0
  96. warp/tests/test_overwrite.py +2 -2
  97. warp/tests/test_quat.py +77 -0
  98. warp/tests/test_reload.py +29 -0
  99. warp/tests/test_sim_grad_bounce_linear.py +204 -0
  100. warp/tests/test_static.py +16 -0
  101. warp/tests/test_tape.py +25 -0
  102. warp/tests/test_tile.py +134 -191
  103. warp/tests/test_tile_load.py +356 -0
  104. warp/tests/test_tile_mathdx.py +61 -8
  105. warp/tests/test_tile_mlp.py +17 -17
  106. warp/tests/test_tile_reduce.py +24 -18
  107. warp/tests/test_tile_shared_memory.py +66 -17
  108. warp/tests/test_tile_view.py +165 -0
  109. warp/tests/test_torch.py +35 -0
  110. warp/tests/test_utils.py +36 -24
  111. warp/tests/test_vec.py +110 -0
  112. warp/tests/unittest_suites.py +29 -4
  113. warp/tests/unittest_utils.py +30 -11
  114. warp/thirdparty/unittest_parallel.py +2 -2
  115. warp/types.py +409 -99
  116. warp/utils.py +9 -5
  117. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/METADATA +68 -44
  118. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/RECORD +121 -110
  119. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/WHEEL +1 -1
  120. warp/examples/benchmarks/benchmark_tile.py +0 -179
  121. warp/native/tile_gemm.h +0 -341
  122. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/LICENSE.md +0 -0
  123. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/top_level.txt +0 -0
warp/autograd.py CHANGED
@@ -5,8 +5,9 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
+import inspect
 import itertools
-from typing import Any, Dict, List, Sequence, Tuple, Union
+from typing import Any, Callable, Dict, List, Sequence, Tuple, Union
 
 import numpy as np
 
@@ -22,23 +23,23 @@ __all__ = [
 
 
 def gradcheck(
-    function: wp.Kernel,
-    dim: Tuple[int],
-    inputs: Sequence,
-    outputs: Sequence,
+    function: Union[wp.Kernel, Callable],
+    dim: Tuple[int] = None,
+    inputs: Sequence = None,
+    outputs: Sequence = None,
     *,
-    eps=1e-4,
-    atol=1e-3,
-    rtol=1e-2,
-    raise_exception=True,
+    eps: float = 1e-4,
+    atol: float = 1e-3,
+    rtol: float = 1e-2,
+    raise_exception: bool = True,
     input_output_mask: List[Tuple[Union[str, int], Union[str, int]]] = None,
     device: wp.context.Devicelike = None,
-    max_blocks=0,
-    block_dim=256,
-    max_inputs_per_var=-1,
-    max_outputs_per_var=-1,
-    plot_relative_error=False,
-    plot_absolute_error=False,
+    max_blocks: int = 0,
+    block_dim: int = 256,
+    max_inputs_per_var: int = -1,
+    max_outputs_per_var: int = -1,
+    plot_relative_error: bool = False,
+    plot_absolute_error: bool = False,
     show_summary: bool = True,
 ) -> bool:
     """
@@ -56,10 +57,10 @@ def gradcheck(
     Structs arguments are not yet supported by this function to compute Jacobians.
 
     Args:
-        function: The Warp kernel function, decorated with the ``@wp.kernel`` decorator.
-        dim: The number of threads to launch the kernel, can be an integer, or a Tuple of ints.
+        function: The Warp kernel function, decorated with the ``@wp.kernel`` decorator, or any function that involves Warp kernel launches.
+        dim: The number of threads to launch the kernel, can be an integer, or a Tuple of ints. Only required if the function is a Warp kernel.
         inputs: List of input variables.
-        outputs: List of output variables.
+        outputs: List of output variables. Only required if the function is a Warp kernel.
         eps: The finite-difference step size.
         atol: The absolute tolerance for the gradient check.
         rtol: The relative tolerance for the gradient check.
@@ -78,9 +79,12 @@ def gradcheck(
         True if the gradient check passes, False otherwise.
     """
 
-    assert isinstance(function, wp.Kernel), "The function argument must be a Warp kernel"
+    if inputs is None:
+        raise ValueError("The inputs argument must be provided")
 
-    jacs_fd = jacobian_fd(
+    metadata = FunctionMetadata()
+
+    jacs_ad = jacobian(
         function,
         dim=dim,
         inputs=inputs,
@@ -89,12 +93,11 @@ def gradcheck(
         device=device,
         max_blocks=max_blocks,
         block_dim=block_dim,
-        max_inputs_per_var=max_inputs_per_var,
-        eps=eps,
+        max_outputs_per_var=max_outputs_per_var,
         plot_jacobians=False,
+        metadata=metadata,
     )
-
-    jacs_ad = jacobian(
+    jacs_fd = jacobian_fd(
         function,
         dim=dim,
         inputs=inputs,
@@ -103,8 +106,10 @@ def gradcheck(
         device=device,
         max_blocks=max_blocks,
         block_dim=block_dim,
-        max_outputs_per_var=max_outputs_per_var,
+        max_inputs_per_var=max_inputs_per_var,
+        eps=eps,
         plot_jacobians=False,
+        metadata=metadata,
     )
 
     relative_error_jacs = {}
@@ -112,7 +117,7 @@ def gradcheck(
 
     if show_summary:
         summary = []
-        summary_header = ["Input", "Output", "Max Abs Error", "Max Rel Error", "Pass"]
+        summary_header = ["Input", "Output", "Max Abs Error", "AD at MAE", "FD at MAE", "Max Rel Error", "Pass"]
 
         class FontColors:
             OKGREEN = "\033[92m"
@@ -121,6 +126,8 @@
             ENDC = "\033[0m"
 
     success = True
+    any_grad_mismatch = False
+    any_grad_nan = False
     for (input_i, output_i), jac_fd in jacs_fd.items():
        jac_ad = jacs_ad[input_i, output_i]
        if plot_relative_error or plot_absolute_error:
@@ -147,28 +154,15 @@ def gradcheck(
             cut_jac_fd = cut_jac_fd[:, :max_inputs_per_var]
             cut_jac_ad = cut_jac_ad[:, :max_inputs_per_var]
         grad_matches = np.allclose(cut_jac_ad, cut_jac_fd, atol=atol, rtol=rtol)
+        any_grad_mismatch = any_grad_mismatch or not grad_matches
         success = success and grad_matches
-        if not grad_matches:
-            if raise_exception:
-                raise ValueError(
-                    f"Gradient check failed for kernel {function.key}, input {input_i}, output {output_i}: "
-                    f"finite difference and autodiff gradients do not match"
-                )
-            elif not show_summary:
-                return False
         isnan = np.any(np.isnan(cut_jac_ad))
+        any_grad_nan = any_grad_nan or isnan
         success = success and not isnan
-        if isnan:
-            if raise_exception:
-                raise ValueError(
-                    f"Gradient check failed for kernel {function.key}, input {input_i}, output {output_i}: "
-                    f"gradient contains NaN values"
-                )
-            elif not show_summary:
-                return False
 
         if show_summary:
             max_abs_error = np.abs(cut_jac_ad - cut_jac_fd).max()
+            arg_max_abs_error = np.unravel_index(np.argmax(np.abs(cut_jac_ad - cut_jac_fd)), cut_jac_ad.shape)
             max_rel_error = np.abs((cut_jac_ad - cut_jac_fd) / (cut_jac_fd + 1e-8)).max()
             if isnan:
                 pass_str = FontColors.FAIL + "NaN" + FontColors.ENDC
@@ -176,33 +170,55 @@ def gradcheck(
                 pass_str = FontColors.OKGREEN + "PASS" + FontColors.ENDC
             else:
                 pass_str = FontColors.FAIL + "FAIL" + FontColors.ENDC
-            input_name = function.adj.args[input_i].label
-            output_name = function.adj.args[len(inputs) + output_i].label
-            summary.append([input_name, output_name, f"{max_abs_error:.7e}", f"{max_rel_error:.7e}", pass_str])
+            input_name = metadata.input_labels[input_i]
+            output_name = metadata.output_labels[output_i]
+            summary.append(
+                [
+                    input_name,
+                    output_name,
+                    f"{max_abs_error:.3e} at {tuple(int(i) for i in arg_max_abs_error)}",
+                    f"{cut_jac_ad[arg_max_abs_error]:.3e}",
+                    f"{cut_jac_fd[arg_max_abs_error]:.3e}",
+                    f"{max_rel_error:.3e}",
+                    pass_str,
+                ]
+            )
 
     if show_summary:
         print_table(summary_header, summary)
         if not success:
-            print(FontColors.FAIL + f"Gradient check for kernel {function.key} failed" + FontColors.ENDC)
+            print(FontColors.FAIL + f"Gradient check for kernel {metadata.key} failed" + FontColors.ENDC)
         else:
-            print(FontColors.OKGREEN + f"Gradient check for kernel {function.key} passed" + FontColors.ENDC)
+            print(FontColors.OKGREEN + f"Gradient check for kernel {metadata.key} passed" + FontColors.ENDC)
     if plot_relative_error:
         jacobian_plot(
             relative_error_jacs,
-            function,
+            metadata,
             inputs,
             outputs,
-            title=f"{function.key} kernel Jacobian relative error",
+            title=f"{metadata.key} kernel Jacobian relative error",
         )
     if plot_absolute_error:
         jacobian_plot(
             absolute_error_jacs,
-            function,
+            metadata,
             inputs,
             outputs,
-            title=f"{function.key} kernel Jacobian absolute error",
+            title=f"{metadata.key} kernel Jacobian absolute error",
         )
 
+    if raise_exception:
+        if any_grad_mismatch:
+            raise ValueError(
+                f"Gradient check failed for kernel {metadata.key}, input {input_i}, output {output_i}: "
+                f"finite difference and autodiff gradients do not match"
+            )
+        if any_grad_nan:
+            raise ValueError(
+                f"Gradient check failed for kernel {metadata.key}, input {input_i}, output {output_i}: "
+                f"gradient contains NaN values"
+            )
+
     return success
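
The reworked `gradcheck` no longer asserts that `function` is a kernel: it accepts either a kernel (as before) or any Python function that launches Warp kernels internally, and it defers `raise_exception` until all input/output pairs have been checked. A minimal sketch of both call forms, assuming the 1.6.0 API shown in this diff (the kernel, function, and array names are illustrative, not part of the package):

    import warp as wp
    import warp.autograd

    @wp.kernel
    def square_kernel(x: wp.array(dtype=float), y: wp.array(dtype=float)):
        tid = wp.tid()
        y[tid] = x[tid] * x[tid]

    x = wp.array([1.0, 2.0, 3.0], dtype=float, requires_grad=True)
    y = wp.zeros(3, dtype=float, requires_grad=True)

    # Kernel form: dim and outputs are still required.
    wp.autograd.gradcheck(square_kernel, dim=3, inputs=[x], outputs=[y], raise_exception=False)

    # Function form (new in 1.6.0): outputs are taken from the return value.
    def run(x):
        out = wp.zeros_like(x)  # zeros_like inherits requires_grad from x
        wp.launch(square_kernel, dim=x.size, inputs=[x], outputs=[out])
        return out

    wp.autograd.gradcheck(run, inputs=[x], raise_exception=False)
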
 
 
@@ -221,6 +237,8 @@ def gradcheck_tape(
     plot_relative_error=False,
     plot_absolute_error=False,
     show_summary: bool = True,
+    reverse_launches: bool = False,
+    skip_to_launch_index: int = 0,
 ) -> bool:
     """
     Checks whether the autodiff gradients for kernels recorded on the Warp tape match finite differences.
@@ -247,6 +265,7 @@ def gradcheck_tape(
         plot_relative_error: If True, visualizes the relative error of the Jacobians in a plot (requires ``matplotlib``).
         plot_absolute_error: If True, visualizes the absolute error of the Jacobians in a plot (requires ``matplotlib``).
         show_summary: If True, prints a summary table of the gradient check results.
+        reverse_launches: If True, reverses the order of the kernel launches on the tape to check.
 
     Returns:
         True if the gradient check passes for all kernels on the tape, False otherwise.
@@ -263,7 +282,12 @@ def gradcheck_tape(
         whitelist_kernels = set(whitelist_kernels)
 
     overall_success = True
-    for launch in tape.launches:
+    launches = reversed(tape.launches) if reverse_launches else tape.launches
+    for i, launch in enumerate(launches):
+        if i < skip_to_launch_index:
+            continue
+        if not isinstance(launch, tuple) and not isinstance(launch, list):
+            continue
         if not isinstance(launch[0], wp.Kernel):
             continue
         kernel, dim, max_blocks, inputs, outputs, device, block_dim = launch[:7]
@@ -271,6 +295,9 @@ def gradcheck_tape(
             continue
         if kernel.key in blacklist_kernels:
             continue
+        if not kernel.options.get("enable_backward", True):
+            continue
+
         input_output_mask = input_output_masks.get(kernel.key)
         success = gradcheck(
             kernel,
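
`gradcheck_tape` gains `reverse_launches` and `skip_to_launch_index` to narrow which recorded launches get checked, and now skips non-launch tape entries and kernels compiled with `enable_backward=False`. A short sketch, reusing the illustrative kernel and arrays from the example above:

    tape = wp.Tape()
    with tape:
        wp.launch(square_kernel, dim=3, inputs=[x], outputs=[y])

    # Check the recorded launches in reverse order, starting at index 0.
    wp.autograd.gradcheck_tape(tape, reverse_launches=True, skip_to_launch_index=0)
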
@@ -312,11 +339,95 @@ def infer_device(xs: list):
     return wp.get_preferred_device()
 
 
+class FunctionMetadata:
+    """
+    Metadata holder for kernel functions or functions with Warp arrays as inputs/outputs.
+    """
+
+    def __init__(
+        self,
+        key: str = None,
+        input_labels: List[str] = None,
+        output_labels: List[str] = None,
+        input_strides: List[tuple] = None,
+        output_strides: List[tuple] = None,
+        input_dtypes: list = None,
+        output_dtypes: list = None,
+    ):
+        self.key = key
+        self.input_labels = input_labels
+        self.output_labels = output_labels
+        self.input_strides = input_strides
+        self.output_strides = output_strides
+        self.input_dtypes = input_dtypes
+        self.output_dtypes = output_dtypes
+
+    @property
+    def is_empty(self):
+        return self.key is None
+
+    def input_is_array(self, i: int):
+        return self.input_strides[i] is not None
+
+    def output_is_array(self, i: int):
+        return self.output_strides[i] is not None
+
+    def update_from_kernel(self, kernel: wp.Kernel, inputs: Sequence):
+        self.key = kernel.key
+        self.input_labels = [arg.label for arg in kernel.adj.args[: len(inputs)]]
+        self.output_labels = [arg.label for arg in kernel.adj.args[len(inputs) :]]
+        self.input_strides = []
+        self.output_strides = []
+        self.input_dtypes = []
+        self.output_dtypes = []
+        for arg in kernel.adj.args[: len(inputs)]:
+            if arg.type is wp.array:
+                self.input_strides.append(arg.type.strides)
+                self.input_dtypes.append(arg.type.dtype)
+            else:
+                self.input_strides.append(None)
+                self.input_dtypes.append(None)
+        for arg in kernel.adj.args[len(inputs) :]:
+            if arg.type is wp.array:
+                self.output_strides.append(arg.type.strides)
+                self.output_dtypes.append(arg.type.dtype)
+            else:
+                self.output_strides.append(None)
+                self.output_dtypes.append(None)
+
+    def update_from_function(self, function: Callable, inputs: Sequence, outputs: Sequence = None):
+        self.key = function.__name__
+        self.input_labels = list(inspect.signature(function).parameters.keys())
+        if outputs is None:
+            outputs = function(*inputs)
+            if isinstance(outputs, wp.array):
+                outputs = [outputs]
+        self.output_labels = [f"output_{i}" for i in range(len(outputs))]
+        self.input_strides = []
+        self.output_strides = []
+        self.input_dtypes = []
+        self.output_dtypes = []
+        for input in inputs:
+            if isinstance(input, wp.array):
+                self.input_strides.append(input.strides)
+                self.input_dtypes.append(input.dtype)
+            else:
+                self.input_strides.append(None)
+                self.input_dtypes.append(None)
+        for output in outputs:
+            if isinstance(output, wp.array):
+                self.output_strides.append(output.strides)
+                self.output_dtypes.append(output.dtype)
+            else:
+                self.output_strides.append(None)
+                self.output_dtypes.append(None)
+
+
 def jacobian_plot(
     jacobians: Dict[Tuple[int, int], wp.array],
-    kernel: wp.Kernel,
-    inputs: Sequence,
-    outputs: Sequence,
+    kernel: Union[FunctionMetadata, wp.Kernel],
+    inputs: Sequence = None,
+    outputs: Sequence = None,
     show_plot=True,
     show_colorbar=True,
     scale_colors_per_submatrix=False,
@@ -330,9 +441,9 @@ def jacobian_plot(
 
     Args:
         jacobians: A dictionary of Jacobians, where the keys are tuples of input and output indices, and the values are the Jacobian matrices.
-        kernel: The Warp kernel function, decorated with the ``@wp.kernel`` decorator.
+        kernel: The Warp kernel function, decorated with the ``@wp.kernel`` decorator, or a :class:`FunctionMetadata` instance with the kernel/function attributes.
         inputs: List of input variables.
-        outputs: List of output variables.
+        outputs: List of output variables. Deprecated and will be removed in a future Warp version.
         show_plot: If True, displays the plot via ``plt.show()``.
         show_colorbar: If True, displays a colorbar next to the plot (or a colorbar next to every submatrix if ).
         scale_colors_per_submatrix: If True, considers the minimum and maximum of each Jacobian submatrix separately for color scaling. Otherwise, uses the global minimum and maximum of all Jacobians.
@@ -343,19 +454,39 @@ def jacobian_plot(
     Returns:
         The created Matplotlib figure.
     """
+
     import matplotlib.pyplot as plt
-    from matplotlib.ticker import FuncFormatter, MaxNLocator, MultipleLocator
+    from matplotlib.ticker import MaxNLocator
+
+    if isinstance(kernel, wp.Kernel):
+        assert inputs is not None
+        metadata = FunctionMetadata()
+        metadata.update_from_kernel(kernel, inputs)
+    elif isinstance(kernel, FunctionMetadata):
+        metadata = kernel
+    else:
+        raise ValueError("Invalid kernel argument: must be a Warp kernel or a FunctionMetadata object")
+    if outputs is not None:
+        wp.utils.warn(
+            "The `outputs` argument to `jacobian_plot` is no longer needed and will be removed in a future Warp version.",
+            DeprecationWarning,
+            stacklevel=3,
+        )
 
     jacobians = sorted(jacobians.items(), key=lambda x: (x[0][1], x[0][0]))
     jacobians = dict(jacobians)
 
     input_to_ax = {}
     output_to_ax = {}
+    ax_to_input = {}
+    ax_to_output = {}
     for i, j in jacobians.keys():
         if i not in input_to_ax:
             input_to_ax[i] = len(input_to_ax)
+            ax_to_input[input_to_ax[i]] = i
         if j not in output_to_ax:
             output_to_ax[j] = len(output_to_ax)
+            ax_to_output[output_to_ax[j]] = j
 
     num_rows = len(output_to_ax)
     num_cols = len(input_to_ax)
@@ -366,19 +497,19 @@ def jacobian_plot(
     # dimensions of the Jacobians
     width_ratios = []
     height_ratios = []
-    for i, input in enumerate(inputs):
-        if not isinstance(input, wp.array) or not input.requires_grad:
+    for i in range(len(metadata.input_labels)):
+        if not metadata.input_is_array(i):
             continue
-        input_stride = input.dtype._length_
-        for j in range(len(outputs)):
+        input_stride = metadata.input_strides[i][0]
+        for j in range(len(metadata.output_labels)):
             if (i, j) not in jacobians:
                 continue
             jac_wp = jacobians[(i, j)]
             width_ratios.append(jac_wp.shape[1] * input_stride)
             break
 
-    for i, output in enumerate(outputs):
-        if not isinstance(output, wp.array) or not output.requires_grad:
+    for i in range(len(metadata.output_labels)):
+        if not metadata.output_is_array(i):
             continue
         for j in range(len(inputs)):
             if (j, i) not in jacobians:
@@ -403,7 +534,8 @@ def jacobian_plot(
         squeeze=False,
     )
     if title is None:
-        title = f"{kernel.key} kernel Jacobian"
+        key = kernel.key if isinstance(kernel, wp.Kernel) else kernel.get("key", "unknown")
+        title = f"{key} kernel Jacobian"
     fig.suptitle(title)
     fig.canvas.manager.set_window_title(title)
 
@@ -421,66 +553,31 @@ def jacobian_plot(
     has_plot = np.ones((num_rows, num_cols), dtype=bool)
     for i in range(num_rows):
         for j in range(num_cols):
-            if (j, i) not in jacobians:
+            if (ax_to_input[j], ax_to_output[i]) not in jacobians:
                 ax = axs[i, j]
                 ax.axis("off")
                 has_plot[i, j] = False
 
     jac_i = 0
     for (input_i, output_i), jac_wp in jacobians.items():
-        input = inputs[input_i]
-        output = outputs[output_i]
-        if not isinstance(input, wp.array) or not input.requires_grad:
-            continue
-        if not isinstance(output, wp.array) or not output.requires_grad:
-            continue
-
-        input_name = kernel.adj.args[input_i].label
-        output_name = kernel.adj.args[len(inputs) + output_i].label
+        input_name = metadata.input_labels[input_i]
+        output_name = metadata.output_labels[output_i]
 
         ax_i, ax_j = output_to_ax[output_i], input_to_ax[input_i]
         ax = axs[ax_i, ax_j]
         ax.tick_params(which="major", width=1, length=7)
         ax.tick_params(which="minor", width=1, length=4, color="gray")
-        # ax.yaxis.set_minor_formatter('{x:.0f}')
 
-        input_stride = input.dtype._length_
-        output_stride = output.dtype._length_
+        input_stride = metadata.input_dtypes[input_i]._length_
+        # output_stride = metadata.output_dtypes[output_i]._length_
 
         jac = jac_wp.numpy()
         # Jacobian matrix has output stride already multiplied to first dimension
         jac = jac.reshape(jac_wp.shape[0], jac_wp.shape[1] * input_stride)
-        ax.xaxis.set_minor_formatter("")
-        ax.yaxis.set_minor_formatter("")
-        ax.xaxis.set_minor_locator(MultipleLocator(1))
-        ax.yaxis.set_minor_locator(MultipleLocator(1))
-        # ax.set_xticks(np.arange(jac.shape[0]))
-        # stride = jac.shape[1] // jacobians[jac_i].shape[1]
-        # ax.xaxis.set_major_locator(MultipleLocator(input_stride))
-        if input_stride > 1:
-            ax.xaxis.set_major_locator(MaxNLocator(integer=True, nbins=1, steps=[input_stride]))
-            ticks = FuncFormatter(lambda x, pos, input_stride=input_stride: "{0:g}".format(x // input_stride))
-            ax.xaxis.set_major_formatter(ticks)
-        # ax.xaxis.set_major_locator(FixedLocator(np.arange(0, jac.shape[1] + 1, input_stride)))
-        # ax.xaxis.set_major_formatter('{x:.0f}')
-        # ticks = np.arange(jac_wp.shape[1] + 1)
-        # ax.set_xticklabels(ticks)
-
-        # ax.yaxis.set_major_locator(FixedLocator(np.arange(0, jac.shape[0] + 1, output_stride)))
-        # ax.yaxis.set_major_formatter('{x:.0f}')
-        # ax.yaxis.set_major_locator(MultipleLocator(output_stride))
-
-        if output_stride > 1:
-            ax.yaxis.set_major_locator(MaxNLocator(integer=True, nbins=1, steps=[output_stride]))
-            max_y = jac_wp.shape[0]
-            ticks = FuncFormatter(
-                lambda y, pos, max_y=max_y, output_stride=output_stride: "{0:g}".format((max_y - y) // output_stride)
-            )
-            ax.yaxis.set_major_formatter(ticks)
-            # divide by output stride to get the correct number of rows
-            ticks = np.arange(jac_wp.shape[0] // output_stride + 1)
-            # flip y labels to match the order of matrix rows starting from the top
-            # ax.set_yticklabels(ticks[::-1])
+
+        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
+        ax.yaxis.set_major_locator(MaxNLocator(integer=True))
+
         if scale_colors_per_submatrix:
             safe_jac = jac[~np.isnan(jac)]
             vmin = safe_jac.min()
@@ -494,7 +591,7 @@ def jacobian_plot(
             vmin=vmin,
             vmax=vmax,
         )
-        if ax_i == len(outputs) - 1 or not has_plot[ax_i + 1 :, ax_j].any():
+        if ax_i == num_rows - 1 or not has_plot[ax_i + 1 :, ax_j].any():
             # last plot of this column
             ax.set_xlabel(input_name)
         if ax_j == 0 or not has_plot[ax_i, :ax_j].any():
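
`jacobian_plot` now resolves its `kernel` argument into a `FunctionMetadata` record, so it can be called with either a kernel or the metadata of a plain function, and passing `outputs` only triggers a deprecation warning. A sketch of the kernel form (requires matplotlib; names as in the earlier illustrative example):

    jacs = wp.autograd.jacobian(square_kernel, dim=3, inputs=[x], outputs=[y])
    wp.autograd.jacobian_plot(jacs, square_kernel, inputs=[x])  # no outputs argument needed
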
@@ -609,9 +706,9 @@ def scalarize_array_2d(arr):
 
 
 def jacobian(
-    kernel: wp.Kernel,
-    dim: Tuple[int],
-    inputs: Sequence,
+    function: Union[wp.Kernel, Callable],
+    dim: Tuple[int] = None,
+    inputs: Sequence = None,
     outputs: Sequence = None,
     input_output_mask: List[Tuple[Union[str, int], Union[str, int]]] = None,
     device: wp.context.Devicelike = None,
@@ -619,40 +716,84 @@ def jacobian(
     block_dim=256,
     max_outputs_per_var=-1,
     plot_jacobians=False,
+    metadata: FunctionMetadata = None,
+    kernel: wp.Kernel = None,
 ) -> Dict[Tuple[int, int], wp.array]:
     """
-    Computes the Jacobians of a Warp kernel launch for the provided selection of differentiable inputs to differentiable outputs.
+    Computes the Jacobians of a function or Warp kernel for the provided selection of differentiable inputs to differentiable outputs.
+
+    The input function can be either a Warp kernel (e.g. a function decorated by ``@wp.kernel``) or a regular Python function that accepts arguments (of which some must be Warp arrays) and returns a Warp array or a list of Warp arrays.
 
-    The kernel adjoint function is launched with the given inputs and outputs, as well as the provided ``dim``,
+    In case ``function`` is a Warp kernel, its adjoint kernel is launched with the given inputs and outputs, as well as the provided ``dim``,
     ``max_blocks``, and ``block_dim`` arguments (see :func:`warp.launch` for more details).
 
     Note:
-        This function only supports Warp kernels whose input arguments precede the output arguments.
+        If ``function`` is a Warp kernel, the input arguments must precede the output arguments in the kernel code definition.
 
         Only Warp arrays with ``requires_grad=True`` are considered for the Jacobian computation.
 
-        Structs arguments are not yet supported by this function to compute Jacobians.
+        Function arguments of type :ref:`Struct <structs>` are not yet supported.
 
     Args:
-        kernel: The Warp kernel function, decorated with the ``@wp.kernel`` decorator
-        dim: The number of threads to launch the kernel, can be an integer, or a Tuple of ints
-        inputs: List of input variables.
-        outputs: List of output variables. If None, the outputs are inferred from the kernel argument flags.
+        function: The Warp kernel function, or a regular Python function that returns a Warp array or a list of Warp arrays.
+        dim: The number of threads to launch the kernel, can be an integer, or a Tuple of ints. Only required if ``function`` is a Warp kernel.
+        inputs: List of input variables. At least one of the arguments must be a Warp array with ``requires_grad=True``.
+        outputs: List of output variables. Optional if the function is a regular Python function that returns a Warp array or a list of Warp arrays. Only required if ``function`` is a Warp kernel.
         input_output_mask: List of tuples specifying the input-output pairs to compute the Jacobian for. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs.
-        device: The device to launch on (optional)
-        max_blocks: The maximum number of CUDA thread blocks to use.
-        block_dim: The number of threads per block.
+        device: The device to launch on (optional). Only used if ``function`` is a Warp kernel.
+        max_blocks: The maximum number of CUDA thread blocks to use. Only used if ``function`` is a Warp kernel.
+        block_dim: The number of threads per block. Only used if ``function`` is a Warp kernel.
         max_outputs_per_var: Maximum number of output dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all output dimensions if value <= 0.
         plot_jacobians: If True, visualizes the computed Jacobians in a plot (requires ``matplotlib``).
+        metadata: The metadata of the kernel function, containing the input and output labels, strides, and dtypes. If None or empty, the metadata is inferred from the kernel or function.
+        kernel: Deprecated argument. Use the ``function`` argument instead.
 
     Returns:
         A dictionary of Jacobians, where the keys are tuples of input and output indices, and the values are the Jacobian matrices.
     """
-    if outputs is None:
-        outputs = []
     if input_output_mask is None:
         input_output_mask = []
-    arg_names = [arg.label for arg in kernel.adj.args]
+    if kernel is not None:
+        wp.utils.warn(
+            "The argument `kernel` to the function `wp.autograd.jacobian` is deprecated in favor of the `function` argument and will be removed in a future Warp version.",
+            DeprecationWarning,
+            stacklevel=3,
+        )
+        function = kernel
+
+    if metadata is None:
+        metadata = FunctionMetadata()
+
+    if isinstance(function, wp.Kernel):
+        if not function.options.get("enable_backward", True):
+            raise ValueError("Kernel must have backward pass enabled to compute Jacobians")
+        if outputs is None or len(outputs) == 0:
+            raise ValueError("A list of output arguments must be provided to compute kernel Jacobians")
+        if device is None:
+            device = infer_device(inputs + outputs)
+        if metadata.is_empty:
+            metadata.update_from_kernel(function, inputs)
+
+        tape = wp.Tape()
+        tape.record_launch(
+            kernel=function,
+            dim=dim,
+            inputs=inputs,
+            outputs=outputs,
+            device=device,
+            max_blocks=max_blocks,
+            block_dim=block_dim,
+        )
+    else:
+        tape = wp.Tape()
+        with tape:
+            outputs = function(*inputs)
+        if isinstance(outputs, wp.array):
+            outputs = [outputs]
+        if metadata.is_empty:
+            metadata.update_from_function(function, inputs, outputs)
+
+    arg_names = metadata.input_labels + metadata.output_labels
 
     def resolve_arg(name, offset: int = 0):
         if isinstance(name, int):
@@ -665,19 +806,8 @@ def jacobian(
     ]
     input_output_mask = set(input_output_mask)
 
-    if device is None:
-        device = infer_device(inputs + outputs)
-
-    tape = wp.Tape()
-    tape.record_launch(
-        kernel=kernel,
-        dim=dim,
-        inputs=inputs,
-        outputs=outputs,
-        device=device,
-        max_blocks=max_blocks,
-        block_dim=block_dim,
-    )
+    zero_grads(inputs)
+    zero_grads(outputs)
 
     jacobians = {}
 
@@ -697,19 +827,21 @@ def jacobian(
         if max_outputs_per_var > 0:
             output_num = min(output_num, max_outputs_per_var)
         for i in range(output_num):
-            tape.zero()
+            output.grad.zero_()
             if i > 0:
                 set_element(out_grad, i - 1, 0.0)
             set_element(out_grad, i, 1.0)
             tape.backward()
             jacobian[i].assign(input.grad)
-            output.grad.zero_()
+
+        zero_grads(inputs)
+        zero_grads(outputs)
         jacobians[input_i, output_i] = jacobian
 
     if plot_jacobians:
         jacobian_plot(
             jacobians,
-            kernel,
+            metadata,
             inputs,
             outputs,
         )
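
As restructured here, `jacobian` records the function (or the kernel launch) on a `wp.Tape`, then seeds one output element at a time with a unit gradient and runs `tape.backward()`, so each backward pass fills one row of the Jacobian. A sketch of the new function form (illustrative names from the earlier example; result keys are `(input_index, output_index)` pairs):

    jacs = wp.autograd.jacobian(run, inputs=[x])
    J = jacs[(0, 0)].numpy()  # dense Jacobian of output 0 with respect to input 0
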
@@ -718,53 +850,97 @@ def jacobian(
 
 
 def jacobian_fd(
-    kernel: wp.Kernel,
-    dim: Tuple[int],
-    inputs: Sequence,
+    function: Union[wp.Kernel, Callable],
+    dim: Tuple[int] = None,
+    inputs: Sequence = None,
     outputs: Sequence = None,
     input_output_mask: List[Tuple[Union[str, int], Union[str, int]]] = None,
     device: wp.context.Devicelike = None,
     max_blocks=0,
     block_dim=256,
     max_inputs_per_var=-1,
-    eps=1e-4,
+    eps: float = 1e-4,
     plot_jacobians=False,
+    metadata: FunctionMetadata = None,
+    kernel: wp.Kernel = None,
 ) -> Dict[Tuple[int, int], wp.array]:
     """
-    Computes the finite-difference Jacobian of a Warp kernel launch for the provided selection of differentiable inputs to differentiable outputs.
+    Computes the finite-difference Jacobian of a function or Warp kernel for the provided selection of differentiable inputs to differentiable outputs.
     The method uses a central difference scheme to approximate the Jacobian.
 
-    The kernel is launched multiple times in forward-only mode with the given inputs and outputs, as well as the
-    provided ``dim``, ``max_blocks``, and ``block_dim`` arguments (see :func:`warp.launch` for more details).
+    The input function can be either a Warp kernel (e.g. a function decorated by ``@wp.kernel``) or a regular Python function that accepts arguments (of which some must be Warp arrays) and returns a Warp array or a list of Warp arrays.
+
+    The function is launched multiple times in forward-only mode with the given inputs. If ``function`` is a Warp kernel, the provided inputs and outputs,
+    as well as the other parameters ``dim``, ``max_blocks``, and ``block_dim`` are provided to the kernel launch (see :func:`warp.launch`).
 
     Note:
-        This function only supports Warp kernels whose input arguments precede the output arguments.
+        If ``function`` is a Warp kernel, the input arguments must precede the output arguments in the kernel code definition.
 
         Only Warp arrays with ``requires_grad=True`` are considered for the Jacobian computation.
 
-        Structs arguments are not yet supported by this function to compute Jacobians.
+        Function arguments of type :ref:`Struct <structs>` are not yet supported.
 
     Args:
-        kernel: The Warp kernel function, decorated with the ``@wp.kernel`` decorator
-        dim: The number of threads to launch the kernel, can be an integer, or a Tuple of ints
-        inputs: List of input variables.
-        outputs: List of output variables. If None, the outputs are inferred from the kernel argument flags.
+        function: The Warp kernel function, or a regular Python function that returns a Warp array or a list of Warp arrays.
+        dim: The number of threads to launch the kernel, can be an integer, or a Tuple of ints. Only required if ``function`` is a Warp kernel.
+        inputs: List of input variables. At least one of the arguments must be a Warp array with ``requires_grad=True``.
+        outputs: List of output variables. Optional if the function is a regular Python function that returns a Warp array or a list of Warp arrays. Only required if ``function`` is a Warp kernel.
         input_output_mask: List of tuples specifying the input-output pairs to compute the Jacobian for. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs.
-        device: The device to launch on (optional)
-        max_blocks: The maximum number of CUDA thread blocks to use.
-        block_dim: The number of threads per block.
+        device: The device to launch on (optional). Only used if ``function`` is a Warp kernel.
+        max_blocks: The maximum number of CUDA thread blocks to use. Only used if ``function`` is a Warp kernel.
+        block_dim: The number of threads per block. Only used if ``function`` is a Warp kernel.
         max_inputs_per_var: Maximum number of input dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all input dimensions if value <= 0.
         eps: The finite-difference step size.
         plot_jacobians: If True, visualizes the computed Jacobians in a plot (requires ``matplotlib``).
+        metadata: The metadata of the kernel function, containing the input and output labels, strides, and dtypes. If None or empty, the metadata is inferred from the kernel or function.
+        kernel: Deprecated argument. Use the ``function`` argument instead.
 
     Returns:
         A dictionary of Jacobians, where the keys are tuples of input and output indices, and the values are the Jacobian matrices.
     """
-    if outputs is None:
-        outputs = []
     if input_output_mask is None:
         input_output_mask = []
-    arg_names = [arg.label for arg in kernel.adj.args]
+    if kernel is not None:
+        wp.utils.warn(
+            "The argument `kernel` to the function `wp.autograd.jacobian` is deprecated in favor of the `function` argument and will be removed in a future Warp version.",
+            DeprecationWarning,
+            stacklevel=3,
+        )
+        function = kernel
+
+    if metadata is None:
+        metadata = FunctionMetadata()
+
+    if isinstance(function, wp.Kernel):
+        if not function.options.get("enable_backward", True):
+            raise ValueError("Kernel must have backward pass enabled to compute Jacobians")
+        if outputs is None or len(outputs) == 0:
+            raise ValueError("A list of output arguments must be provided to compute kernel Jacobians")
+        if device is None:
+            device = infer_device(inputs + outputs)
+        if metadata.is_empty:
+            metadata.update_from_kernel(function, inputs)
+
+        tape = wp.Tape()
+        tape.record_launch(
+            kernel=function,
+            dim=dim,
+            inputs=inputs,
+            outputs=outputs,
+            device=device,
+            max_blocks=max_blocks,
+            block_dim=block_dim,
+        )
+    else:
+        tape = wp.Tape()
+        with tape:
+            outputs = function(*inputs)
+        if isinstance(outputs, wp.array):
+            outputs = [outputs]
+        if metadata.is_empty:
+            metadata.update_from_function(function, inputs, outputs)
+
+    arg_names = metadata.input_labels + metadata.output_labels
 
     def resolve_arg(name, offset: int = 0):
         if isinstance(name, int):
@@ -777,11 +953,15 @@ def jacobian_fd(
     ]
     input_output_mask = set(input_output_mask)
 
-    if device is None:
-        device = infer_device(inputs + outputs)
-
     jacobians = {}
 
+    def conditional_clone(obj):
+        if isinstance(obj, wp.array):
+            return wp.clone(obj)
+        return obj
+
+    outputs_copy = [conditional_clone(output) for output in outputs]
+
     for input_i, output_i in itertools.product(range(len(inputs)), range(len(outputs))):
         if len(input_output_mask) > 0 and (input_i, output_i) not in input_output_mask:
             continue
@@ -796,13 +976,20 @@ def jacobian_fd(
 
         left = wp.clone(output)
         right = wp.clone(output)
+        left_copy = wp.clone(output)
+        right_copy = wp.clone(output)
         flat_left = scalarize_array_1d(left)
         flat_right = scalarize_array_1d(right)
 
-        left_outputs = outputs[:output_i] + [left] + outputs[output_i + 1 :]
-        right_outputs = outputs[:output_i] + [right] + outputs[output_i + 1 :]
+        outputs_until_left = [conditional_clone(output) for output in outputs_copy[:output_i]]
+        outputs_until_right = [conditional_clone(output) for output in outputs_copy[:output_i]]
+        outputs_after_left = [conditional_clone(output) for output in outputs_copy[output_i + 1 :]]
+        outputs_after_right = [conditional_clone(output) for output in outputs_copy[output_i + 1 :]]
+        left_outputs = outputs_until_left + [left] + outputs_after_left
+        right_outputs = outputs_until_right + [right] + outputs_after_right
 
         input_num = flat_input.shape[0]
+        flat_input_copy = wp.clone(flat_input)
         jacobian = wp.empty((flat_left.size, input.size), dtype=input.dtype, device=input.device)
         jacobian.fill_(wp.nan)
 
@@ -812,38 +999,62 @@ def jacobian_fd(
             input_num = min(input_num, max_inputs_per_var)
         for i in range(input_num):
             set_element(flat_input, i, -eps, relative=True)
-            wp.launch(
-                kernel,
-                dim=dim,
-                inputs=inputs,
-                outputs=left_outputs,
-                device=device,
-                max_blocks=max_blocks,
-                block_dim=block_dim,
-            )
+            if isinstance(function, wp.Kernel):
+                wp.launch(
+                    function,
+                    dim=dim,
+                    max_blocks=max_blocks,
+                    block_dim=block_dim,
+                    inputs=inputs,
+                    outputs=left_outputs,
+                    device=device,
+                )
+            else:
+                outputs = function(*inputs)
+                if isinstance(outputs, wp.array):
+                    outputs = [outputs]
+                left.assign(outputs[output_i])
 
             set_element(flat_input, i, 2 * eps, relative=True)
-            wp.launch(
-                kernel,
-                dim=dim,
-                inputs=inputs,
-                outputs=right_outputs,
-                device=device,
-                max_blocks=max_blocks,
-                block_dim=block_dim,
+            if isinstance(function, wp.Kernel):
+                wp.launch(
+                    function,
+                    dim=dim,
+                    max_blocks=max_blocks,
+                    block_dim=block_dim,
+                    inputs=inputs,
+                    outputs=right_outputs,
+                    device=device,
+                )
+            else:
+                outputs = function(*inputs)
+                if isinstance(outputs, wp.array):
+                    outputs = [outputs]
+                right.assign(outputs[output_i])
+
+            # restore input
+            flat_input.assign(flat_input_copy)
+
+            compute_fd(
+                flat_left,
+                flat_right,
+                eps,
+                jacobian_t[i],
             )
 
-            set_element(flat_input, i, -eps, relative=True)
-
-            compute_fd(flat_left, flat_right, eps, jacobian_t[i])
+            if i < input_num - 1:
+                # reset output buffers
+                left.assign(left_copy)
+                right.assign(right_copy)
+                flat_left = scalarize_array_1d(left)
+                flat_right = scalarize_array_1d(right)
 
-        output.grad.zero_()
         jacobians[input_i, output_i] = jacobian
 
     if plot_jacobians:
         jacobian_plot(
             jacobians,
-            kernel,
+            metadata,
            inputs,
             outputs,
         )
@@ -864,7 +1075,7 @@ def set_element(a: wp.array(dtype=Any), i: int, val: Any, relative: bool = False
 
 
 @wp.kernel(enable_backward=False)
-def compute_fd_kernel(left: wp.array(dtype=Any), right: wp.array(dtype=Any), eps: Any, fd: wp.array(dtype=Any)):
+def compute_fd_kernel(left: wp.array(dtype=float), right: wp.array(dtype=float), eps: float, fd: wp.array(dtype=float)):
     tid = wp.tid()
     fd[tid] = (right[tid] - left[tid]) / (2.0 * eps)
 
@@ -883,7 +1094,10 @@ def compute_error_kernel(
     tid = wp.tid()
     ad = jacobian_ad[tid]
     fd = jacobian_fd[tid]
-    relative_error[tid] = (ad - fd) / (ad + 1e-8)
+    denom = ad
+    if abs(ad) < 1e-8:
+        denom = (type(ad))(1e-8)
+    relative_error[tid] = (ad - fd) / denom
    absolute_error[tid] = wp.abs(ad - fd)
 
 
@@ -909,3 +1123,12 @@ def print_table(headers, cells):
         for cell, col_width in zip(cell_row, col_widths):
             print(f"{cell:{col_width}}", end=" | ")
         print()
+
+
+def zero_grads(arrays: list):
+    """
+    Zeros the gradients of all Warp arrays in the given list.
+    """
+    for array in arrays:
+        if isinstance(array, wp.array) and array.requires_grad:
+            array.grad.zero_()
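
`jacobian_fd` complements the autodiff path with the central-difference estimate computed by `compute_fd_kernel`, J[:, i] ≈ (f(x + eps·e_i) − f(x − eps·e_i)) / (2·eps), and the 1.6.0 version now restores the perturbed input and the output buffers between iterations instead of relying on gradient zeroing. A quick cross-check of the two estimates, in the spirit of what `gradcheck` does internally (illustrative, continuing the example from earlier):

    import numpy as np

    jacs_ad = wp.autograd.jacobian(run, inputs=[x])
    jacs_fd = wp.autograd.jacobian_fd(run, inputs=[x], eps=1e-4)
    assert np.allclose(jacs_ad[(0, 0)].numpy(), jacs_fd[(0, 0)].numpy(), atol=1e-3, rtol=1e-2)
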