warp-lang 1.8.1__py3-none-manylinux_2_34_aarch64.whl → 1.9.0__py3-none-manylinux_2_34_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of warp-lang might be problematic.
- warp/__init__.py +282 -103
- warp/__init__.pyi +482 -110
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +93 -30
- warp/build_dll.py +47 -67
- warp/builtins.py +955 -137
- warp/codegen.py +312 -206
- warp/config.py +1 -1
- warp/context.py +1249 -784
- warp/examples/core/example_marching_cubes.py +1 -0
- warp/examples/core/example_render_opengl.py +100 -3
- warp/examples/fem/example_apic_fluid.py +98 -52
- warp/examples/fem/example_convection_diffusion_dg.py +25 -4
- warp/examples/fem/example_diffusion_mgpu.py +8 -3
- warp/examples/fem/utils.py +68 -22
- warp/fabric.py +1 -1
- warp/fem/cache.py +27 -19
- warp/fem/domain.py +2 -2
- warp/fem/field/nodal_field.py +2 -2
- warp/fem/field/virtual.py +264 -166
- warp/fem/geometry/geometry.py +5 -5
- warp/fem/integrate.py +129 -51
- warp/fem/space/restriction.py +4 -0
- warp/fem/space/shape/tet_shape_function.py +3 -10
- warp/jax_experimental/custom_call.py +1 -1
- warp/jax_experimental/ffi.py +2 -1
- warp/marching_cubes.py +708 -0
- warp/native/array.h +99 -4
- warp/native/builtin.h +82 -5
- warp/native/bvh.cpp +64 -28
- warp/native/bvh.cu +58 -58
- warp/native/bvh.h +2 -2
- warp/native/clang/clang.cpp +7 -7
- warp/native/coloring.cpp +8 -2
- warp/native/crt.cpp +2 -2
- warp/native/crt.h +3 -5
- warp/native/cuda_util.cpp +41 -10
- warp/native/cuda_util.h +10 -4
- warp/native/exports.h +1842 -1908
- warp/native/fabric.h +2 -1
- warp/native/hashgrid.cpp +37 -37
- warp/native/hashgrid.cu +2 -2
- warp/native/initializer_array.h +1 -1
- warp/native/intersect.h +2 -2
- warp/native/mat.h +1910 -116
- warp/native/mathdx.cpp +43 -43
- warp/native/mesh.cpp +24 -24
- warp/native/mesh.cu +26 -26
- warp/native/mesh.h +4 -2
- warp/native/nanovdb/GridHandle.h +179 -12
- warp/native/nanovdb/HostBuffer.h +8 -7
- warp/native/nanovdb/NanoVDB.h +517 -895
- warp/native/nanovdb/NodeManager.h +323 -0
- warp/native/nanovdb/PNanoVDB.h +2 -2
- warp/native/quat.h +331 -14
- warp/native/range.h +7 -1
- warp/native/reduce.cpp +10 -10
- warp/native/reduce.cu +13 -14
- warp/native/runlength_encode.cpp +2 -2
- warp/native/runlength_encode.cu +5 -5
- warp/native/scan.cpp +3 -3
- warp/native/scan.cu +4 -4
- warp/native/sort.cpp +10 -10
- warp/native/sort.cu +22 -22
- warp/native/sparse.cpp +8 -8
- warp/native/sparse.cu +13 -13
- warp/native/spatial.h +366 -17
- warp/native/temp_buffer.h +2 -2
- warp/native/tile.h +283 -69
- warp/native/vec.h +381 -14
- warp/native/volume.cpp +54 -54
- warp/native/volume.cu +1 -1
- warp/native/volume.h +2 -1
- warp/native/volume_builder.cu +30 -37
- warp/native/warp.cpp +150 -149
- warp/native/warp.cu +323 -192
- warp/native/warp.h +227 -226
- warp/optim/linear.py +736 -271
- warp/render/imgui_manager.py +289 -0
- warp/render/render_opengl.py +85 -6
- warp/sim/graph_coloring.py +2 -2
- warp/sparse.py +558 -175
- warp/tests/aux_test_module_aot.py +7 -0
- warp/tests/cuda/test_async.py +3 -3
- warp/tests/cuda/test_conditional_captures.py +101 -0
- warp/tests/geometry/test_marching_cubes.py +233 -12
- warp/tests/sim/test_coloring.py +6 -6
- warp/tests/test_array.py +56 -5
- warp/tests/test_codegen.py +3 -2
- warp/tests/test_context.py +8 -15
- warp/tests/test_enum.py +136 -0
- warp/tests/test_examples.py +2 -2
- warp/tests/test_fem.py +45 -2
- warp/tests/test_fixedarray.py +229 -0
- warp/tests/test_func.py +18 -15
- warp/tests/test_future_annotations.py +7 -5
- warp/tests/test_linear_solvers.py +30 -0
- warp/tests/test_map.py +1 -1
- warp/tests/test_mat.py +1518 -378
- warp/tests/test_mat_assign_copy.py +178 -0
- warp/tests/test_mat_constructors.py +574 -0
- warp/tests/test_module_aot.py +287 -0
- warp/tests/test_print.py +69 -0
- warp/tests/test_quat.py +140 -34
- warp/tests/test_quat_assign_copy.py +145 -0
- warp/tests/test_reload.py +2 -1
- warp/tests/test_sparse.py +71 -0
- warp/tests/test_spatial.py +140 -34
- warp/tests/test_spatial_assign_copy.py +160 -0
- warp/tests/test_struct.py +43 -3
- warp/tests/test_types.py +0 -20
- warp/tests/test_vec.py +179 -34
- warp/tests/test_vec_assign_copy.py +143 -0
- warp/tests/tile/test_tile.py +184 -18
- warp/tests/tile/test_tile_cholesky.py +605 -0
- warp/tests/tile/test_tile_load.py +169 -0
- warp/tests/tile/test_tile_mathdx.py +2 -558
- warp/tests/tile/test_tile_matmul.py +1 -1
- warp/tests/tile/test_tile_mlp.py +1 -1
- warp/tests/tile/test_tile_shared_memory.py +5 -5
- warp/tests/unittest_suites.py +6 -0
- warp/tests/walkthrough_debug.py +1 -1
- warp/thirdparty/unittest_parallel.py +108 -9
- warp/types.py +554 -264
- warp/utils.py +68 -86
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/RECORD +131 -121
- warp/native/marching.cpp +0 -19
- warp/native/marching.cu +0 -514
- warp/native/marching.h +0 -19
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
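
Among the additions listed above, the new warp/tests/test_enum.py and the enum handling in the warp/codegen.py diff below indicate that Python IntEnum/IntFlag members can now be referenced inside kernels, where codegen lowers them to integer constants. A minimal sketch of how that might look in user code (hypothetical example, not taken from the package):

    import enum
    import warp as wp

    class Phase(enum.IntEnum):
        SOLID = 0
        FLUID = 1

    @wp.kernel
    def classify(flags: wp.array(dtype=wp.int32), out: wp.array(dtype=wp.int32)):
        tid = wp.tid()
        # Enum members used in kernel code are converted to int constants by codegen.
        if flags[tid] == Phase.FLUID:
            out[tid] = 1
        else:
            out[tid] = 0
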
warp/codegen.py
CHANGED
@@ -18,9 +18,11 @@ from __future__ import annotations
 import ast
 import builtins
 import ctypes
+import enum
 import functools
 import hashlib
 import inspect
+import itertools
 import math
 import re
 import sys
@@ -614,6 +616,8 @@ def compute_type_str(base_name, template_params):
         return base_name

     def param2str(p):
+        if isinstance(p, builtins.bool):
+            return "true" if p else "false"
         if isinstance(p, int):
             return str(p)
         elif hasattr(p, "_wp_generic_type_str_"):
@@ -625,6 +629,8 @@ def compute_type_str(base_name, template_params):
             return f"wp::{p.__name__}"
         elif is_tile(p):
             return p.ctype()
+        elif isinstance(p, Struct):
+            return p.native_name

         return p.__name__

@@ -684,7 +690,12 @@ class Var:

     @staticmethod
     def type_to_ctype(t: type, value_type: builtins.bool = False) -> str:
-        if
+        if isinstance(t, fixedarray):
+            template_args = (str(t.size), Var.dtype_to_ctype(t.dtype))
+            dtypestr = ", ".join(template_args)
+            classstr = f"wp::{type(t).__name__}"
+            return f"{classstr}_t<{dtypestr}>"
+        elif is_array(t):
             dtypestr = Var.dtype_to_ctype(t.dtype)
             classstr = f"wp::{type(t).__name__}"
             return f"{classstr}_t<{dtypestr}>"
@@ -780,11 +791,10 @@ def apply_defaults(
     arguments = bound_args.arguments
     new_arguments = []
     for name in bound_args._signature.parameters.keys():
-
+        if name in arguments:
             new_arguments.append((name, arguments[name]))
-
-
-            new_arguments.append((name, values[name]))
+        elif name in values:
+            new_arguments.append((name, values[name]))

     bound_args.arguments = dict(new_arguments)

@@ -837,6 +847,9 @@ def get_arg_type(arg: Var | Any) -> type:
     if isinstance(arg, Sequence):
         return tuple(get_arg_type(x) for x in arg)

+    if is_array(arg):
+        return arg
+
     if get_origin(arg) is tuple:
         return tuple(get_arg_type(x) for x in get_args(arg))

@@ -896,6 +909,8 @@ class Adjoint:
         adj.skip_forward_codegen = skip_forward_codegen
         # whether the generation of the adjoint code is skipped for this function
         adj.skip_reverse_codegen = skip_reverse_codegen
+        # Whether this function is used by a kernel that has has the backward pass enabled.
+        adj.used_by_backward_kernel = False

         # extract name of source file
         adj.filename = inspect.getsourcefile(func) or "unknown source file"
@@ -962,7 +977,7 @@ class Adjoint:
                 continue

             # add variable for argument
-            arg = Var(name, type, False)
+            arg = Var(name, type, requires_grad=False)
             adj.args.append(arg)

         # pre-populate symbol dictionary with function argument names
@@ -1071,17 +1086,21 @@ class Adjoint:
         # recursively evaluate function body
         try:
             adj.eval(adj.tree.body[0])
-        except Exception:
+        except Exception as original_exc:
             try:
                 lineno = adj.lineno + adj.fun_lineno
                 line = adj.source_lines[adj.lineno]
                 msg = f'Error while parsing function "{adj.fun_name}" at {adj.filename}:{lineno}:\n{line}\n'
-
-
+
+                # Combine the new message with the original exception's arguments
+                new_args = (";".join([msg] + [str(a) for a in original_exc.args]),)
+
+                # Enhance the original exception with parser context before re-raising.
+                # 'from None' is used to suppress Python's chained exceptions for a cleaner error output.
+                raise type(original_exc)(*new_args).with_traceback(original_exc.__traceback__) from None
             finally:
                 adj.skip_build = True
                 adj.builder = None
-            raise e

         if builder is not None:
             for a in adj.args:
@@ -1227,9 +1246,9 @@ class Adjoint:

         # lineinfo is enabled by default in debug mode regardless of the builder option, don't want to unnecessarily
         # emit line directives in generated code if it's not being compiled with line information
-
-
-        )
+        build_mode = val if (val := adj.builder_options.get("mode")) is not None else warp.config.mode
+
+        lineinfo_enabled = adj.builder_options.get("lineinfo", False) or build_mode == "debug"

         if relative_lineno is not None and lineinfo_enabled and warp.config.line_directives:
             is_comment = statement.strip().startswith("//")
@@ -1348,7 +1367,7 @@ class Adjoint:
             # unresolved function, report error
             arg_type_reprs = []

-            for x in arg_types:
+            for x in itertools.chain(arg_types, kwarg_types.values()):
                 if isinstance(x, warp.context.Function):
                     arg_type_reprs.append("function")
                 else:
@@ -1378,7 +1397,7 @@ class Adjoint:
         # in order to process them as Python does it.
         bound_args: inspect.BoundArguments = func.signature.bind(*args, **kwargs)

-        # Type args are the
+        # Type args are the "compile time" argument values we get from codegen.
         # For example, when calling `wp.vec3f(...)` from within a kernel,
         # this translates in fact to calling the `vector()` built-in augmented
         # with the type args `length=3, dtype=float`.
@@ -1416,20 +1435,30 @@ class Adjoint:
         bound_args = bound_args.arguments

         # if it is a user-function then build it recursively
-        if not func.is_builtin()
-
-            #
-
-
-
-            if
-
-
-
+        if not func.is_builtin():
+            # If the function called is a user function,
+            # we need to ensure its adjoint is also being generated.
+            if adj.used_by_backward_kernel:
+                func.adj.used_by_backward_kernel = True
+
+            if adj.builder is None:
+                func.build(None)
+
+            elif func not in adj.builder.functions:
+                adj.builder.build_function(func)
+                # add custom grad, replay functions to the list of functions
+                # to be built later (invalid code could be generated if we built them now)
+                # so that they are not missed when only the forward function is imported
+                # from another module
+                if func.custom_grad_func:
+                    adj.builder.deferred_functions.append(func.custom_grad_func)
+                if func.custom_replay_func:
+                    adj.builder.deferred_functions.append(func.custom_replay_func)

         # Resolve the return value based on the types and values of the given arguments.
         bound_arg_types = {k: get_arg_type(v) for k, v in bound_args.items()}
         bound_arg_values = {k: get_arg_value(v) for k, v in bound_args.items()}
+
         return_type = func.value_func(
             {k: strip_reference(v) for k, v in bound_arg_types.items()},
             bound_arg_values,
@@ -1493,6 +1522,9 @@ class Adjoint:

             # if the argument is a function (and not a builtin), then build it recursively
             if isinstance(func_arg_var, warp.context.Function) and not func_arg_var.is_builtin():
+                if adj.used_by_backward_kernel:
+                    func_arg_var.adj.used_by_backward_kernel = True
+
                 adj.builder.build_function(func_arg_var)

             fwd_args.append(strip_reference(func_arg_var))
@@ -1886,6 +1918,9 @@ class Adjoint:
             return obj
         if isinstance(obj, type):
             return obj
+        if isinstance(obj, Struct):
+            adj.builder.build_struct_recursive(obj)
+            return obj
         if isinstance(obj, types.ModuleType):
             return obj

@@ -1938,11 +1973,17 @@ class Adjoint:
         aggregate = adj.eval(node.value)

         try:
+            if isinstance(aggregate, Var) and aggregate.constant is not None:
+                # this case may occur when the attribute is a constant, e.g.: `IntEnum.A.value`
+                return aggregate
+
             if isinstance(aggregate, types.ModuleType) or isinstance(aggregate, type):
                 out = getattr(aggregate, node.attr)

                 if warp.types.is_value(out):
                     return adj.add_constant(out)
+                if isinstance(out, (enum.IntEnum, enum.IntFlag)):
+                    return adj.add_constant(int(out))

                 return out

@@ -1970,18 +2011,29 @@ class Adjoint:
                 return adj.add_builtin_call("transform_get_rotation", [aggregate])

             else:
-
+                attr_var = aggregate_type.vars[node.attr]
+
+                # represent pointer types as uint64
+                if isinstance(attr_var.type, pointer_t):
+                    cast = f"({Var.dtype_to_ctype(uint64)}*)"
+                    adj_cast = f"({Var.dtype_to_ctype(attr_var.type.dtype)}*)"
+                    attr_type = Reference(uint64)
+                else:
+                    cast = ""
+                    adj_cast = ""
+                    attr_type = Reference(attr_var.type)
+
                 attr = adj.add_var(attr_type)

                 if is_reference(aggregate.type):
-                    adj.add_forward(f"{attr.emit()} = &({aggregate.emit()}->{
+                    adj.add_forward(f"{attr.emit()} = {cast}&({aggregate.emit()}->{attr_var.label});")
                 else:
-                    adj.add_forward(f"{attr.emit()} = &({aggregate.emit()}.{
+                    adj.add_forward(f"{attr.emit()} = {cast}&({aggregate.emit()}.{attr_var.label});")

                 if adj.is_differentiable_value_type(strip_reference(attr_type)):
-                    adj.add_reverse(f"{aggregate.emit_adj()}.{
+                    adj.add_reverse(f"{aggregate.emit_adj()}.{attr_var.label} += {adj_cast}{attr.emit_adj()};")
                 else:
-                    adj.add_reverse(f"{aggregate.emit_adj()}.{
+                    adj.add_reverse(f"{aggregate.emit_adj()}.{attr_var.label} = {adj_cast}{attr.emit_adj()};")

                 return attr

@@ -2309,9 +2361,12 @@ class Adjoint:

             return var

-        if isinstance(expr, (type, Var, warp.context.Function)):
+        if isinstance(expr, (type, Struct, Var, warp.context.Function)):
             return expr

+        if isinstance(expr, (enum.IntEnum, enum.IntFlag)):
+            return adj.add_constant(int(expr))
+
         return adj.add_constant(expr)

     def emit_Call(adj, node):
@@ -2360,7 +2415,8 @@ class Adjoint:

         # struct constructor
         if func is None and isinstance(caller, Struct):
-            adj.builder
+            if adj.builder is not None:
+                adj.builder.build_struct_recursive(caller)
             if node.args or node.keywords:
                 func = caller.value_constructor
             else:
@@ -2420,68 +2476,45 @@ class Adjoint:

             return adj.eval(node.value)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            if isinstance(root.value, ast.Name):
-                symbol = adj.emit_Name(root.value)
-                symbol_type = strip_reference(symbol.type)
-                if is_array(symbol_type):
-                    array = symbol
-                    break
-
-            root = root.value
-
-        # If not all indices index into the array, just evaluate the right-most indexing operation.
-        if not array or (count > array.type.ndim):
-            count = 1
-
-        indices = []
-        root = node
-        while len(indices) < count:
-            if isinstance(root.slice, ast.Tuple):
-                ij = [adj.eval(arg) for arg in root.slice.elts]
-            elif isinstance(root.slice, ast.Index) and isinstance(root.slice.value, ast.Tuple):
-                ij = [adj.eval(arg) for arg in root.slice.value.elts]
-            else:
-                ij = [adj.eval(root.slice)]
-
-            indices = ij + indices  # prepend
-
-            root = root.value
-
-        target = adj.eval(root)
+    def eval_indices(adj, target_type, indices):
+        nodes = indices
+        if hasattr(target_type, "_wp_generic_type_hint_"):
+            indices = []
+            for dim, node in enumerate(nodes):
+                if isinstance(node, ast.Slice):
+                    # In the context of slicing a vec/mat type, indices are expected
+                    # to be compile-time constants, hence we can infer the actual slice
+                    # bounds also at compile-time.
+                    length = target_type._shape_[dim]
+                    step = 1 if node.step is None else adj.eval(node.step).constant
+
+                    if node.lower is None:
+                        start = length - 1 if step < 0 else 0
+                    else:
+                        start = adj.eval(node.lower).constant
+                        start = min(max(start, -length), length)
+                        start = start + length if start < 0 else start

-
+                    if node.upper is None:
+                        stop = -1 if step < 0 else length
+                    else:
+                        stop = adj.eval(node.upper).constant
+                        stop = min(max(stop, -length), length)
+                        stop = stop + length if stop < 0 else stop

-
-
-
-
-            var = adj.eval(node.slice)
-            var_name = var.label
-            var = Var(f"adj_{var_name}", type=var.type, constant=None, prefix=False)
-            return var
+                    slice = adj.add_builtin_call("slice", (start, stop, step))
+                    indices.append(slice)
+                else:
+                    indices.append(adj.eval(node))

-
+            return tuple(indices)
+        else:
+            return tuple(adj.eval(x) for x in nodes)

+    def emit_indexing(adj, target, indices):
         target_type = strip_reference(target.type)
+        indices = adj.eval_indices(target_type, indices)
+
         if is_array(target_type):
             if len(indices) == target_type.ndim:
                 # handles array loads (where each dimension has an index specified)
@@ -2520,47 +2553,116 @@ class Adjoint:

         return out

+    # from a list of lists of indices, strip the first `count` indices
+    @staticmethod
+    def strip_indices(indices, count):
+        dim = count
+        while count > 0:
+            ij = indices[0]
+            indices = indices[1:]
+            count -= len(ij)
+
+        # report straddling like in `arr2d[0][1,2]` as a syntax error
+        if count < 0:
+            raise WarpCodegenError(
+                f"Incorrect number of indices specified for array indexing, got {dim - count} indices for a {dim} dimensional array."
+            )
+
+        return indices
+
+    def recurse_subscript(adj, node, indices):
+        if isinstance(node, ast.Name):
+            target = adj.eval(node)
+            return target, indices
+
+        if isinstance(node, ast.Subscript):
+            if hasattr(node.value, "attr") and node.value.attr == "adjoint":
+                return adj.eval(node), indices
+
+            if isinstance(node.slice, ast.Tuple):
+                ij = node.slice.elts
+            elif isinstance(node.slice, ast.Index) and isinstance(node.slice.value, ast.Tuple):
+                # The node `ast.Index` is deprecated in Python 3.9.
+                ij = node.slice.value.elts
+            elif isinstance(node.slice, ast.ExtSlice):
+                # The node `ast.ExtSlice` is deprecated in Python 3.9.
+                ij = node.slice.dims
+            else:
+                ij = [node.slice]
+
+            indices = [ij, *indices]  # prepend
+
+            target, indices = adj.recurse_subscript(node.value, indices)
+
+            target_type = strip_reference(target.type)
+            if is_array(target_type):
+                flat_indices = [i for ij in indices for i in ij]
+                if len(flat_indices) > target_type.ndim:
+                    target = adj.emit_indexing(target, flat_indices[: target_type.ndim])
+                    indices = adj.strip_indices(indices, target_type.ndim)
+
+            return target, indices
+
+        target = adj.eval(node)
+        return target, indices
+
+    # returns the object being indexed, and the list of indices
+    def eval_subscript(adj, node):
+        target, indices = adj.recurse_subscript(node, [])
+        flat_indices = [i for ij in indices for i in ij]
+        return target, flat_indices
+
+    def emit_Subscript(adj, node):
+        if hasattr(node.value, "attr") and node.value.attr == "adjoint":
+            # handle adjoint of a variable, i.e. wp.adjoint[var]
+            node.slice.is_adjoint = True
+            var = adj.eval(node.slice)
+            var_name = var.label
+            var = Var(f"adj_{var_name}", type=var.type, constant=None, prefix=False)
+            return var
+
+        target, indices = adj.eval_subscript(node)
+
+        return adj.emit_indexing(target, indices)
+
     def emit_Assign(adj, node):
         if len(node.targets) != 1:
             raise WarpCodegenError("Assigning the same value to multiple variables is not supported")

-
+        # Check if the rhs corresponds to an unsupported construct.
+        # Tuples are supported in the context of assigning multiple variables
+        # at once, but not for simple assignments like `x = (1, 2, 3)`.
+        # Therefore, we need to catch this specific case here instead of
+        # more generally in `adj.eval()`.
+        if isinstance(node.value, ast.List):
+            raise WarpCodegenError(
+                "List constructs are not supported in kernels. Use vectors like `wp.vec3()` for small collections instead."
+            )

-
-        # Check if the rhs corresponds to an unsupported construct.
-        # Tuples are supported in the context of assigning multiple variables
-        # at once, but not for simple assignments like `x = (1, 2, 3)`.
-        # Therefore, we need to catch this specific case here instead of
-        # more generally in `adj.eval()`.
-        if isinstance(node.value, ast.List):
-            raise WarpCodegenError(
-                "List constructs are not supported in kernels. Use vectors like `wp.vec3()` for small collections instead."
-            )
+        lhs = node.targets[0]

-
-        if isinstance(lhs, ast.Tuple):
+        if isinstance(lhs, ast.Tuple) and isinstance(node.value, ast.Call):
             # record the expected number of outputs on the node
             # we do this so we can decide which function to
             # call based on the number of expected outputs
-
-            node.value.expects = len(lhs.elts)
+            node.value.expects = len(lhs.elts)

-
-
-
-
-
+        # evaluate rhs
+        if isinstance(lhs, ast.Tuple) and isinstance(node.value, ast.Tuple):
+            rhs = [adj.eval(v) for v in node.value.elts]
+        else:
+            rhs = adj.eval(node.value)
+
+        # handle the case where we are assigning multiple output variables
+        if isinstance(lhs, ast.Tuple):
+            subtype = getattr(rhs, "type", None)

-            subtype = getattr(out, "type", None)
             if isinstance(subtype, warp.types.tuple_t):
-                if len(
+                if len(rhs.type.types) != len(lhs.elts):
                     raise WarpCodegenError(
-                        f"Invalid number of values to unpack (expected {len(lhs.elts)}, got {len(
+                        f"Invalid number of values to unpack (expected {len(lhs.elts)}, got {len(rhs.type.types)})."
                     )
-
-                out = tuple(
-                    adj.add_builtin_call("extract", (target, adj.add_constant(i))) for i in range(len(lhs.elts))
-                )
+                rhs = tuple(adj.add_builtin_call("extract", (rhs, adj.add_constant(i))) for i in range(len(lhs.elts)))

             names = []
             for v in lhs.elts:
@@ -2571,11 +2673,12 @@ class Adjoint:
                     "Multiple return functions can only assign to simple variables, e.g.: x, y = func()"
                 )

-            if len(names) != len(
+            if len(names) != len(rhs):
                 raise WarpCodegenError(
-                    f"Multiple return functions need to receive all their output values, incorrect number of values to unpack (expected {len(
+                    f"Multiple return functions need to receive all their output values, incorrect number of values to unpack (expected {len(rhs)}, got {len(names)})"
                 )

+            out = rhs
             for name, rhs in zip(names, out):
                 if name in adj.symbols:
                     if not types_equal(rhs.type, adj.symbols[name].type):
@@ -2587,8 +2690,6 @@ class Adjoint:

         # handles the case where we are assigning to an array index (e.g.: arr[i] = 2.0)
         elif isinstance(lhs, ast.Subscript):
-            rhs = adj.eval(node.value)
-
             if hasattr(lhs.value, "attr") and lhs.value.attr == "adjoint":
                 # handle adjoint of a variable, i.e. wp.adjoint[var]
                 lhs.slice.is_adjoint = True
@@ -2600,6 +2701,7 @@ class Adjoint:
             target, indices = adj.eval_subscript(lhs)

             target_type = strip_reference(target.type)
+            indices = adj.eval_indices(target_type, indices)

             if is_array(target_type):
                 adj.add_builtin_call("array_store", [target, *indices, rhs])
@@ -2621,14 +2723,11 @@ class Adjoint:
                 or type_is_transformation(target_type)
             ):
                 # recursively unwind AST, stopping at penultimate node
-
-                while hasattr(
-
-                    node = node.value
-                    else:
-                        break
+                root = lhs
+                while hasattr(root.value, "value"):
+                    root = root.value
                 # lhs is updating a variable adjoint (i.e. wp.adjoint[var])
-                if hasattr(
+                if hasattr(root, "attr") and root.attr == "adjoint":
                     attr = adj.add_builtin_call("index", [target, *indices])
                     adj.add_builtin_call("store", [attr, rhs])
                     return
@@ -2666,9 +2765,6 @@ class Adjoint:
             # symbol name
             name = lhs.id

-            # evaluate rhs
-            rhs = adj.eval(node.value)
-
             # check type matches if symbol already defined
             if name in adj.symbols:
                 if not types_equal(strip_reference(rhs.type), adj.symbols[name].type):
@@ -2689,7 +2785,6 @@ class Adjoint:
             adj.symbols[name] = out

         elif isinstance(lhs, ast.Attribute):
-            rhs = adj.eval(node.value)
             aggregate = adj.eval(lhs.value)
             aggregate_type = strip_reference(aggregate.type)

@@ -2777,9 +2872,9 @@ class Adjoint:
             new_node = ast.Assign(targets=[lhs], value=ast.BinOp(lhs, node.op, node.value))
             adj.eval(new_node)

-
-            rhs = adj.eval(node.value)
+        rhs = adj.eval(node.value)

+        if isinstance(lhs, ast.Subscript):
             # wp.adjoint[var] appears in custom grad functions, and does not require
             # special consideration in the AugAssign case
             if hasattr(lhs.value, "attr") and lhs.value.attr == "adjoint":
@@ -2789,6 +2884,7 @@ class Adjoint:
             target, indices = adj.eval_subscript(lhs)

             target_type = strip_reference(target.type)
+            indices = adj.eval_indices(target_type, indices)

             if is_array(target_type):
                 # target_types int8, uint8, int16, uint16 are not suitable for atomic array accumulation
@@ -2861,7 +2957,6 @@ class Adjoint:

         elif isinstance(lhs, ast.Name):
             target = adj.eval(node.target)
-            rhs = adj.eval(node.value)

             if is_tile(target.type) and is_tile(rhs.type):
                 if isinstance(node.op, ast.Add):
@@ -3163,6 +3258,8 @@ class Adjoint:

         try:
             value = eval(code_to_eval, vars_dict)
+            if isinstance(value, (enum.IntEnum, enum.IntFlag)):
+                value = int(value)
             if warp.config.verbose:
                 print(f"Evaluated static command: {static_code} = {value}")
         except NameError as e:
@@ -3373,6 +3470,11 @@ cuda_module_header = """
#define WP_NO_CRT
#include "builtin.h"

+// Map wp.breakpoint() to a device brkpt at the call site so cuda-gdb attributes the stop to the generated .cu line
+#if defined(__CUDACC__) && !defined(_MSC_VER)
+#define __debugbreak() __brkpt()
+#endif
+
// avoid namespacing of float type for casting to float type, this is to avoid wp::float(x), which is not valid in C++
#define float(x) cast_float(x)
#define adj_float(x, adj_x, adj_ret) adj_cast_float(x, adj_x, adj_ret)
@@ -3410,6 +3512,12 @@ static CUDA_CALLABLE void adj_{name}({reverse_args})
{{
{reverse_body}}}

+// Required when compiling adjoints.
+CUDA_CALLABLE {name} add(const {name}& a, const {name}& b)
+{{
+    return {name}();
+}}
+
CUDA_CALLABLE void adj_atomic_add({name}* p, {name} t)
{{
{atomic_add_body}}}
@@ -3490,7 +3598,8 @@ cuda_kernel_template_backward = """
cpu_kernel_template_forward = """

void {name}_cpu_kernel_forward(
-    {forward_args}
+    {forward_args},
+    wp_args_{name} *_wp_args)
{{
{forward_body}}}

@@ -3499,7 +3608,9 @@ void {name}_cpu_kernel_forward(
cpu_kernel_template_backward = """

void {name}_cpu_kernel_backward(
-    {reverse_args}
+    {reverse_args},
+    wp_args_{name} *_wp_args,
+    wp_args_{name} *_wp_adj_args)
{{
{reverse_body}}}

@@ -3511,15 +3622,15 @@ extern "C" {{

// Python CPU entry points
WP_API void {name}_cpu_forward(
-
+    wp::launch_bounds_t dim,
+    wp_args_{name} *_wp_args)
{{
    for (size_t task_index = 0; task_index < dim.size; ++task_index)
    {{
        // init shared memory allocator
        wp::tile_alloc_shared(0, true);

-        {name}_cpu_kernel_forward(
-            {forward_params});
+        {name}_cpu_kernel_forward(dim, task_index, _wp_args);

        // check shared memory allocator
        wp::tile_alloc_shared(0, false, true);
@@ -3536,15 +3647,16 @@ cpu_module_template_backward = """
extern "C" {{

WP_API void {name}_cpu_backward(
-
+    wp::launch_bounds_t dim,
+    wp_args_{name} *_wp_args,
+    wp_args_{name} *_wp_adj_args)
{{
    for (size_t task_index = 0; task_index < dim.size; ++task_index)
    {{
        // initialize shared memory allocator
        wp::tile_alloc_shared(0, true);

-        {name}_cpu_kernel_backward(
-            {reverse_params});
+        {name}_cpu_kernel_backward(dim, task_index, _wp_args, _wp_adj_args);

        // check shared memory allocator
        wp::tile_alloc_shared(0, false, true);
@@ -3575,7 +3687,7 @@ def constant_str(value):
        # special case for float16, which is stored as uint16 in the ctypes.Array
        from warp.context import runtime

-        scalar_value = runtime.core.
+        scalar_value = runtime.core.wp_half_bits_to_float
    else:

        def scalar_value(x):
@@ -3713,8 +3825,17 @@ def codegen_func_forward(adj, func_type="kernel", device="cpu"):

    indent_block = " " * indent

-    # primal vars
    lines = []
+
+    # argument vars
+    if device == "cpu" and func_type == "kernel":
+        lines += ["//---------\n"]
+        lines += ["// argument vars\n"]
+
+        for var in adj.args:
+            lines += [f"{var.ctype()} {var.emit()} = _wp_args->{var.label};\n"]
+
+    # primal vars
    lines += ["//---------\n"]
    lines += ["// primal vars\n"]

@@ -3758,6 +3879,17 @@ def codegen_func_reverse(adj, func_type="kernel", device="cpu"):

    lines = []

+    # argument vars
+    if device == "cpu" and func_type == "kernel":
+        lines += ["//---------\n"]
+        lines += ["// argument vars\n"]
+
+        for var in adj.args:
+            lines += [f"{var.ctype()} {var.emit()} = _wp_args->{var.label};\n"]
+
+        for var in adj.args:
+            lines += [f"{var.ctype()} {var.emit_adj()} = _wp_adj_args->{var.label};\n"]
+
    # primal vars
    lines += ["//---------\n"]
    lines += ["// primal vars\n"]
@@ -3849,6 +3981,19 @@ def codegen_func(adj, c_func_name: str, device="cpu", options=None):
                f"annotated as `{warp.context.type_str(adj.arg_types['return'])}` "
                f"but the code returns a value of type `{warp.context.type_str(adj.return_var[0].type)}`."
            )
+        elif (
+            isinstance(adj.return_var[0].type, warp.types.fixedarray)
+            and type(adj.arg_types["return"]) is warp.types.array
+        ):
+            # If the return statement yields a `fixedarray` while the function is annotated
+            # to return a standard `array`, then raise an error since the `fixedarray` storage
+            # allocated on the stack will be freed once the function exits, meaning that the
+            # resulting `array` instance will point to an invalid data.
+            raise WarpCodegenError(
+                f"The function `{adj.fun_name}` returns a fixed-size array "
+                f"whereas it has its return type annotated as "
+                f"`{warp.context.type_str(adj.arg_types['return'])}`."
+            )

    # Build line directive for function definition (subtract 1 to account for 1-indexing of AST line numbers)
    # This is used as a catch-all C-to-Python source line mapping for any code that does not have
@@ -3927,10 +4072,10 @@ def codegen_func(adj, c_func_name: str, device="cpu", options=None):
        if adj.custom_reverse_mode:
            reverse_body = "\t// user-defined adjoint code\n" + forward_body
        else:
-            if options.get("enable_backward", True):
+            if options.get("enable_backward", True) and adj.used_by_backward_kernel:
                reverse_body = codegen_func_reverse(adj, func_type="function", device=device)
            else:
-                reverse_body = '\t// reverse mode disabled (module option "enable_backward" is False)\n'
+                reverse_body = '\t// reverse mode disabled (module option "enable_backward" is False or no dependent kernel found with "enable_backward")\n'
        s += reverse_template.format(
            name=c_func_name,
            return_type=return_type,
@@ -4022,6 +4167,13 @@ def codegen_kernel(kernel, device, options):

    adj = kernel.adj

+    args_struct = ""
+    if device == "cpu":
+        args_struct = f"struct wp_args_{kernel.get_mangled_name()} {{\n"
+        for i in adj.args:
+            args_struct += f" {i.ctype()} {i.label};\n"
+        args_struct += "};\n"
+
    # Build line directive for function definition (subtract 1 to account for 1-indexing of AST line numbers)
    # This is used as a catch-all C-to-Python source line mapping for any code that does not have
    # a direct mapping to a Python source line.
@@ -4047,9 +4199,9 @@ def codegen_kernel(kernel, device, options):
    forward_args = ["wp::launch_bounds_t dim"]
    if device == "cpu":
        forward_args.append("size_t task_index")
-
-
-
+    else:
+        for arg in adj.args:
+            forward_args.append(arg.ctype() + " var_" + arg.label)

    forward_body = codegen_func_forward(adj, func_type="kernel", device=device)
    template_fmt_args.update(
@@ -4066,17 +4218,16 @@ def codegen_kernel(kernel, device, options):
    reverse_args = ["wp::launch_bounds_t dim"]
    if device == "cpu":
        reverse_args.append("size_t task_index")
-
-
-
-
-
-
-
-
-
-
-        reverse_args.append(arg.ctype() + " adj_" + arg.label)
+    else:
+        for arg in adj.args:
+            reverse_args.append(arg.ctype() + " var_" + arg.label)
+        for arg in adj.args:
+            # indexed array gradients are regular arrays
+            if isinstance(arg.type, indexedarray):
+                _arg = Var(arg.label, array(dtype=arg.type.dtype, ndim=arg.type.ndim))
+                reverse_args.append(_arg.ctype() + " adj_" + arg.label)
+            else:
+                reverse_args.append(arg.ctype() + " adj_" + arg.label)

    reverse_body = codegen_func_reverse(adj, func_type="kernel", device=device)
    template_fmt_args.update(
@@ -4088,7 +4239,7 @@ def codegen_kernel(kernel, device, options):
    template += template_backward

    s = template.format(**template_fmt_args)
-    return s
+    return args_struct + s


def codegen_module(kernel, device, options):
@@ -4099,59 +4250,14 @@ def codegen_module(kernel, device, options):
    options = dict(options)
    options.update(kernel.options)

-    adj = kernel.adj
-
    template = ""
    template_fmt_args = {
        "name": kernel.get_mangled_name(),
    }

-    # build forward signature
-    forward_args = ["wp::launch_bounds_t dim"]
-    forward_params = ["dim", "task_index"]
-
-    for arg in adj.args:
-        if hasattr(arg.type, "_wp_generic_type_str_"):
-            # vectors and matrices are passed from Python by pointer
-            forward_args.append(f"const {arg.ctype()}* var_" + arg.label)
-            forward_params.append(f"*var_{arg.label}")
-        else:
-            forward_args.append(f"{arg.ctype()} var_{arg.label}")
-            forward_params.append("var_" + arg.label)
-
-    template_fmt_args.update(
-        {
-            "forward_args": indent(forward_args),
-            "forward_params": indent(forward_params, 3),
-        }
-    )
    template += cpu_module_template_forward

    if options["enable_backward"]:
-        # build reverse signature
-        reverse_args = [*forward_args]
-        reverse_params = [*forward_params]
-
-        for arg in adj.args:
-            if isinstance(arg.type, indexedarray):
-                # indexed array gradients are regular arrays
-                _arg = Var(arg.label, array(dtype=arg.type.dtype, ndim=arg.type.ndim))
-                reverse_args.append(f"const {_arg.ctype()} adj_{arg.label}")
-                reverse_params.append(f"adj_{_arg.label}")
-            elif hasattr(arg.type, "_wp_generic_type_str_"):
-                # vectors and matrices are passed from Python by pointer
-                reverse_args.append(f"const {arg.ctype()}* adj_{arg.label}")
-                reverse_params.append(f"*adj_{arg.label}")
-            else:
-                reverse_args.append(f"{arg.ctype()} adj_{arg.label}")
-                reverse_params.append(f"adj_{arg.label}")
-
-        template_fmt_args.update(
-            {
-                "reverse_args": indent(reverse_args),
-                "reverse_params": indent(reverse_params, 3),
-            }
-        )
        template += cpu_module_template_backward

    s = template.format(**template_fmt_args)
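
The new eval_indices() path above resolves slice bounds on vector and matrix types at compile time. A rough sketch of kernel code this appears to enable (hypothetical example, assuming a constant slice of a wp.vec3 yields a length-2 vector, as the compile-time bound handling suggests):

    import warp as wp

    @wp.kernel
    def planar_norm(points: wp.array(dtype=wp.vec3), out: wp.array(dtype=wp.float32)):
        tid = wp.tid()
        p = points[tid]
        # slice bounds (0:2) are compile-time constants, handled by eval_indices()
        xy = p[0:2]
        out[tid] = wp.length(xy)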
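
Separately, codegen_func() now emits a user function's reverse pass only when adj.used_by_backward_kernel is set, so helper functions that are never reached from a gradient-enabled kernel skip adjoint code generation. A hedged sketch of the user-facing effect, assuming the standard wp.set_module_options() API:

    import warp as wp

    # Disable gradients for this module; with the change above, helper's
    # adjoint body should be skipped rather than emitted as an unused stub.
    wp.set_module_options({"enable_backward": False})

    @wp.func
    def helper(x: float) -> float:
        return x * x + 1.0

    @wp.kernel
    def apply(a: wp.array(dtype=float)):
        tid = wp.tid()
        a[tid] = helper(a[tid])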