warp-lang 1.2.1-py3-none-win_amd64.whl → 1.3.0-py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (194)
  1. warp/__init__.py +8 -6
  2. warp/autograd.py +823 -0
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +6 -2
  6. warp/builtins.py +1410 -886
  7. warp/codegen.py +503 -166
  8. warp/config.py +48 -18
  9. warp/context.py +401 -199
  10. warp/dlpack.py +8 -0
  11. warp/examples/assets/bunny.usd +0 -0
  12. warp/examples/benchmarks/benchmark_cloth_warp.py +1 -1
  13. warp/examples/benchmarks/benchmark_interop_torch.py +158 -0
  14. warp/examples/benchmarks/benchmark_launches.py +1 -1
  15. warp/examples/core/example_cupy.py +78 -0
  16. warp/examples/fem/example_apic_fluid.py +17 -36
  17. warp/examples/fem/example_burgers.py +9 -18
  18. warp/examples/fem/example_convection_diffusion.py +7 -17
  19. warp/examples/fem/example_convection_diffusion_dg.py +27 -47
  20. warp/examples/fem/example_deformed_geometry.py +11 -22
  21. warp/examples/fem/example_diffusion.py +7 -18
  22. warp/examples/fem/example_diffusion_3d.py +24 -28
  23. warp/examples/fem/example_diffusion_mgpu.py +7 -14
  24. warp/examples/fem/example_magnetostatics.py +190 -0
  25. warp/examples/fem/example_mixed_elasticity.py +111 -80
  26. warp/examples/fem/example_navier_stokes.py +30 -34
  27. warp/examples/fem/example_nonconforming_contact.py +290 -0
  28. warp/examples/fem/example_stokes.py +17 -32
  29. warp/examples/fem/example_stokes_transfer.py +12 -21
  30. warp/examples/fem/example_streamlines.py +350 -0
  31. warp/examples/fem/utils.py +936 -0
  32. warp/fabric.py +5 -2
  33. warp/fem/__init__.py +13 -3
  34. warp/fem/cache.py +161 -11
  35. warp/fem/dirichlet.py +37 -28
  36. warp/fem/domain.py +105 -14
  37. warp/fem/field/__init__.py +14 -3
  38. warp/fem/field/field.py +454 -11
  39. warp/fem/field/nodal_field.py +33 -18
  40. warp/fem/geometry/deformed_geometry.py +50 -15
  41. warp/fem/geometry/hexmesh.py +12 -24
  42. warp/fem/geometry/nanogrid.py +106 -31
  43. warp/fem/geometry/quadmesh_2d.py +6 -11
  44. warp/fem/geometry/tetmesh.py +103 -61
  45. warp/fem/geometry/trimesh_2d.py +98 -47
  46. warp/fem/integrate.py +231 -186
  47. warp/fem/operator.py +14 -9
  48. warp/fem/quadrature/pic_quadrature.py +35 -9
  49. warp/fem/quadrature/quadrature.py +119 -32
  50. warp/fem/space/basis_space.py +98 -22
  51. warp/fem/space/collocated_function_space.py +3 -1
  52. warp/fem/space/function_space.py +7 -2
  53. warp/fem/space/grid_2d_function_space.py +3 -3
  54. warp/fem/space/grid_3d_function_space.py +4 -4
  55. warp/fem/space/hexmesh_function_space.py +3 -2
  56. warp/fem/space/nanogrid_function_space.py +12 -14
  57. warp/fem/space/partition.py +45 -47
  58. warp/fem/space/restriction.py +19 -16
  59. warp/fem/space/shape/cube_shape_function.py +91 -3
  60. warp/fem/space/shape/shape_function.py +7 -0
  61. warp/fem/space/shape/square_shape_function.py +32 -0
  62. warp/fem/space/shape/tet_shape_function.py +11 -7
  63. warp/fem/space/shape/triangle_shape_function.py +10 -1
  64. warp/fem/space/topology.py +116 -42
  65. warp/fem/types.py +8 -1
  66. warp/fem/utils.py +301 -83
  67. warp/native/array.h +16 -0
  68. warp/native/builtin.h +0 -15
  69. warp/native/cuda_util.cpp +14 -6
  70. warp/native/exports.h +1348 -1308
  71. warp/native/quat.h +79 -0
  72. warp/native/rand.h +27 -4
  73. warp/native/sparse.cpp +83 -81
  74. warp/native/sparse.cu +381 -453
  75. warp/native/vec.h +64 -0
  76. warp/native/volume.cpp +40 -49
  77. warp/native/volume_builder.cu +2 -3
  78. warp/native/volume_builder.h +12 -17
  79. warp/native/warp.cu +3 -3
  80. warp/native/warp.h +69 -59
  81. warp/render/render_opengl.py +17 -9
  82. warp/sim/articulation.py +117 -17
  83. warp/sim/collide.py +35 -29
  84. warp/sim/model.py +123 -18
  85. warp/sim/render.py +3 -1
  86. warp/sparse.py +867 -203
  87. warp/stubs.py +312 -541
  88. warp/tape.py +29 -1
  89. warp/tests/disabled_kinematics.py +1 -1
  90. warp/tests/test_adam.py +1 -1
  91. warp/tests/test_arithmetic.py +1 -1
  92. warp/tests/test_array.py +58 -1
  93. warp/tests/test_array_reduce.py +1 -1
  94. warp/tests/test_async.py +1 -1
  95. warp/tests/test_atomic.py +1 -1
  96. warp/tests/test_bool.py +1 -1
  97. warp/tests/test_builtins_resolution.py +1 -1
  98. warp/tests/test_bvh.py +6 -1
  99. warp/tests/test_closest_point_edge_edge.py +1 -1
  100. warp/tests/test_codegen.py +66 -1
  101. warp/tests/test_compile_consts.py +1 -1
  102. warp/tests/test_conditional.py +1 -1
  103. warp/tests/test_copy.py +1 -1
  104. warp/tests/test_ctypes.py +1 -1
  105. warp/tests/test_dense.py +1 -1
  106. warp/tests/test_devices.py +1 -1
  107. warp/tests/test_dlpack.py +1 -1
  108. warp/tests/test_examples.py +33 -4
  109. warp/tests/test_fabricarray.py +5 -2
  110. warp/tests/test_fast_math.py +1 -1
  111. warp/tests/test_fem.py +213 -6
  112. warp/tests/test_fp16.py +1 -1
  113. warp/tests/test_func.py +1 -1
  114. warp/tests/test_future_annotations.py +90 -0
  115. warp/tests/test_generics.py +1 -1
  116. warp/tests/test_grad.py +1 -1
  117. warp/tests/test_grad_customs.py +1 -1
  118. warp/tests/test_grad_debug.py +247 -0
  119. warp/tests/test_hash_grid.py +6 -1
  120. warp/tests/test_implicit_init.py +354 -0
  121. warp/tests/test_import.py +1 -1
  122. warp/tests/test_indexedarray.py +1 -1
  123. warp/tests/test_intersect.py +1 -1
  124. warp/tests/test_jax.py +1 -1
  125. warp/tests/test_large.py +1 -1
  126. warp/tests/test_launch.py +1 -1
  127. warp/tests/test_lerp.py +1 -1
  128. warp/tests/test_linear_solvers.py +1 -1
  129. warp/tests/test_lvalue.py +1 -1
  130. warp/tests/test_marching_cubes.py +5 -2
  131. warp/tests/test_mat.py +34 -35
  132. warp/tests/test_mat_lite.py +2 -1
  133. warp/tests/test_mat_scalar_ops.py +1 -1
  134. warp/tests/test_math.py +1 -1
  135. warp/tests/test_matmul.py +20 -16
  136. warp/tests/test_matmul_lite.py +1 -1
  137. warp/tests/test_mempool.py +1 -1
  138. warp/tests/test_mesh.py +5 -2
  139. warp/tests/test_mesh_query_aabb.py +1 -1
  140. warp/tests/test_mesh_query_point.py +1 -1
  141. warp/tests/test_mesh_query_ray.py +1 -1
  142. warp/tests/test_mlp.py +1 -1
  143. warp/tests/test_model.py +1 -1
  144. warp/tests/test_module_hashing.py +77 -1
  145. warp/tests/test_modules_lite.py +1 -1
  146. warp/tests/test_multigpu.py +1 -1
  147. warp/tests/test_noise.py +1 -1
  148. warp/tests/test_operators.py +1 -1
  149. warp/tests/test_options.py +1 -1
  150. warp/tests/test_overwrite.py +542 -0
  151. warp/tests/test_peer.py +1 -1
  152. warp/tests/test_pinned.py +1 -1
  153. warp/tests/test_print.py +1 -1
  154. warp/tests/test_quat.py +15 -1
  155. warp/tests/test_rand.py +1 -1
  156. warp/tests/test_reload.py +1 -1
  157. warp/tests/test_rounding.py +1 -1
  158. warp/tests/test_runlength_encode.py +1 -1
  159. warp/tests/test_scalar_ops.py +95 -0
  160. warp/tests/test_sim_grad.py +1 -1
  161. warp/tests/test_sim_kinematics.py +1 -1
  162. warp/tests/test_smoothstep.py +1 -1
  163. warp/tests/test_sparse.py +82 -15
  164. warp/tests/test_spatial.py +1 -1
  165. warp/tests/test_special_values.py +2 -11
  166. warp/tests/test_streams.py +11 -1
  167. warp/tests/test_struct.py +1 -1
  168. warp/tests/test_tape.py +1 -1
  169. warp/tests/test_torch.py +194 -1
  170. warp/tests/test_transient_module.py +1 -1
  171. warp/tests/test_types.py +1 -1
  172. warp/tests/test_utils.py +1 -1
  173. warp/tests/test_vec.py +15 -63
  174. warp/tests/test_vec_lite.py +2 -1
  175. warp/tests/test_vec_scalar_ops.py +122 -39
  176. warp/tests/test_verify_fp.py +1 -1
  177. warp/tests/test_volume.py +28 -2
  178. warp/tests/test_volume_write.py +1 -1
  179. warp/tests/unittest_serial.py +1 -1
  180. warp/tests/unittest_suites.py +9 -1
  181. warp/tests/walkthrough_debug.py +1 -1
  182. warp/thirdparty/unittest_parallel.py +2 -5
  183. warp/torch.py +103 -41
  184. warp/types.py +344 -227
  185. warp/utils.py +11 -2
  186. {warp_lang-1.2.1.dist-info → warp_lang-1.3.0.dist-info}/METADATA +99 -46
  187. warp_lang-1.3.0.dist-info/RECORD +368 -0
  188. warp/examples/fem/bsr_utils.py +0 -378
  189. warp/examples/fem/mesh_utils.py +0 -133
  190. warp/examples/fem/plot_utils.py +0 -292
  191. warp_lang-1.2.1.dist-info/RECORD +0 -359
  192. {warp_lang-1.2.1.dist-info → warp_lang-1.3.0.dist-info}/LICENSE.md +0 -0
  193. {warp_lang-1.2.1.dist-info → warp_lang-1.3.0.dist-info}/WHEEL +0 -0
  194. {warp_lang-1.2.1.dist-info → warp_lang-1.3.0.dist-info}/top_level.txt +0 -0
warp/codegen.py CHANGED
@@ -10,13 +10,14 @@ from __future__ import annotations
  import ast
  import builtins
  import ctypes
+ import functools
  import inspect
  import math
  import re
  import sys
  import textwrap
  import types
- from typing import Any, Callable, Dict, Mapping
+ from typing import Any, Callable, Dict, Mapping, Optional, Sequence

  import warp.config
  from warp.types import *
@@ -84,17 +85,108 @@ comparison_chain_strings = [
  ]


+ def values_check_equal(a, b):
+     if isinstance(a, Sequence) and isinstance(b, Sequence):
+         if len(a) != len(b):
+             return False
+
+         return all(x == y for x, y in zip(a, b))
+
+     return a == b
+
+
  def op_str_is_chainable(op: str) -> builtins.bool:
      return op in comparison_chain_strings


+ def get_closure_cell_contents(obj):
+     """Retrieve a closure's cell contents or `None` if it's empty."""
+     try:
+         return obj.cell_contents
+     except ValueError:
+         pass
+
+     return None
+
+
+ def eval_annotations(annotations: Mapping[str, Any], obj: Any) -> Mapping[str, Any]:
+     """Un-stringize annotations caused by `from __future__ import annotations` of PEP 563."""
+     # Implementation backported from `inspect.get_annotations()` for Python 3.9 and older.
+     if not annotations:
+         return {}
+
+     if not any(isinstance(x, str) for x in annotations.values()):
+         # No annotation to un-stringize.
+         return annotations
+
+     if isinstance(obj, type):
+         # class
+         globals = {}
+         module_name = getattr(obj, "__module__", None)
+         if module_name:
+             module = sys.modules.get(module_name, None)
+             if module:
+                 globals = getattr(module, "__dict__", {})
+         locals = dict(vars(obj))
+         unwrap = obj
+     elif isinstance(obj, types.ModuleType):
+         # module
+         globals = obj.__dict__
+         locals = {}
+         unwrap = None
+     elif callable(obj):
+         # function
+         globals = getattr(obj, "__globals__", {})
+         # Capture the variables from the surrounding scope.
+         closure_vars = zip(
+             obj.__code__.co_freevars, tuple(get_closure_cell_contents(x) for x in (obj.__closure__ or ()))
+         )
+         locals = {k: v for k, v in closure_vars if v is not None}
+         unwrap = obj
+     else:
+         raise TypeError(f"{obj!r} is not a module, class, or callable.")
+
+     if unwrap is not None:
+         while True:
+             if hasattr(unwrap, "__wrapped__"):
+                 unwrap = unwrap.__wrapped__
+                 continue
+             if isinstance(unwrap, functools.partial):
+                 unwrap = unwrap.func
+                 continue
+             break
+         if hasattr(unwrap, "__globals__"):
+             globals = unwrap.__globals__
+
+     # "Inject" type parameters into the local namespace
+     # (unless they are shadowed by assignments *in* the local namespace),
+     # as a way of emulating annotation scopes when calling `eval()`.
+     type_params = getattr(obj, "__type_params__", ())
+     if type_params:
+         locals = {param.__name__: param for param in type_params} | locals
+
+     return {k: v if not isinstance(v, str) else eval(v, globals, locals) for k, v in annotations.items()}
+
+
  def get_annotations(obj: Any) -> Mapping[str, Any]:
-     """Alternative to `inspect.get_annotations()` for Python 3.9 and older."""
+     """Same as `inspect.get_annotations()` but always returning un-stringized annotations."""
+     # This backports `inspect.get_annotations()` for Python 3.9 and older.
      # See https://docs.python.org/3/howto/annotations.html#accessing-the-annotations-dict-of-an-object-in-python-3-9-and-older
      if isinstance(obj, type):
-         return obj.__dict__.get("__annotations__", {})
+         annotations = obj.__dict__.get("__annotations__", {})
+     else:
+         annotations = getattr(obj, "__annotations__", {})
+
+     # Evaluating annotations can be done using the `eval_str` parameter with
+     # the official function from the `inspect` module.
+     return eval_annotations(annotations, obj)

-     return getattr(obj, "__annotations__", {})
+
+ def get_full_arg_spec(func: Callable) -> inspect.FullArgSpec:
+     """Same as `inspect.getfullargspec()` but always returning un-stringized annotations."""
+     # See https://docs.python.org/3/howto/annotations.html#manually-un-stringizing-stringized-annotations
+     spec = inspect.getfullargspec(func)
+     return spec._replace(annotations=eval_annotations(spec.annotations, func))


  def struct_instance_repr_recursive(inst: StructInstance, depth: int) -> str:
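The new `eval_annotations()` helper addresses PEP 563 (`from __future__ import annotations`), which stores every annotation as a string and previously broke Warp's type-driven codegen for modules using that import (exercised by the new `warp/tests/test_future_annotations.py`). A minimal standalone sketch of the problem and the eval-based fix, independent of Warp:

```python
from __future__ import annotations  # PEP 563: annotations are stored as strings


def scale(x: float, n: int) -> float:
    return x * n


# With PEP 563 active, the raw annotations are strings rather than types.
print(scale.__annotations__)  # {'x': 'float', 'n': 'int', 'return': 'float'}

# Un-stringizing evaluates each string in the function's global namespace,
# which is the core of what `eval_annotations()` does (plus the closure,
# module, and `functools.partial` handling seen above).
resolved = {
    name: eval(ann, scale.__globals__) if isinstance(ann, str) else ann
    for name, ann in scale.__annotations__.items()
}
print(resolved)  # {'x': <class 'float'>, 'n': <class 'int'>, 'return': <class 'float'>}
```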
@@ -490,6 +582,14 @@ class Var:
          self.constant = constant
          self.prefix = prefix

+         # records whether this Var has been read from in a kernel function (array only)
+         self.is_read = False
+         # records whether this Var has been written to in a kernel function (array only)
+         self.is_write = False
+
+         # used to associate a view array Var with its parent array Var
+         self.parent = None
+
      def __str__(self):
          return self.label

@@ -532,6 +632,42 @@ class Var:
      def emit_adj(self):
          return self.emit("adj")

+     def mark_read(self):
+         """Marks this Var as having been read from in a kernel (array only)."""
+         if not is_array(self.type):
+             return
+
+         self.is_read = True
+
+         # recursively update all parent states
+         parent = self.parent
+         while parent is not None:
+             parent.is_read = True
+             parent = parent.parent
+
+     def mark_write(self, **kwargs):
+         """Marks this Var as having been written to in a kernel (array only)."""
+         if not is_array(self.type):
+             return
+
+         # detect if we are writing to an array after reading from it within the same kernel
+         if self.is_read and warp.config.verify_autograd_array_access:
+             if "kernel_name" in kwargs and "filename" in kwargs and "lineno" in kwargs:
+                 print(
+                     f"Warning: Array passed to argument {self.label} in kernel {kwargs['kernel_name']} at {kwargs['filename']}:{kwargs['lineno']} is being written to after it has been read from within the same kernel. This may corrupt gradient computation in the backward pass."
+                 )
+             else:
+                 print(
+                     f"Warning: Array {self} is being written to after it has been read from within the same kernel. This may corrupt gradient computation in the backward pass."
+                 )
+         self.is_write = True
+
+         # recursively update all parent states
+         parent = self.parent
+         while parent is not None:
+             parent.is_write = True
+             parent = parent.parent
+

  class Block:
      # Represents a basic block of instructions, e.g.: list
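The `is_read`/`is_write` flags and the `parent` link back the new `wp.config.verify_autograd_array_access` option (covered by `warp/tests/test_overwrite.py` in the file list): writing to an array after reading from it within one kernel can invalidate the values the backward pass needs to recompute gradients. A hedged sketch of the kind of kernel this flags, assuming a working Warp 1.3.0 install; the warning text is the one printed by `mark_write()` above:

```python
import warp as wp

wp.config.verify_autograd_array_access = True  # enable before kernels are compiled


@wp.kernel
def read_then_overwrite(x: wp.array(dtype=float), y: wp.array(dtype=float)):
    tid = wp.tid()
    y[tid] = wp.sin(x[tid])  # x is marked as read
    x[tid] = 0.0             # write-after-read on x: warning printed at build time


x = wp.zeros(8, dtype=float, requires_grad=True)
y = wp.zeros(8, dtype=float, requires_grad=True)
wp.launch(read_then_overwrite, dim=8, inputs=[x, y])
```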
@@ -547,6 +683,91 @@ class Block:
          self.vars = []


+ def apply_defaults(
+     bound_args: inspect.BoundArguments,
+     values: Mapping[str, Any],
+ ):
+     # Similar to Python's `inspect.BoundArguments.apply_defaults()`
+     # but with the possibility to pass an augmented set of default values.
+     arguments = bound_args.arguments
+     new_arguments = []
+     for name in bound_args._signature.parameters.keys():
+         try:
+             new_arguments.append((name, arguments[name]))
+         except KeyError:
+             if name in values:
+                 new_arguments.append((name, values[name]))
+
+     bound_args.arguments = dict(new_arguments)
+
+
+ def func_match_args(func, arg_types, kwarg_types):
+     try:
+         # Try to bind the given arguments to the function's signature.
+         # This is not checking whether the argument types are matching,
+         # rather it's just assigning each argument to the corresponding
+         # function parameter.
+         bound_arg_types = func.signature.bind(*arg_types, **kwarg_types)
+     except TypeError:
+         return False
+
+     # Populate the bound arguments with any default values.
+     default_arg_types = {
+         k: None if v is None else get_arg_type(v)
+         for k, v in func.defaults.items()
+         if k not in bound_arg_types.arguments
+     }
+     apply_defaults(bound_arg_types, default_arg_types)
+     bound_arg_types = tuple(bound_arg_types.arguments.values())
+
+     # Check the given argument types against the ones defined on the function.
+     for bound_arg_type, func_arg_type in zip(bound_arg_types, func.input_types.values()):
+         # Let the `value_func` callback infer the type.
+         if bound_arg_type is None:
+             continue
+
+         # if arg type registered as Any, treat as
+         # template allowing any type to match
+         if func_arg_type == Any:
+             continue
+
+         # handle function refs as a special case
+         if func_arg_type == Callable and isinstance(bound_arg_type, warp.context.Function):
+             continue
+
+         # check arg type matches input variable type
+         if not types_equal(func_arg_type, strip_reference(bound_arg_type), match_generic=True):
+             return False
+
+     return True
+
+
+ def get_arg_type(arg: Union[Var, Any]):
+     if isinstance(arg, Sequence):
+         return tuple(get_arg_type(x) for x in arg)
+
+     if isinstance(arg, (type, warp.context.Function)):
+         return arg
+
+     if isinstance(arg, Var):
+         return arg.type
+
+     return type(arg)
+
+
+ def get_arg_value(arg: Union[Var, Any]):
+     if isinstance(arg, Sequence):
+         return tuple(get_arg_value(x) for x in arg)
+
+     if isinstance(arg, (type, warp.context.Function)):
+         return arg
+
+     if isinstance(arg, Var):
+         return arg.constant
+
+     return arg
+
+
  class Adjoint:
      # Source code transformer, this class takes a Python function and
      # generates forward and backward SSA forms of the function instructions
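`apply_defaults()` mirrors `inspect.BoundArguments.apply_defaults()` but takes an arbitrary mapping, so codegen can fill unbound parameters either from a function's declared defaults or from the "type args" recovered from a templated constructor. A small standalone illustration of the binding step it builds on, using only `inspect` (the `vector` signature here is hypothetical, for demonstration):

```python
import inspect


def vector(length=None, dtype=None):
    return (length, dtype)


sig = inspect.signature(vector)
bound = sig.bind(dtype=float)
print(bound.arguments)  # {'dtype': <class 'float'>}

# Fill the missing parameters from an augmented set of values while
# preserving the signature's parameter order, as `apply_defaults()` does.
augmented = {"length": 3}
bound.arguments = {
    name: bound.arguments[name] if name in bound.arguments else augmented[name]
    for name in sig.parameters
    if name in bound.arguments or name in augmented
}
print(bound.arguments)  # {'length': 3, 'dtype': <class 'float'>}
```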
@@ -605,7 +826,7 @@
          adj.custom_reverse_num_input_args = custom_reverse_num_input_args

          # parse argument types
-         argspec = inspect.getfullargspec(func)
+         argspec = get_full_arg_spec(func)

          # ensure all arguments are annotated
          if overload_annotations is None:
@@ -646,6 +867,11 @@

      # generate function ssa form and adjoint
      def build(adj, builder, default_builder_options=None):
+         # arg Var read/write flags are held during module rebuilds, so we reset them here even when skipping a build
+         for arg in adj.args:
+             arg.is_read = False
+             arg.is_write = False
+
          if adj.skip_build:
              return

@@ -682,15 +908,11 @@
          # recursively evaluate function body
          try:
              adj.eval(adj.tree.body[0])
-         except Exception as e:
+         except Exception:
              try:
-                 if isinstance(e, KeyError) and getattr(e.args[0], "__module__", None) == "ast":
-                     msg = f'Syntax error: unsupported construct "ast.{e.args[0].__name__}"'
-                 else:
-                     msg = "Error"
                  lineno = adj.lineno + adj.fun_lineno
                  line = adj.source_lines[adj.lineno]
-                 msg += f' while parsing function "{adj.fun_name}" at {adj.filename}:{lineno}:\n{line}\n'
+                 msg = f'Error while parsing function "{adj.fun_name}" at {adj.filename}:{lineno}:\n{line}\n'
                  ex, data, traceback = sys.exc_info()
                  e = ex(";".join([msg] + [str(a) for a in data.args])).with_traceback(traceback)
              finally:
@@ -808,6 +1030,20 @@

          return v

+     def register_var(adj, var):
+         # We sometimes initialize `Var` instances that might be thrown away
+         # afterwards, so this method lets us defer their registration among
+         # the list of primal vars until later on, instead of registering them
+         # immediately as `adj.add_var()` or `adj.add_constant()` would.
+
+         if isinstance(var, (Reference, warp.context.Function)):
+             return var
+
+         if var.label is None:
+             return adj.add_var(var.type, var.constant)
+
+         return var
+
      # append a statement to the forward pass
      def add_forward(adj, statement, replay=None, skip_replay=False):
          adj.blocks[-1].body_forward.append(adj.indentation + statement)
@@ -873,12 +1109,10 @@

          return output

-     def resolve_func(adj, func, args, min_outputs, templates, kwds):
-         arg_types = [strip_reference(a.type) for a in args if not isinstance(a, warp.context.Function)]
-
+     def resolve_func(adj, func, arg_types, kwarg_types, min_outputs):
          if not func.is_builtin():
              # user-defined function
-             overload = func.get_overload(arg_types)
+             overload = func.get_overload(arg_types, kwarg_types)
              if overload is not None:
                  return overload
          else:
@@ -888,88 +1122,89 @@
              # skip type checking for variadic functions
              if not f.variadic:
                  # check argument counts are compatible (there may be some default args)
-                 if len(f.input_types) < len(args):
+                 if len(f.input_types) < len(arg_types) + len(kwarg_types):
                      continue

-                 def match_args(args, f):
-                     # check argument types equal
-                     for i, (arg_name, arg_type) in enumerate(f.input_types.items()):
-                         # if arg type registered as Any, treat as
-                         # template allowing any type to match
-                         if arg_type == Any:
-                             continue
-
-                         # handle function refs as a special case
-                         if arg_type == Callable and type(args[i]) is warp.context.Function:
-                             continue
-
-                         if arg_type == Reference and is_reference(args[i].type):
-                             continue
-
-                         # look for default values for missing args
-                         if i >= len(args):
-                             if arg_name not in f.defaults:
-                                 return False
-                         else:
-                             # otherwise check arg type matches input variable type
-                             if not types_equal(arg_type, strip_reference(args[i].type), match_generic=True):
-                                 return False
-
-                     return True
-
-                 if not match_args(args, f):
+                 if not func_match_args(f, arg_types, kwarg_types):
                      continue

                  # check output dimensions match expectations
                  if min_outputs:
-                     try:
-                         value_type = f.value_func(args, kwds, templates)
-                         if not hasattr(value_type, "__len__") or len(value_type) != min_outputs:
-                             continue
-                     except Exception:
-                         # value func may fail if the user has given
-                         # incorrect args, so we need to catch this
+                     if not isinstance(f.value_type, Sequence) or len(f.value_type) != min_outputs:
                          continue

              # found a match, use it
              return f

          # unresolved function, report error
-         arg_types = []
+         arg_type_reprs = []

-         for x in args:
-             if isinstance(x, Var):
+         for x in arg_types:
+             if isinstance(x, warp.context.Function):
+                 arg_type_reprs.append("function")
+             else:
                  # shorten Warp primitive type names
-                 if isinstance(x.type, list):
-                     if len(x.type) != 1:
+                 if isinstance(x, Sequence):
+                     if len(x) != 1:
                          raise WarpCodegenError("Argument must not be the result from a multi-valued function")
-                     arg_type = x.type[0]
+                     arg_type = x[0]
                  else:
-                     arg_type = x.type
+                     arg_type = x

-                 arg_types.append(type_repr(arg_type))
-
-             if isinstance(x, warp.context.Function):
-                 arg_types.append("function")
+                 arg_type_reprs.append(type_repr(arg_type))

          raise WarpCodegenError(
-             f"Couldn't find function overload for '{func.key}' that matched inputs with types: [{', '.join(arg_types)}]"
+             f"Couldn't find function overload for '{func.key}' that matched inputs with types: [{', '.join(arg_type_reprs)}]"
          )

-     def add_call(adj, func, args, min_outputs=None, templates=None, kwds=None):
-         if templates is None:
-             templates = []
+     def add_call(adj, func, args, kwargs, type_args, min_outputs=None):
+         # Extract the types and values passed as arguments to the function call.
+         arg_types = tuple(strip_reference(get_arg_type(x)) for x in args)
+         kwarg_types = {k: strip_reference(get_arg_type(v)) for k, v in kwargs.items()}
+
+         # Resolve the exact function signature among any existing overloads.
+         func = adj.resolve_func(func, arg_types, kwarg_types, min_outputs)
+
+         # Bind the positional and keyword arguments to the function's signature
+         # in order to process them as Python does it.
+         bound_args = func.signature.bind(*args, **kwargs)
+
+         # Type args are the "compile time" argument values we get from codegen.
+         # For example, when calling `wp.vec3f(...)` from within a kernel,
+         # this in fact translates to calling the `vector()` built-in augmented
+         # with the type args `length=3, dtype=float`.
+         # Eventually, these need to be passed to the underlying C++ function,
+         # so we update the arguments with the type args here.
+         if type_args:
+             for arg in type_args:
+                 if arg in bound_args.arguments:
+                     # In case of conflict, ideally we'd throw an error since
+                     # what comes from codegen should be the source of truth
+                     # and users also passing the same value as an argument
+                     # is redundant (e.g.: `wp.mat22(shape=(2, 2))`).
+                     # However, for backward compatibility, we allow that form
+                     # as long as the values are equal.
+                     if values_check_equal(get_arg_value(bound_args.arguments[arg]), type_args[arg]):
+                         continue

-         func = adj.resolve_func(func, args, min_outputs, templates, kwds)
+                     raise RuntimeError(
+                         f"Remove the extraneous `{arg}` parameter "
+                         f"when calling the templated version of "
+                         f"`wp.{func.native_func}()`"
+                     )

-         # push any default values onto args
-         for i, (arg_name, _arg_type) in enumerate(func.input_types.items()):
-             if i >= len(args):
-                 if arg_name in func.defaults:
-                     const = adj.add_constant(func.defaults[arg_name])
-                     args.append(const)
-                 else:
-                     break
+             type_vars = {k: Var(None, type=type(v), constant=v) for k, v in type_args.items()}
+             apply_defaults(bound_args, type_vars)
+
+         if func.defaults:
+             default_vars = {
+                 k: Var(None, type=type(v), constant=v)
+                 for k, v in func.defaults.items()
+                 if k not in bound_args.arguments and v is not None
+             }
+             apply_defaults(bound_args, default_vars)
+
+         bound_args = bound_args.arguments

          # if it is a user-function then build it recursively
          if not func.is_builtin() and func not in adj.builder.functions:
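The `type_args` plumbing is what lets typed constructors keep working after the removal of the old `templates` mechanism: inside a kernel, `wp.vec3f(...)` now lowers to the generic `vector()` built-in with `length=3, dtype=wp.float32` merged in as compile-time arguments, and re-passing a conflicting value raises the `RuntimeError` shown above. A hedged sketch of the two equivalent spellings, assuming Warp 1.3.0:

```python
import warp as wp


@wp.kernel
def make_vectors(out: wp.array(dtype=wp.vec3f)):
    tid = wp.tid()
    a = wp.vec3f(1.0, 2.0, 3.0)                # templated form; type args are implied
    b = wp.vector(length=3, dtype=wp.float32)  # generic form; zero-initialized
    out[tid] = a + b
```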
@@ -983,23 +1218,38 @@
          if func.custom_replay_func:
              adj.builder.deferred_functions.append(func.custom_replay_func)

-         # evaluate the function type based on inputs
-         arg_types = [strip_reference(a.type) for a in args if not isinstance(a, warp.context.Function)]
-         return_type = func.value_func(arg_types, kwds, templates)
+         # Resolve the return value based on the types and values of the given arguments.
+         bound_arg_types = {k: get_arg_type(v) for k, v in bound_args.items()}
+         bound_arg_values = {k: get_arg_value(v) for k, v in bound_args.items()}
+         return_type = func.value_func(
+             {k: strip_reference(v) for k, v in bound_arg_types.items()},
+             bound_arg_values,
+         )
+
+         if func.dispatch_func is not None:
+             # If we have a built-in that requires special handling to dispatch
+             # the arguments to the underlying C++ function, then we can resolve
+             # these using the `dispatch_func`. Since this is only called from
+             # within codegen, we pass it directly `codegen.Var` objects,
+             # which allows for some more advanced resolution to be performed,
+             # for example by checking whether an argument corresponds to
+             # a literal value or references a variable.

-         func_name = compute_type_str(func.native_func, templates)
-         param_types = list(func.input_types.values())
+             func_args, template_args = func.dispatch_func(func.input_types, return_type, bound_args)
+         else:
+             func_args = tuple(bound_args.values())
+             template_args = ()

-         use_initializer_list = func.initializer_list_func(args, templates)
+         func_args = tuple(adj.register_var(x) for x in func_args)
+         func_name = compute_type_str(func.native_func, template_args)
+         use_initializer_list = func.initializer_list_func(bound_args, return_type)

-         args_var = [
-             (
-                 adj.load(a)
-                 if not ((param_types[i] == Reference or param_types[i] == Callable) if i < len(param_types) else False)
-                 else a
-             )
-             for i, a in enumerate(args)
-         ]
+         fwd_args = []
+         for func_arg in func_args:
+             if not isinstance(func_arg, (Reference, warp.context.Function)):
+                 func_arg = adj.load(func_arg)
+
+             fwd_args.append(strip_reference(func_arg))

          if return_type is None:
              # handles expression (zero output) functions, e.g.: void do_something();
@@ -1008,24 +1258,24 @@
              output_list = []

              forward_call = (
-                 f"{func.namespace}{func_name}({adj.format_forward_call_args(args_var, use_initializer_list)});"
+                 f"{func.namespace}{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});"
              )
              replay_call = forward_call
              if func.custom_replay_func is not None or func.replay_snippet is not None:
-                 replay_call = f"{func.namespace}replay_{func_name}({adj.format_forward_call_args(args_var, use_initializer_list)});"
+                 replay_call = f"{func.namespace}replay_{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});"

-         elif not isinstance(return_type, list) or len(return_type) == 1:
+         elif not isinstance(return_type, Sequence) or len(return_type) == 1:
              # handle simple function (one output)

-             if isinstance(return_type, list):
+             if isinstance(return_type, Sequence):
                  return_type = return_type[0]
              output = adj.add_var(return_type)
              output_list = [output]

-             forward_call = f"var_{output} = {func.namespace}{func_name}({adj.format_forward_call_args(args_var, use_initializer_list)});"
+             forward_call = f"var_{output} = {func.namespace}{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});"
              replay_call = forward_call
              if func.custom_replay_func is not None:
-                 replay_call = f"var_{output} = {func.namespace}replay_{func_name}({adj.format_forward_call_args(args_var, use_initializer_list)});"
+                 replay_call = f"var_{output} = {func.namespace}replay_{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});"

          else:
              # handle multiple value functions
@@ -1034,7 +1284,7 @@
              output_list = output

              forward_call = (
-                 f"{func.namespace}{func_name}({adj.format_forward_call_args(args_var + output, use_initializer_list)});"
+                 f"{func.namespace}{func_name}({adj.format_forward_call_args(fwd_args + output, use_initializer_list)});"
              )
              replay_call = forward_call

@@ -1043,13 +1293,14 @@
          else:
              adj.add_forward(forward_call, replay=replay_call)

-         if not func.missing_grad and len(args):
+         if not func.missing_grad and len(func_args):
+             adj_args = tuple(strip_reference(x) for x in func_args)
              reverse_has_output_args = (
                  func.require_original_output_arg or len(output_list) > 1
              ) and func.custom_grad_func is None
              arg_str = adj.format_reverse_call_args(
-                 args_var,
-                 args,
+                 fwd_args,
+                 adj_args,
                  output_list,
                  use_initializer_list,
                  has_output_args=reverse_has_output_args,
@@ -1061,12 +1312,9 @@

          return output

-     def add_builtin_call(adj, func_name, args, min_outputs=None, templates=None, kwds=None):
-         if templates is None:
-             templates = []
-
+     def add_builtin_call(adj, func_name, args, min_outputs=None):
          func = warp.context.builtin_functions[func_name]
-         return adj.add_call(func, args, min_outputs, templates, kwds)
+         return adj.add_call(func, args, {}, {}, min_outputs=min_outputs)

      def add_return(adj, var):
          if var is None or len(var) == 0:
@@ -1505,7 +1753,24 @@
      def emit_BinOp(adj, node):
          # evaluate binary operator arguments
+
+         if warp.config.verify_autograd_array_access:
+             # array overwrite tracking: in-place operators are a special case
+             # x[tid] = x[tid] + 1 is a read followed by a write, but we only want to record the write,
+             # so we save the current arg read flags and restore them after lhs eval
+             is_read_states = []
+             for arg in adj.args:
+                 is_read_states.append(arg.is_read)
+
+         # evaluate lhs binary operator argument
          left = adj.eval(node.left)
+
+         if warp.config.verify_autograd_array_access:
+             # restore arg read flags
+             for i, arg in enumerate(adj.args):
+                 arg.is_read = is_read_states[i]
+
+         # evaluate rhs binary operator argument
          right = adj.eval(node.right)

          name = builtin_operators[type(node.op)]
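The flag save/restore above keeps in-place updates from producing false positives: in `x[tid] = x[tid] + 1.0`, evaluating the right-hand side would mark `x` as read, but that read belongs to the write being performed, so only the write is recorded. A sketch of the two cases, assuming `verify_autograd_array_access` is enabled:

```python
import warp as wp


@wp.kernel
def in_place_update(x: wp.array(dtype=float)):
    tid = wp.tid()
    x[tid] = x[tid] + 1.0  # read flags restored after lhs eval: recorded as a write only


@wp.kernel
def overwrite_after_read(x: wp.array(dtype=float), y: wp.array(dtype=float)):
    tid = wp.tid()
    y[tid] = x[tid]  # plain copy: a genuine read of x that is recorded
    x[tid] = 0.0     # subsequent write to x is reported as a potential gradient hazard
```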
@@ -1569,6 +1834,9 @@
          # e.g.: wp.constant in the globals scope
          obj, _ = adj.resolve_static_expression(a)

+         if obj is None:
+             obj = adj.eval(a)
+
          if isinstance(obj, Var) and obj.constant is not None:
              obj = obj.constant

@@ -1728,13 +1996,40 @@
                  f"arguments to the function {adj.fun_name}, {adj.filename}:{lineno}:\n{line}\n"
              )

+     def resolve_arg(adj, arg):
+         # Always try to start with evaluating the argument since it can help
+         # detect some issues such as global variables being accessed.
+         try:
+             var = adj.eval(arg)
+         except (WarpCodegenError, WarpCodegenKeyError) as e:
+             error = e
+         else:
+             error = None
+
+         # Check if we can resolve the argument as a static expression.
+         # If not, return the variable resulting from evaluating the argument.
+         expr, _ = adj.resolve_static_expression(arg)
+         if expr is None:
+             if error is not None:
+                 raise error
+
+             return var
+
+         if isinstance(expr, (type, Var, warp.context.Function)):
+             return expr
+
+         return adj.add_constant(expr)
+
      def emit_Call(adj, node):
          adj.check_tid_in_func_error(node)

          # try and lookup function in globals by
          # resolving path (e.g.: module.submodule.attr)
          func, path = adj.resolve_static_expression(node.func)
-         templates = []
+         if func is None:
+             func = adj.eval(node.func)
+
+         type_args = {}

          if not isinstance(func, warp.context.Function):
              attr = path[-1]
@@ -1747,7 +2042,6 @@

              # vector class type e.g.: wp.vec3f constructor
              if func is None and hasattr(caller, "_wp_generic_type_str_"):
-                 templates = caller._wp_type_params_
                  func = warp.context.builtin_functions.get(caller._wp_constructor_)

              # scalar class type e.g.: wp.int8 constructor
@@ -1757,43 +2051,53 @@
              # struct constructor
              if func is None and isinstance(caller, Struct):
                  adj.builder.build_struct_recursive(caller)
-                 func = caller.initializer()
+                 if node.args or node.keywords:
+                     func = caller.value_constructor
+                 else:
+                     func = caller.default_constructor
+
+             if hasattr(caller, "_wp_type_args_"):
+                 type_args = caller._wp_type_args_

              if func is None:
                  raise WarpCodegenError(
                      f"Could not find function {'.'.join(path)} as a built-in or user-defined function. Note that user functions must be annotated with a @wp.func decorator to be called from a kernel."
                  )

-         args = []
-
-         # eval all arguments
+         # Check if any argument corresponds to an unsupported construct.
+         # Tuples are supported in the context of assigning multiple variables
+         # at once, but not in place of vectors when calling built-ins like
+         # `wp.length((1, 2, 3))`.
+         # Therefore, we need to catch this specific case here instead of
+         # more generally in `adj.eval()`.
          for arg in node.args:
-             var = adj.eval(arg)
-             args.append(var)
-
-         # eval all keyword args
-         def kwval(kw):
-             if isinstance(kw.value, ast.Num):
-                 return kw.value.n
-             elif isinstance(kw.value, ast.Tuple):
-                 arg_is_numeric, arg_values = zip(*(adj.eval_num(e) for e in kw.value.elts))
-                 if not all(arg_is_numeric):
-                     raise WarpCodegenError(
-                         f"All elements of the tuple keyword argument '{kw.name}' must be numeric constants, got '{arg_values}'"
-                     )
-                 return arg_values
-             else:
-                 return adj.resolve_static_expression(kw.value)[0]
-
-         kwds = {kw.arg: kwval(kw) for kw in node.keywords}
+             if isinstance(arg, ast.Tuple):
+                 raise WarpCodegenError(
+                     "Tuple constructs are not supported in kernels. Use vectors like `wp.vec3()` instead."
+                 )

          # get expected return count, e.g.: for multi-assignment
          min_outputs = None
          if hasattr(node, "expects"):
              min_outputs = node.expects

-         # add var with value type from the function
-         out = adj.add_call(func=func, args=args, kwds=kwds, templates=templates, min_outputs=min_outputs)
+         # Evaluate all positional and keyword arguments.
+         args = tuple(adj.resolve_arg(x) for x in node.args)
+         kwargs = {x.arg: adj.resolve_arg(x.value) for x in node.keywords}
+
+         if warp.config.verify_autograd_array_access:
+             # update arg read/write states according to what happens to that arg in the called function
+             if hasattr(func, "adj"):
+                 for i, arg in enumerate(args):
+                     if func.adj.args[i].is_write:
+                         kernel_name = adj.fun_name
+                         filename = adj.filename
+                         lineno = adj.lineno + adj.fun_lineno
+                         arg.mark_write(kernel_name=kernel_name, filename=filename, lineno=lineno)
+                     if func.adj.args[i].is_read:
+                         arg.mark_read()
+
+         out = adj.add_call(func, args, kwargs, type_args, min_outputs=min_outputs)
          return out

      def emit_Index(adj, node):
@@ -1872,10 +2176,22 @@
              if len(indices) == target_type.ndim:
                  # handles array loads (where each dimension has an index specified)
                  out = adj.add_builtin_call("address", [target, *indices])
+
+                 if warp.config.verify_autograd_array_access:
+                     target.mark_read()
+
              else:
                  # handles array views (fewer indices than dimensions)
                  out = adj.add_builtin_call("view", [target, *indices])

+                 if warp.config.verify_autograd_array_access:
+                     # store reference to target Var to propagate downstream read/write state back to root arg Var
+                     out.parent = target
+
+                     # view arg inherits target Var's read/write states
+                     out.is_read = target.is_read
+                     out.is_write = target.is_write
+
          else:
              # handles non-array type indexing, e.g: vec3, mat33, etc
              out = adj.add_builtin_call("extract", [target, *indices])
@@ -1888,6 +2204,21 @@

          lhs = node.targets[0]

+         if not isinstance(lhs, ast.Tuple):
+             # Check if the rhs corresponds to an unsupported construct.
+             # Tuples are supported in the context of assigning multiple variables
+             # at once, but not for simple assignments like `x = (1, 2, 3)`.
+             # Therefore, we need to catch this specific case here instead of
+             # more generally in `adj.eval()`.
+             if isinstance(node.value, ast.List):
+                 raise WarpCodegenError(
+                     "List constructs are not supported in kernels. Use vectors like `wp.vec3()` for small collections instead."
+                 )
+             elif isinstance(node.value, ast.Tuple):
+                 raise WarpCodegenError(
+                     "Tuple constructs are not supported in kernels. Use vectors like `wp.vec3()` for small collections instead."
+                 )
+
          # handle the case where we are assigning multiple output variables
          if isinstance(lhs, ast.Tuple):
              # record the expected number of outputs on the node
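These checks turn previously confusing failures into explicit codegen errors: kernels cannot build tuples or lists as values. A sketch of rejected versus supported code (hypothetical kernels, assuming Warp 1.3.0; the error surfaces when the module builds at launch):

```python
import warp as wp


@wp.kernel
def rejected(out: wp.array(dtype=wp.vec3)):
    v = (1.0, 2.0, 3.0)  # WarpCodegenError: tuple constructs are not supported in kernels
    out[0] = v


@wp.kernel
def supported(out: wp.array(dtype=wp.vec3)):
    out[0] = wp.vec3(1.0, 2.0, 3.0)  # use a fixed-size vector instead
```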
@@ -1944,7 +2275,14 @@
              if is_array(target_type):
                  adj.add_builtin_call("array_store", [target, *indices, rhs])

-             elif type_is_vector(target_type) or type_is_matrix(target_type):
+                 if warp.config.verify_autograd_array_access:
+                     kernel_name = adj.fun_name
+                     filename = adj.filename
+                     lineno = adj.lineno + adj.fun_lineno
+
+                     target.mark_write(kernel_name=kernel_name, filename=filename, lineno=lineno)
+
+             elif type_is_vector(target_type) or type_is_quaternion(target_type) or type_is_matrix(target_type):
                  if is_reference(target.type):
                      attr = adj.add_builtin_call("indexref", [target, *indices])
                  else:
@@ -1961,7 +2299,7 @@
                      )

              else:
-                 raise WarpCodegenError("Can only subscript assign array, vector, and matrix types")
+                 raise WarpCodegenError("Can only subscript assign array, vector, quaternion, and matrix types")

          elif isinstance(lhs, ast.Name):
              # symbol name
@@ -2050,8 +2388,7 @@

      def emit_Tuple(adj, node):
          # LHS for expressions, such as i, j, k = 1, 2, 3
-         for elem in node.elts:
-             adj.eval(elem)
+         return tuple(adj.eval(x) for x in node.elts)

      def emit_Pass(adj, node):
          pass
@@ -2089,7 +2426,12 @@
          if hasattr(node, "lineno"):
              adj.set_lineno(node.lineno - 1)

-         emit_node = adj.node_visitors[type(node)]
+         try:
+             emit_node = adj.node_visitors[type(node)]
+         except KeyError as e:
+             type_name = type(node).__name__
+             namespace = "ast." if isinstance(node, ast.AST) else ""
+             raise WarpCodegenError(f"Construct `{namespace}{type_name}` not supported in kernels.") from e

          return emit_node(adj, node)

@@ -2120,18 +2462,18 @@
          vars_dict = {**adj.func.__globals__, **capturedvars}

          if path[0] in vars_dict:
-             func = vars_dict[path[0]]
+             expr = vars_dict[path[0]]

          # Support Warp types in kernels without the module suffix (e.g. v = vec3(0.0, 0.2, 0.4)):
          else:
-             func = getattr(warp, path[0], None)
+             expr = getattr(warp, path[0], None)

-         if func:
+         if expr:
              for i in range(1, len(path)):
-                 if hasattr(func, path[i]):
-                     func = getattr(func, path[i])
+                 if hasattr(expr, path[i]):
+                     expr = getattr(expr, path[i])

-             return func
+             return expr

      # Evaluates a static expression that does not depend on runtime values
      # if eval_types is True, try resolving the path using evaluated type information as well
@@ -2182,11 +2524,6 @@
          if captured_obj is not None:
              return captured_obj, path

-         # Still nothing found, maybe this is a predefined type attribute like `dtype`
-         if eval_types:
-             val = adj.eval(root_node)
-             return [val, path]
-
          return None, path

      # annotate generated code with the original source code line
@@ -2262,10 +2599,10 @@ cpu_module_header = """
  #define int(x) cast_int(x)
  #define adj_int(x, adj_x, adj_ret) adj_cast_int(x, adj_x, adj_ret)

- #define builtin_tid1d() wp::tid(wp::s_threadIdx)
- #define builtin_tid2d(x, y) wp::tid(x, y, wp::s_threadIdx, dim)
- #define builtin_tid3d(x, y, z) wp::tid(x, y, z, wp::s_threadIdx, dim)
- #define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, wp::s_threadIdx, dim)
+ #define builtin_tid1d() wp::tid(task_index)
+ #define builtin_tid2d(x, y) wp::tid(x, y, task_index, dim)
+ #define builtin_tid3d(x, y, z) wp::tid(x, y, z, task_index, dim)
+ #define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, task_index, dim)

  """

@@ -2280,10 +2617,10 @@ cuda_module_header = """
  #define int(x) cast_int(x)
  #define adj_int(x, adj_x, adj_ret) adj_cast_int(x, adj_x, adj_ret)

- #define builtin_tid1d() wp::tid(_idx)
- #define builtin_tid2d(x, y) wp::tid(x, y, _idx, dim)
- #define builtin_tid3d(x, y, z) wp::tid(x, y, z, _idx, dim)
- #define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, _idx, dim)
+ #define builtin_tid1d() wp::tid(task_index)
+ #define builtin_tid2d(x, y) wp::tid(x, y, task_index, dim)
+ #define builtin_tid3d(x, y, z) wp::tid(x, y, z, task_index, dim)
+ #define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, task_index, dim)

  """

@@ -2355,9 +2692,9 @@ cuda_kernel_template = """
  extern "C" __global__ void {name}_cuda_kernel_forward(
      {forward_args})
  {{
-     for (size_t _idx = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
-          _idx < dim.size;
-          _idx += static_cast<size_t>(blockDim.x) * static_cast<size_t>(gridDim.x))
+     for (size_t task_index = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
+          task_index < dim.size;
+          task_index += static_cast<size_t>(blockDim.x) * static_cast<size_t>(gridDim.x))
      {{
  {forward_body}    }}
  }}
@@ -2365,9 +2702,9 @@ extern "C" __global__ void {name}_cuda_kernel_forward(
  extern "C" __global__ void {name}_cuda_kernel_backward(
      {reverse_args})
  {{
-     for (size_t _idx = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
-          _idx < dim.size;
-          _idx += static_cast<size_t>(blockDim.x) * static_cast<size_t>(gridDim.x))
+     for (size_t task_index = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
+          task_index < dim.size;
+          task_index += static_cast<size_t>(blockDim.x) * static_cast<size_t>(gridDim.x))
      {{
  {reverse_body}    }}
  }}
@@ -2396,10 +2733,8 @@ extern "C" {{
  WP_API void {name}_cpu_forward(
      {forward_args})
  {{
-     for (size_t i=0; i < dim.size; ++i)
+     for (size_t task_index = 0; task_index < dim.size; ++task_index)
      {{
-         wp::s_threadIdx = i;
-
          {name}_cpu_kernel_forward(
              {forward_params});
      }}
@@ -2408,10 +2743,8 @@ WP_API void {name}_cpu_forward(
  WP_API void {name}_cpu_backward(
      {reverse_args})
  {{
-     for (size_t i=0; i < dim.size; ++i)
+     for (size_t task_index = 0; task_index < dim.size; ++task_index)
      {{
-         wp::s_threadIdx = i;
-
          {name}_cpu_kernel_backward(
              {reverse_params});
      }}
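Across these templates, the CPU thread-local `wp::s_threadIdx` and the CUDA loop variable `_idx` are unified into a single `task_index` that is threaded through the kernel entry points explicitly: on the CPU it becomes an extra function parameter (see `codegen_kernel()` below), while on CUDA it remains the grid-stride loop counter. A hypothetical Python rendering of the new CPU-side contract, as a stand-in for the generated C++:

```python
# Hypothetical stand-in for the generated C++: the driver loop now passes
# `task_index` to the kernel explicitly instead of writing it into the
# thread-local `wp::s_threadIdx` before each call.
def example_cpu_forward(dim, kernel_forward, out):
    for task_index in range(dim):        # for (size_t task_index = 0; ...)
        kernel_forward(task_index, out)  # builtin_tid1d() -> wp::tid(task_index)


def example_kernel_forward(task_index, out):
    tid = task_index  # what wp.tid() resolves to
    out[tid] = float(tid)


out = [0.0] * 4
example_cpu_forward(4, example_kernel_forward, out)
print(out)  # [0.0, 1.0, 2.0, 3.0]
```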
@@ -2838,6 +3171,10 @@ def codegen_kernel(kernel, device, options):
      forward_args = ["wp::launch_bounds_t dim"]
      reverse_args = ["wp::launch_bounds_t dim"]

+     if device == "cpu":
+         forward_args.append("size_t task_index")
+         reverse_args.append("size_t task_index")
+
      # forward args
      for arg in adj.args:
          forward_args.append(arg.ctype() + " var_" + arg.label)
@@ -2886,7 +3223,7 @@ def codegen_module(kernel, device="cpu"):

      # build forward signature
      forward_args = ["wp::launch_bounds_t dim"]
-     forward_params = ["dim"]
+     forward_params = ["dim", "task_index"]

      for arg in adj.args:
          if hasattr(arg.type, "_wp_generic_type_str_"):