warp_lang-1.2.2-py3-none-manylinux2014_aarch64.whl → warp_lang-1.3.1-py3-none-manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of warp-lang has been flagged by the registry; see the package listing for details.
- warp/__init__.py +8 -6
- warp/autograd.py +823 -0
- warp/bin/warp.so +0 -0
- warp/build.py +6 -2
- warp/builtins.py +1412 -888
- warp/codegen.py +503 -166
- warp/config.py +48 -18
- warp/context.py +400 -198
- warp/dlpack.py +8 -0
- warp/examples/assets/bunny.usd +0 -0
- warp/examples/benchmarks/benchmark_cloth_warp.py +1 -1
- warp/examples/benchmarks/benchmark_interop_torch.py +158 -0
- warp/examples/benchmarks/benchmark_launches.py +1 -1
- warp/examples/core/example_cupy.py +78 -0
- warp/examples/fem/example_apic_fluid.py +17 -36
- warp/examples/fem/example_burgers.py +9 -18
- warp/examples/fem/example_convection_diffusion.py +7 -17
- warp/examples/fem/example_convection_diffusion_dg.py +27 -47
- warp/examples/fem/example_deformed_geometry.py +11 -22
- warp/examples/fem/example_diffusion.py +7 -18
- warp/examples/fem/example_diffusion_3d.py +24 -28
- warp/examples/fem/example_diffusion_mgpu.py +7 -14
- warp/examples/fem/example_magnetostatics.py +190 -0
- warp/examples/fem/example_mixed_elasticity.py +111 -80
- warp/examples/fem/example_navier_stokes.py +30 -34
- warp/examples/fem/example_nonconforming_contact.py +290 -0
- warp/examples/fem/example_stokes.py +17 -32
- warp/examples/fem/example_stokes_transfer.py +12 -21
- warp/examples/fem/example_streamlines.py +350 -0
- warp/examples/fem/utils.py +936 -0
- warp/fabric.py +5 -2
- warp/fem/__init__.py +13 -3
- warp/fem/cache.py +161 -11
- warp/fem/dirichlet.py +37 -28
- warp/fem/domain.py +105 -14
- warp/fem/field/__init__.py +14 -3
- warp/fem/field/field.py +454 -11
- warp/fem/field/nodal_field.py +33 -18
- warp/fem/geometry/deformed_geometry.py +50 -15
- warp/fem/geometry/hexmesh.py +12 -24
- warp/fem/geometry/nanogrid.py +106 -31
- warp/fem/geometry/quadmesh_2d.py +6 -11
- warp/fem/geometry/tetmesh.py +103 -61
- warp/fem/geometry/trimesh_2d.py +98 -47
- warp/fem/integrate.py +231 -186
- warp/fem/operator.py +14 -9
- warp/fem/quadrature/pic_quadrature.py +35 -9
- warp/fem/quadrature/quadrature.py +119 -32
- warp/fem/space/basis_space.py +98 -22
- warp/fem/space/collocated_function_space.py +3 -1
- warp/fem/space/function_space.py +7 -2
- warp/fem/space/grid_2d_function_space.py +3 -3
- warp/fem/space/grid_3d_function_space.py +4 -4
- warp/fem/space/hexmesh_function_space.py +3 -2
- warp/fem/space/nanogrid_function_space.py +12 -14
- warp/fem/space/partition.py +45 -47
- warp/fem/space/restriction.py +19 -16
- warp/fem/space/shape/cube_shape_function.py +91 -3
- warp/fem/space/shape/shape_function.py +7 -0
- warp/fem/space/shape/square_shape_function.py +32 -0
- warp/fem/space/shape/tet_shape_function.py +11 -7
- warp/fem/space/shape/triangle_shape_function.py +10 -1
- warp/fem/space/topology.py +116 -42
- warp/fem/types.py +8 -1
- warp/fem/utils.py +301 -83
- warp/native/array.h +16 -0
- warp/native/builtin.h +0 -15
- warp/native/cuda_util.cpp +14 -6
- warp/native/exports.h +1348 -1308
- warp/native/quat.h +79 -0
- warp/native/rand.h +27 -4
- warp/native/sparse.cpp +83 -81
- warp/native/sparse.cu +381 -453
- warp/native/vec.h +64 -0
- warp/native/volume.cpp +40 -49
- warp/native/volume_builder.cu +2 -3
- warp/native/volume_builder.h +12 -17
- warp/native/warp.cu +3 -3
- warp/native/warp.h +69 -59
- warp/render/render_opengl.py +17 -9
- warp/sim/articulation.py +117 -17
- warp/sim/collide.py +35 -29
- warp/sim/model.py +123 -18
- warp/sim/render.py +3 -1
- warp/sparse.py +867 -203
- warp/stubs.py +312 -541
- warp/tape.py +29 -1
- warp/tests/disabled_kinematics.py +1 -1
- warp/tests/test_adam.py +1 -1
- warp/tests/test_arithmetic.py +1 -1
- warp/tests/test_array.py +58 -1
- warp/tests/test_array_reduce.py +1 -1
- warp/tests/test_async.py +1 -1
- warp/tests/test_atomic.py +1 -1
- warp/tests/test_bool.py +1 -1
- warp/tests/test_builtins_resolution.py +1 -1
- warp/tests/test_bvh.py +6 -1
- warp/tests/test_closest_point_edge_edge.py +1 -1
- warp/tests/test_codegen.py +91 -1
- warp/tests/test_compile_consts.py +1 -1
- warp/tests/test_conditional.py +1 -1
- warp/tests/test_copy.py +1 -1
- warp/tests/test_ctypes.py +1 -1
- warp/tests/test_dense.py +1 -1
- warp/tests/test_devices.py +1 -1
- warp/tests/test_dlpack.py +1 -1
- warp/tests/test_examples.py +33 -4
- warp/tests/test_fabricarray.py +5 -2
- warp/tests/test_fast_math.py +1 -1
- warp/tests/test_fem.py +213 -6
- warp/tests/test_fp16.py +1 -1
- warp/tests/test_func.py +1 -1
- warp/tests/test_future_annotations.py +90 -0
- warp/tests/test_generics.py +1 -1
- warp/tests/test_grad.py +1 -1
- warp/tests/test_grad_customs.py +1 -1
- warp/tests/test_grad_debug.py +247 -0
- warp/tests/test_hash_grid.py +6 -1
- warp/tests/test_implicit_init.py +354 -0
- warp/tests/test_import.py +1 -1
- warp/tests/test_indexedarray.py +1 -1
- warp/tests/test_intersect.py +1 -1
- warp/tests/test_jax.py +1 -1
- warp/tests/test_large.py +1 -1
- warp/tests/test_launch.py +1 -1
- warp/tests/test_lerp.py +1 -1
- warp/tests/test_linear_solvers.py +1 -1
- warp/tests/test_lvalue.py +1 -1
- warp/tests/test_marching_cubes.py +5 -2
- warp/tests/test_mat.py +34 -35
- warp/tests/test_mat_lite.py +2 -1
- warp/tests/test_mat_scalar_ops.py +1 -1
- warp/tests/test_math.py +1 -1
- warp/tests/test_matmul.py +20 -16
- warp/tests/test_matmul_lite.py +1 -1
- warp/tests/test_mempool.py +1 -1
- warp/tests/test_mesh.py +5 -2
- warp/tests/test_mesh_query_aabb.py +1 -1
- warp/tests/test_mesh_query_point.py +1 -1
- warp/tests/test_mesh_query_ray.py +1 -1
- warp/tests/test_mlp.py +1 -1
- warp/tests/test_model.py +1 -1
- warp/tests/test_module_hashing.py +77 -1
- warp/tests/test_modules_lite.py +1 -1
- warp/tests/test_multigpu.py +1 -1
- warp/tests/test_noise.py +1 -1
- warp/tests/test_operators.py +1 -1
- warp/tests/test_options.py +1 -1
- warp/tests/test_overwrite.py +542 -0
- warp/tests/test_peer.py +1 -1
- warp/tests/test_pinned.py +1 -1
- warp/tests/test_print.py +1 -1
- warp/tests/test_quat.py +15 -1
- warp/tests/test_rand.py +1 -1
- warp/tests/test_reload.py +1 -1
- warp/tests/test_rounding.py +1 -1
- warp/tests/test_runlength_encode.py +1 -1
- warp/tests/test_scalar_ops.py +95 -0
- warp/tests/test_sim_grad.py +1 -1
- warp/tests/test_sim_kinematics.py +1 -1
- warp/tests/test_smoothstep.py +1 -1
- warp/tests/test_sparse.py +82 -15
- warp/tests/test_spatial.py +1 -1
- warp/tests/test_special_values.py +2 -11
- warp/tests/test_streams.py +11 -1
- warp/tests/test_struct.py +1 -1
- warp/tests/test_tape.py +1 -1
- warp/tests/test_torch.py +194 -1
- warp/tests/test_transient_module.py +1 -1
- warp/tests/test_types.py +1 -1
- warp/tests/test_utils.py +1 -1
- warp/tests/test_vec.py +15 -63
- warp/tests/test_vec_lite.py +2 -1
- warp/tests/test_vec_scalar_ops.py +65 -1
- warp/tests/test_verify_fp.py +1 -1
- warp/tests/test_volume.py +28 -2
- warp/tests/test_volume_write.py +1 -1
- warp/tests/unittest_serial.py +1 -1
- warp/tests/unittest_suites.py +9 -1
- warp/tests/walkthrough_debug.py +1 -1
- warp/thirdparty/unittest_parallel.py +2 -5
- warp/torch.py +103 -41
- warp/types.py +341 -224
- warp/utils.py +11 -2
- {warp_lang-1.2.2.dist-info → warp_lang-1.3.1.dist-info}/METADATA +99 -46
- warp_lang-1.3.1.dist-info/RECORD +368 -0
- warp/examples/fem/bsr_utils.py +0 -378
- warp/examples/fem/mesh_utils.py +0 -133
- warp/examples/fem/plot_utils.py +0 -292
- warp_lang-1.2.2.dist-info/RECORD +0 -359
- {warp_lang-1.2.2.dist-info → warp_lang-1.3.1.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.2.2.dist-info → warp_lang-1.3.1.dist-info}/WHEEL +0 -0
- {warp_lang-1.2.2.dist-info → warp_lang-1.3.1.dist-info}/top_level.txt +0 -0
warp/context.py
CHANGED
@@ -18,6 +18,7 @@ import os
 import platform
 import sys
 import types
+import typing
 from copy import copy as shallowcopy
 from pathlib import Path
 from struct import pack as struct_pack

@@ -34,7 +35,7 @@ import warp.config


 def create_value_func(type):
-    def value_func(
+    def value_func(arg_types, arg_values):
         return type

     return value_func

@@ -42,7 +43,7 @@ def create_value_func(type):


 def get_function_args(func):
     """Ensures that all function arguments are annotated and returns a dictionary mapping from argument name to its type."""
-    argspec =
+    argspec = warp.codegen.get_full_arg_spec(func)

     # use source-level argument annotations
     if len(argspec.annotations) < len(argspec.args):

@@ -63,7 +64,8 @@ class Function:
         input_types=None,
         value_type=None,
         value_func=None,
-
+        export_func=None,
+        dispatch_func=None,
         module=None,
         variadic=False,
         initializer_list_func=None,

@@ -97,14 +99,15 @@ class Function:
         self.namespace = namespace
         self.value_type = value_type
         self.value_func = value_func  # a function that takes a list of args and a list of templates and returns the value type, e.g.: load(array, index) returns the type of value being loaded
-        self.
+        self.export_func = export_func
+        self.dispatch_func = dispatch_func
         self.input_types = {}
         self.export = export
         self.doc = doc
         self.group = group
         self.module = module
         self.variadic = variadic  # function can take arbitrary number of inputs, e.g.: printf()
-        self.defaults = defaults
+        self.defaults = {} if defaults is None else defaults
         # Function instance for a custom implementation of the replay pass
         self.custom_replay_func = custom_replay_func
         self.native_snippet = native_snippet
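Note: the `defaults` change above swaps a raw assignment for the standard guard against Python's shared-mutable-default pitfall. A minimal standalone sketch of the idiom (the "device" key is illustrative only):

```python
# Sharing one dict across calls/instances is avoided by creating the
# fallback inside the function; mutating the result is then safe per-call.
def make_defaults(defaults=None):
    defaults = {} if defaults is None else defaults
    defaults.setdefault("device", "cpu")  # hypothetical key, for illustration
    return defaults

assert make_defaults() is not make_defaults()  # each call gets a fresh dict
```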
@@ -180,6 +183,33 @@ class Function:
         if not skip_adding_overload:
             self.add_overload(self)

+        # Store a description of the function's signature that can be used
+        # to resolve a bunch of positional/keyword/variadic arguments against,
+        # in a way that is compatible with Python's semantics.
+        signature_params = []
+        signature_default_param_kind = inspect.Parameter.POSITIONAL_OR_KEYWORD
+        for param_name in self.input_types.keys():
+            if param_name.startswith("**"):
+                param_name = param_name[2:]
+                param_kind = inspect.Parameter.VAR_KEYWORD
+            elif param_name.startswith("*"):
+                param_name = param_name[1:]
+                param_kind = inspect.Parameter.VAR_POSITIONAL
+
+                # Once a variadic argument like `*args` is found, any following
+                # arguments need to be passed using keywords.
+                signature_default_param_kind = inspect.Parameter.KEYWORD_ONLY
+            else:
+                param_kind = signature_default_param_kind
+
+            param = inspect.Parameter(
+                param_name,
+                param_kind,
+                default=self.defaults.get(param_name, inspect.Parameter.empty),
+            )
+            signature_params.append(param)
+        self.signature = inspect.Signature(signature_params)
+
         # add to current module
         if module:
             module.register_function(self, skip_adding_overload)
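Note: the `inspect.Signature` built above is what lets Warp resolve positional, keyword, and variadic call arguments with ordinary Python semantics. A self-contained sketch of the same machinery, using made-up parameter names:

```python
import inspect

# Parameters mirroring the construction above: a "*"-prefixed name becomes
# VAR_POSITIONAL, and every parameter after it must be keyword-only.
params = [
    inspect.Parameter("shape", inspect.Parameter.POSITIONAL_OR_KEYWORD),
    inspect.Parameter("args", inspect.Parameter.VAR_POSITIONAL),
    inspect.Parameter("device", inspect.Parameter.KEYWORD_ONLY, default="cpu"),
]
sig = inspect.Signature(params)

# bind() applies Python's own call semantics, raising TypeError on mismatch.
bound = sig.bind((2, 3), 1.0, 2.0, device="cuda:0")
bound.apply_defaults()
print(bound.arguments)  # {'shape': (2, 3), 'args': (1.0, 2.0), 'device': 'cuda:0'}
```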
@@ -247,7 +277,7 @@ class Function:

         # only export simple types that don't use arrays
         for v in self.input_types.values():
-            if
+            if warp.types.is_array(v) or v in complex_type_hints:
                 return False

         if type(self.value_type) in sequence_types:

@@ -261,8 +291,14 @@ class Function:

         name = "builtin_" + self.key

+        # Runtime arguments that are to be passed to the function, not its template signature.
+        if self.export_func is not None:
+            func_args = self.export_func(self.input_types)
+        else:
+            func_args = self.input_types
+
         types = []
-        for t in
+        for t in func_args.values():
             types.append(t.__name__)

         return "_".join([name, *types])

@@ -299,7 +335,7 @@ class Function:
         )
         self.user_overloads[sig] = f

-    def get_overload(self, arg_types):
+    def get_overload(self, arg_types, kwarg_types):
         assert not self.is_builtin()

         sig = warp.types.get_signature(arg_types, func_name=self.key)
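Note: the mangled name computed above is how each exported overload is looked up in the native library: the builtin's key followed by one token per runtime argument type. A sketch of the scheme with illustrative inputs (the "min" example is not taken from Warp's registry):

```python
# Mirrors the join performed above.
def mangled_name(key, func_args):
    name = "builtin_" + key
    types = [t.__name__ for t in func_args.values()]
    return "_".join([name, *types])

print(mangled_name("min", {"x": float, "y": float}))  # builtin_min_float_float
```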
@@ -347,15 +383,21 @@ class Function:
 def call_builtin(func: Function, *params) -> Tuple[bool, Any]:
     uses_non_warp_array_type = False

-
+    init()

     # Retrieve the built-in function from Warp's dll.
     c_func = getattr(warp.context.runtime.core, func.mangled_name)

+    # Runtime arguments that are to be passed to the function, not its template signature.
+    if func.export_func is not None:
+        func_args = func.export_func(func.input_types)
+    else:
+        func_args = func.input_types
+
     # Try gathering the parameters that the function expects and pack them
     # into their corresponding C types.
     c_params = []
-    for i, (_, arg_type) in enumerate(
+    for i, (_, arg_type) in enumerate(func_args.items()):
         param = params[i]

         try:

@@ -485,7 +527,8 @@ def call_builtin(func: Function, *params) -> Tuple[bool, Any]:
             c_params.append(arg_type._type_(param))

     # returns the corresponding ctype for a scalar or vector warp type
-    value_type = func.value_func(None, None
+    value_type = func.value_func(None, None)
+
     if value_type == float:
         value_ctype = ctypes.c_float
     elif value_type == int:

@@ -521,10 +564,12 @@ def call_builtin(func: Function, *params) -> Tuple[bool, Any]:
         return (True, ret)

     if value_type == warp.types.float16:
-
+        value = warp.types.half_bits_to_float(ret.value)
+    else:
+        value = ret.value

     # return scalar types as int/float
-    return (True,
+    return (True, value)


 class KernelHooks:
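Note: the C side returns `float16` results as raw 16-bit patterns, which is why the value is routed through a bits-to-float conversion above. A standalone sketch of that conversion, assuming `warp.types.half_bits_to_float` plays this role in the real code:

```python
import numpy as np

def half_bits_to_float(bits: int) -> float:
    # Reinterpret a 16-bit pattern as an IEEE-754 half-precision value.
    return float(np.frombuffer(np.uint16(bits).tobytes(), dtype=np.float16)[0])

print(half_bits_to_float(0x3C00))  # 1.0
print(half_bits_to_float(0xC000))  # -2.0
```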
@@ -742,7 +787,6 @@ def func_grad(forward_fn):
         input_types=reverse_args,
         value_func=None,
         module=f.module,
-        template_func=f.template_func,
         skip_forward_codegen=True,
         custom_reverse_mode=True,
         custom_reverse_num_input_args=len(f.input_types),

@@ -807,7 +851,7 @@ def func_replay(forward_fn):
             f"Cannot define custom replay definition for {forward_fn.key} since the provided replay function has generic input arguments."
         )

-    f = forward_fn.get_overload(arg_types)
+    f = forward_fn.get_overload(arg_types, {})
     if f is None:
         inputs_str = ", ".join([f"{k}: {v.__name__}" for k, v in args.items()])
         raise RuntimeError(

@@ -819,8 +863,9 @@ def func_replay(forward_fn):
         namespace=f.namespace,
         input_types=f.input_types,
         value_func=f.value_func,
+        export_func=f.export_func,
+        dispatch_func=f.dispatch_func,
         module=f.module,
-        template_func=f.template_func,
         skip_reverse_codegen=True,
         skip_adding_overload=True,
         code_transformers=f.adj.transformers,

@@ -920,7 +965,7 @@ def overload(kernel, arg_types=None):
     )

     # ensure all arguments are annotated
-    argspec =
+    argspec = warp.codegen.get_full_arg_spec(fn)
     if len(argspec.annotations) < len(argspec.args):
         raise RuntimeError(f"Incomplete argument annotations on kernel overload {fn.__name__}")
@@ -965,7 +1010,8 @@ def add_builtin(
     constraint=None,
     value_type=None,
     value_func=None,
-
+    export_func=None,
+    dispatch_func=None,
     doc="",
     namespace="wp::",
     variadic=False,

@@ -979,18 +1025,66 @@ def add_builtin(
     defaults=None,
     require_original_output_arg=False,
 ):
+    """Main entry point to register a new built-in function.
+
+    Args:
+        key (str): Function name. Multiple overloaded functions can be registered
+            under the same name as long as their signatures differ.
+        input_types (Mapping[str, Any]): Signature of the user-facing function.
+            Variadic arguments are supported by prefixing the parameter names
+            with asterisks as in `*args` and `**kwargs`. Generic arguments are
+            supported with types such as `Any`, `Float`, `Scalar`, etc.
+        constraint (Callable): For functions that define generic arguments and
+            are to be exported, this callback is used to specify whether some
+            combination of inferred arguments is valid or not.
+        value_type (Any): Type returned by the function.
+        value_func (Callable): Callback used to specify the return type when
+            `value_type` isn't enough.
+        export_func (Callable): Callback used during the context stage to specify
+            the signature of the underlying C++ function, not accounting for
+            the template parameters.
+            If not provided, `input_types` is used.
+        dispatch_func (Callable): Callback used during the codegen stage to specify
+            the runtime and template arguments to be passed to the underlying C++
+            function. In other words, this allows defining a mapping between
+            the signatures of the user-facing and the C++ functions, and even to
+            dynamically create new arguments on the fly.
+            The arguments returned must be of type `codegen.Var`.
+            If not provided, all arguments passed by the users when calling
+            the built-in are passed as-is as runtime arguments to the C++ function.
+        doc (str): Used to generate the Python docstring and the HTML documentation.
+        namespace: Namespace for the underlying C++ function.
+        variadic (bool): Whether the function declares variadic arguments.
+        initializer_list_func (bool): Whether to use the initializer list syntax
+            when passing the arguments to the underlying C++ function.
+        export (bool): Whether the function is to be exposed to the Python
+            interpreter so that it becomes available from within the `warp`
+            module.
+        group (str): Classification used for the documentation.
+        hidden (bool): Whether to add that function into the documentation.
+        skip_replay (bool): Whether the operation will be performed during
+            the forward replay in the backward pass.
+        missing_grad (bool): Whether the function is missing a corresponding
+            adjoint.
+        native_func (str): Name of the underlying C++ function.
+        defaults (Mapping[str, Any]): Default values for the parameters defined
+            in `input_types`.
+        require_original_output_arg (bool): Used during the codegen stage to
+            specify whether an adjoint parameter corresponding to the return
+            value should be included in the signature of the backward function.
+    """
     if input_types is None:
         input_types = {}

     # wrap simple single-type functions with a value_func()
     if value_func is None:

-        def value_func(
+        def value_func(arg_types, arg_values):
             return value_type

     if initializer_list_func is None:

-        def initializer_list_func(args,
+        def initializer_list_func(args, return_type):
             return False

     if defaults is None:
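The docstring above describes `export_func` and `dispatch_func` as a mapping between the user-facing signature and the C++ one. Hypothetical callbacks illustrating the expected shapes (the parameter names and the dropped-argument example are invented for illustration, not taken from Warp's registry):

```python
def my_export_func(input_types):
    # Drop a template-only parameter ("dtype") from the exported C++ signature.
    return {k: v for k, v in input_types.items() if k != "dtype"}

def my_value_func(arg_types, arg_values):
    # Resolve the return type from the argument types when a fixed
    # `value_type` isn't expressive enough; None means "unresolved call".
    if arg_types is None:
        return float
    return arg_types["x"]
```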
@@ -998,8 +1092,13 @@ def add_builtin(

     # Add specialized versions of this builtin if it's generic by matching arguments against
     # hard coded types. We do this so you can use hard coded warp types outside kernels:
+    if export_func is not None:
+        func_arg_types = export_func(input_types)
+    else:
+        func_arg_types = input_types
+
     generic = False
-    for x in
+    for x in func_arg_types.values():
         if warp.types.type_is_generic(x):
             generic = True
             break

@@ -1007,7 +1106,7 @@ def add_builtin(
     if generic and export:
         # collect the parent type names of all the generic arguments:
         genericset = set()
-        for t in
+        for t in func_arg_types.values():
            if hasattr(t, "_wp_generic_type_hint_"):
                genericset.add(t._wp_generic_type_hint_)
            elif warp.types.type_is_generic_scalar(t):

@@ -1059,15 +1158,17 @@ def add_builtin(

             typelists.append(l)

-        for
+        for arg_types in itertools.product(*typelists):
+            arg_types = dict(zip(input_types.keys(), arg_types))
+
             # Some of these argument lists won't work, eg if the function is mul(), we won't be
             # able to do a matrix vector multiplication for a mat22 and a vec3. The `constraint`
             # function determines which combinations are valid:
             if constraint:
-                if constraint(
+                if constraint(arg_types) is False:
                     continue

-            return_type = value_func(
+            return_type = value_func(arg_types, None)

             # The return_type might just be vector_t(length=3,dtype=wp.float32), so we've got to match that
             # in the list of hard coded types so it knows it's returning one of them:

@@ -1085,8 +1186,10 @@ def add_builtin(
             # finally we can generate a function call for these concrete types:
             add_builtin(
                 key,
-                input_types=
+                input_types=arg_types,
                 value_type=return_type,
+                export_func=export_func,
+                dispatch_func=dispatch_func,
                 doc=doc,
                 namespace=namespace,
                 variadic=variadic,

@@ -1096,6 +1199,7 @@ def add_builtin(
                 hidden=True,
                 skip_replay=skip_replay,
                 missing_grad=missing_grad,
+                defaults=defaults,
                 require_original_output_arg=require_original_output_arg,
             )

@@ -1106,7 +1210,8 @@ def add_builtin(
         input_types=input_types,
         value_type=value_type,
         value_func=value_func,
-
+        export_func=export_func,
+        dispatch_func=dispatch_func,
         variadic=variadic,
         initializer_list_func=initializer_list_func,
         export=export,
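Note: the generic-expansion loop above enumerates every combination of concrete types and zips each combination back against the parameter names. The core of that iteration, reduced to a runnable sketch with made-up type lists:

```python
import itertools

input_types = {"x": "Scalar", "y": "Scalar"}  # illustrative generic signature
typelists = [[int, float], [int, float]]      # candidate concrete types

for combo in itertools.product(*typelists):
    arg_types = dict(zip(input_types.keys(), combo))
    print(arg_types)
# {'x': <class 'int'>, 'y': <class 'int'>}, {'x': <class 'int'>, 'y': <class 'float'>}, ...
```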
@@ -1250,7 +1355,7 @@ class ModuleBuilder:
         if not func.value_func:

             def wrap(adj):
-                def value_type(arg_types,
+                def value_type(arg_types, arg_values):
                     if adj.return_var is None or len(adj.return_var) == 0:
                         return None
                     if len(adj.return_var) == 1:

@@ -1453,14 +1558,6 @@ class Module:
         computed ``content_hash`` will be used.
         """

-        def get_annotations(obj: Any) -> Mapping[str, Any]:
-            """Alternative to `inspect.get_annotations()` for Python 3.9 and older."""
-            # See https://docs.python.org/3/howto/annotations.html#accessing-the-annotations-dict-of-an-object-in-python-3-9-and-older
-            if isinstance(obj, type):
-                return obj.__dict__.get("__annotations__", {})
-
-            return getattr(obj, "__annotations__", {})
-
         def get_type_name(type_hint):
             if isinstance(type_hint, warp.codegen.Struct):
                 return get_type_name(type_hint.cls)

@@ -1482,7 +1579,7 @@ class Module:
         for struct in module.structs.values():
             s = ",".join(
                 "{}: {}".format(name, get_type_name(type_hint))
-                for name, type_hint in get_annotations(struct.cls).items()
+                for name, type_hint in warp.codegen.get_annotations(struct.cls).items()
             )
             ch.update(bytes(s, "utf-8"))

@@ -1495,22 +1592,18 @@ class Module:
             ch.update(bytes(sig, "utf-8"))

             # source
-
-            ch.update(bytes(s, "utf-8"))
+            ch.update(bytes(func.adj.source, "utf-8"))

             if func.custom_grad_func:
-
-                ch.update(bytes(s, "utf-8"))
+                ch.update(bytes(func.custom_grad_func.adj.source, "utf-8"))
             if func.custom_replay_func:
-
+                ch.update(bytes(func.custom_replay_func.adj.source, "utf-8"))
             if func.replay_snippet:
-
+                ch.update(bytes(func.replay_snippet, "utf-8"))
             if func.native_snippet:
-
-                ch.update(bytes(s, "utf-8"))
+                ch.update(bytes(func.native_snippet, "utf-8"))
             if func.adj_native_snippet:
-
-                ch.update(bytes(s, "utf-8"))
+                ch.update(bytes(func.adj_native_snippet, "utf-8"))

             # Populate constants referenced in this function
             if func.adj:
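Note: the hashing change above feeds each function's source (and any grad/replay/native snippets) directly into the module's running content hash. The scheme in miniature, with an illustrative source string:

```python
import hashlib

ch = hashlib.sha256()
# Illustrative input; the real code also hashes adjoint sources and snippets.
for source in ("def f(x: float):\n    return x * 2.0\n",):
    ch.update(bytes(source, "utf-8"))
print(ch.hexdigest()[:7])  # short id like the one in the module load message
```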
@@ -1621,7 +1714,7 @@ class Module:

         with ScopedTimer(
             f"Module {self.name} {module_hash.hex()[:7]} load on device '{device}'", active=not warp.config.quiet
-        ):
+        ) as module_load_timer:
             # -----------------------------------------------------------
             # determine output paths
             if device.is_cpu:

@@ -1657,7 +1750,13 @@ class Module:

             build_dir = None

-            if
+            # we always want to build if binary doesn't exist yet
+            # and we want to rebuild if we are not caching kernels or if we are tracking array access
+            if (
+                not os.path.exists(binary_path)
+                or not warp.config.cache_kernels
+                or warp.config.verify_autograd_array_access
+            ):
                 builder = ModuleBuilder(self, self.options)

                 # create a temporary (process unique) dir for build outputs before moving to the binary dir

@@ -1668,6 +1767,8 @@ class Module:
                 # dir may exist from previous attempts / runs / archs
                 Path(build_dir).mkdir(parents=True, exist_ok=True)

+                module_load_timer.extra_msg = " (compiled)"  # For wp.ScopedTimer informational purposes
+
                 # build CPU
                 if device.is_cpu:
                     # build

@@ -1694,6 +1795,7 @@ class Module:

                     except Exception as e:
                         self.cpu_build_failed = True
+                        module_load_timer.extra_msg = " (error)"
                         raise (e)

                 elif device.is_cuda:

@@ -1722,6 +1824,7 @@ class Module:

                     except Exception as e:
                         self.cuda_build_failed = True
+                        module_load_timer.extra_msg = " (error)"
                         raise (e)

             # -----------------------------------------------------------

@@ -1755,6 +1858,8 @@ class Module:
                 except Exception as e:
                     # We don't need source_code_path to be copied successfully to proceed, so warn and keep running
                     warp.utils.warn(f"Exception when renaming {source_code_path}: {e}")
+            else:
+                module_load_timer.extra_msg = " (cached)"  # For wp.ScopedTimer informational purposes

             # -----------------------------------------------------------
             # Load CPU or CUDA binary

@@ -1767,6 +1872,7 @@ class Module:
                 if cuda_module is not None:
                     self.cuda_modules[device.context] = cuda_module
                 else:
+                    module_load_timer.extra_msg = " (error)"
                     raise Exception(f"Failed to load CUDA module '{self.name}'")

         if build_dir:
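The `module_load_timer.extra_msg` assignments above let a single timer report whether the module was compiled, served from cache, or failed. A minimal sketch of that labelled-timer pattern (not Warp's actual ScopedTimer implementation):

```python
import time

class TimerSketch:
    def __init__(self, name):
        self.name = name
        self.extra_msg = ""  # filled in by the body before exit

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, *exc):
        elapsed_ms = (time.perf_counter() - self.start) * 1000.0
        print(f"{self.name} took {elapsed_ms:.2f} ms{self.extra_msg}")

with TimerSketch("Module example load") as timer:
    timer.extra_msg = " (cached)"
```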
@@ -1937,10 +2043,13 @@ class ContextGuard:


 class Stream:
-    def
-
-
+    def __new__(cls, *args, **kwargs):
+        instance = super(Stream, cls).__new__(cls)
+        instance.cuda_stream = None
+        instance.owner = False
+        return instance

+    def __init__(self, device=None, **kwargs):
         # event used internally for synchronization (cached to avoid creating temporary events)
         self._cached_event = None

@@ -2016,9 +2125,12 @@ class Event:
     BLOCKING_SYNC = 0x1
     DISABLE_TIMING = 0x2

-    def
-
+    def __new__(cls, *args, **kwargs):
+        instance = super(Event, cls).__new__(cls)
+        instance.owner = False
+        return instance

+    def __init__(self, device=None, cuda_event=None, enable_timing=False):
         device = get_device(device)
         if not device.is_cuda:
             raise RuntimeError(f"Device {device} is not a CUDA device")

@@ -2320,6 +2432,11 @@ Devicelike = Union[Device, str, None]


 class Graph:
+    def __new__(cls, *args, **kwargs):
+        instance = super(Graph, cls).__new__(cls)
+        instance.exec = None
+        return instance
+
     def __init__(self, device: Device, exec: ctypes.c_void_p):
         self.device = device
         self.exec = exec
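Note: the new `__new__` methods pre-seed the attributes that cleanup code touches, so a destructor stays safe even when `__init__` raises partway through. The pattern in isolation, with a hypothetical `Handle` class:

```python
class Handle:
    def __new__(cls, *args, **kwargs):
        instance = super(Handle, cls).__new__(cls)
        instance.resource = None  # always defined, even if __init__ fails
        return instance

    def __init__(self, fail=False):
        if fail:
            raise RuntimeError("acquisition failed")
        self.resource = object()

    def __del__(self):
        if self.resource is not None:
            pass  # release the native resource here

try:
    Handle(fail=True)
except RuntimeError:
    pass  # __del__ can still run safely: `resource` exists and is None
```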
@@ -2682,48 +2799,38 @@ class Runtime:
             ctypes.c_void_p,
             ctypes.c_void_p,
             ctypes.c_int,
-            ctypes.c_float,
-            ctypes.c_float,
-            ctypes.c_float,
-            ctypes.c_float,
-            ctypes.c_float,
+            ctypes.c_float * 9,
+            ctypes.c_float * 3,
             ctypes.c_bool,
+            ctypes.c_float,
         ]
         self.core.volume_f_from_tiles_device.restype = ctypes.c_uint64
         self.core.volume_v_from_tiles_device.argtypes = [
             ctypes.c_void_p,
             ctypes.c_void_p,
             ctypes.c_int,
-            ctypes.c_float,
-            ctypes.c_float,
-            ctypes.c_float,
-            ctypes.c_float,
-            ctypes.c_float,
-            ctypes.c_float,
-            ctypes.c_float,
+            ctypes.c_float * 9,
+            ctypes.c_float * 3,
             ctypes.c_bool,
+            ctypes.c_float * 3,
         ]
         self.core.volume_v_from_tiles_device.restype = ctypes.c_uint64
         self.core.volume_i_from_tiles_device.argtypes = [
             ctypes.c_void_p,
             ctypes.c_void_p,
             ctypes.c_int,
-            ctypes.c_float,
-            ctypes.
-            ctypes.c_float,
-            ctypes.c_float,
-            ctypes.c_float,
+            ctypes.c_float * 9,
+            ctypes.c_float * 3,
             ctypes.c_bool,
+            ctypes.c_int,
         ]
         self.core.volume_i_from_tiles_device.restype = ctypes.c_uint64
         self.core.volume_index_from_tiles_device.argtypes = [
             ctypes.c_void_p,
             ctypes.c_void_p,
             ctypes.c_int,
-            ctypes.c_float,
-            ctypes.c_float,
-            ctypes.c_float,
-            ctypes.c_float,
+            ctypes.c_float * 9,
+            ctypes.c_float * 3,
             ctypes.c_bool,
         ]
         self.core.volume_index_from_tiles_device.restype = ctypes.c_uint64

@@ -2731,10 +2838,8 @@ class Runtime:
             ctypes.c_void_p,
             ctypes.c_void_p,
             ctypes.c_int,
-            ctypes.c_float,
-            ctypes.c_float,
-            ctypes.c_float,
-            ctypes.c_float,
+            ctypes.c_float * 9,
+            ctypes.c_float * 3,
             ctypes.c_bool,
         ]
         self.core.volume_from_active_voxels_device.restype = ctypes.c_uint64

@@ -2780,39 +2885,38 @@ class Runtime:
         self.core.volume_get_blind_data_info.restype = ctypes.c_char_p

         bsr_matrix_from_triplets_argtypes = [
-            ctypes.c_int,
-            ctypes.c_int,
-            ctypes.c_int,
-            ctypes.c_int,
-            ctypes.
-            ctypes.
-            ctypes.
-            ctypes.
-            ctypes.
-            ctypes.
+            ctypes.c_int,  # rows_per_block
+            ctypes.c_int,  # cols_per_block
+            ctypes.c_int,  # row_count
+            ctypes.c_int,  # tpl_nnz
+            ctypes.POINTER(ctypes.c_int),  # tpl_rows
+            ctypes.POINTER(ctypes.c_int),  # tpl_cols
+            ctypes.c_void_p,  # tpl_values
+            ctypes.c_bool,  # prune_numerical_zeros
+            ctypes.POINTER(ctypes.c_int),  # bsr_offsets
+            ctypes.POINTER(ctypes.c_int),  # bsr_columns
+            ctypes.c_void_p,  # bsr_values
+            ctypes.POINTER(ctypes.c_int),  # bsr_nnz
+            ctypes.c_void_p,  # bsr_nnz_event
         ]
+
         self.core.bsr_matrix_from_triplets_float_host.argtypes = bsr_matrix_from_triplets_argtypes
         self.core.bsr_matrix_from_triplets_double_host.argtypes = bsr_matrix_from_triplets_argtypes
         self.core.bsr_matrix_from_triplets_float_device.argtypes = bsr_matrix_from_triplets_argtypes
         self.core.bsr_matrix_from_triplets_double_device.argtypes = bsr_matrix_from_triplets_argtypes

-        self.core.bsr_matrix_from_triplets_float_host.restype = ctypes.c_int
-        self.core.bsr_matrix_from_triplets_double_host.restype = ctypes.c_int
-        self.core.bsr_matrix_from_triplets_float_device.restype = ctypes.c_int
-        self.core.bsr_matrix_from_triplets_double_device.restype = ctypes.c_int
-
         bsr_transpose_argtypes = [
-            ctypes.c_int,
-            ctypes.c_int,
-            ctypes.c_int,
-            ctypes.c_int,
-            ctypes.c_int,
-            ctypes.
-            ctypes.
-            ctypes.
-            ctypes.
-            ctypes.
-            ctypes.
+            ctypes.c_int,  # rows_per_block
+            ctypes.c_int,  # cols_per_block
+            ctypes.c_int,  # row_count
+            ctypes.c_int,  # col_count
+            ctypes.c_int,  # nnz
+            ctypes.POINTER(ctypes.c_int),  # bsr_offsets
+            ctypes.POINTER(ctypes.c_int),  # bsr_columns
+            ctypes.c_void_p,  # bsr_values
+            ctypes.POINTER(ctypes.c_int),  # transposed_bsr_offsets
+            ctypes.POINTER(ctypes.c_int),  # transposed_bsr_columns
+            ctypes.c_void_p,  # transposed_bsr_values
         ]
         self.core.bsr_transpose_float_host.argtypes = bsr_transpose_argtypes
         self.core.bsr_transpose_double_host.argtypes = bsr_transpose_argtypes
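Note: the volume and BSR bindings now use `ctypes` array and pointer types instead of flat runs of scalars. A short refresher on those constructs, with illustrative values:

```python
import ctypes

# `ctypes.c_float * 9` is an array *type*; an instance marshals as one
# contiguous block, matching the new 3x3 transform / translation arguments.
transform = (ctypes.c_float * 9)(1, 0, 0, 0, 1, 0, 0, 0, 1)
translation = (ctypes.c_float * 3)(0.0, 0.0, 0.0)

# POINTER types declare out-parameters such as bsr_offsets and bsr_nnz.
nnz = ctypes.c_int(0)
nnz_ptr = ctypes.pointer(nnz)
print(list(transform), nnz_ptr.contents.value)
```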
@@ -3019,35 +3123,63 @@ class Runtime:
         self.device_map["cpu"] = self.cpu_device
         self.context_map[None] = self.cpu_device

-
+        self.is_cuda_enabled = bool(self.core.is_cuda_enabled())
+        self.is_cuda_compatibility_enabled = bool(self.core.is_cuda_compatibility_enabled())

-
+        self.toolkit_version = None  # CTK version used to build the core lib
+        self.driver_version = None  # installed driver version
+        self.min_driver_version = None  # minimum required driver version
+
+        self.cuda_devices = []
+        self.cuda_primary_devices = []
+
+        cuda_device_count = 0
+
+        if self.is_cuda_enabled:
             # get CUDA Toolkit and driver versions
-
-
-
-            #
-
-
-
-
+            toolkit_version = self.core.cuda_toolkit_version()
+            driver_version = self.core.cuda_driver_version()
+
+            # save versions as tuples, e.g., (12, 4)
+            self.toolkit_version = (toolkit_version // 1000, (toolkit_version % 1000) // 10)
+            self.driver_version = (driver_version // 1000, (driver_version % 1000) // 10)
+
+            # determine minimum required driver version
+            if self.is_cuda_compatibility_enabled:
+                # we can rely on minor version compatibility, but 11.4 is the absolute minimum required from the driver
+                if self.toolkit_version[0] > 11:
+                    self.min_driver_version = (self.toolkit_version[0], 0)
+                else:
+                    self.min_driver_version = (11, 4)
             else:
-
+                # we can't rely on minor version compatibility, so the driver can't be older than the toolkit
+                self.min_driver_version = self.toolkit_version
+
+            # determine if the installed driver is sufficient
+            if self.driver_version >= self.min_driver_version:
+                # get all architectures supported by NVRTC
+                num_archs = self.core.nvrtc_supported_arch_count()
+                if num_archs > 0:
+                    archs = (ctypes.c_int * num_archs)()
+                    self.core.nvrtc_supported_archs(archs)
+                    self.nvrtc_supported_archs = set(archs)
+                else:
+                    self.nvrtc_supported_archs = set()

-
-
-        self.cuda_custom_context_count = [0] * cuda_device_count
+                # get CUDA device count
+                cuda_device_count = self.core.cuda_device_get_count()

-
-
-
-
-
-
-
-
+                # register primary CUDA devices
+                for i in range(cuda_device_count):
+                    alias = f"cuda:{i}"
+                    device = Device(self, alias, ordinal=i, is_primary=True)
+                    self.cuda_devices.append(device)
+                    self.cuda_primary_devices.append(device)
+                    self.device_map[alias] = device
+
+                # count known non-primary contexts on each physical device so we can
+                # give them reasonable aliases (e.g., "cuda:0.0", "cuda:0.1")
+                self.cuda_custom_context_count = [0] * cuda_device_count

         # set default device
         if cuda_device_count > 0:
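Note: the raw CUDA version integers encode `major * 1000 + minor * 10`, so the tuple form above both reads naturally and compares correctly:

```python
driver_version = 12040  # e.g., the integer form of CUDA 12.4
as_tuple = (driver_version // 1000, (driver_version % 1000) // 10)
assert as_tuple == (12, 4)
assert (11, 8) < (12, 0) < (12, 4)  # tuple ordering matches version ordering
```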
@@ -3066,14 +3198,8 @@ class Runtime:
         # initialize kernel cache
         warp.build.init_kernel_cache(warp.config.kernel_cache_dir)

-
-
-        for cuda_device in self.cuda_devices:
-            if cuda_device.is_primary:
-                if not cuda_device.is_uva:
-                    devices_without_uva.append(cuda_device)
-                if not cuda_device.is_mempool_supported:
-                    devices_without_mempool.append(cuda_device)
+        # global tape
+        self.tape = None

         # print device and version information
         if not warp.config.quiet:

@@ -3081,18 +3207,24 @@ class Runtime:

             greeting.append(f"Warp {warp.config.version} initialized:")
             if cuda_device_count > 0:
-
-                driver_version = (self.driver_version // 1000, (self.driver_version % 1000) // 10)
+                # print CUDA version info
                 greeting.append(
-                    f"   CUDA Toolkit {toolkit_version[0]}.{toolkit_version[1]}, Driver {driver_version[0]}.{driver_version[1]}"
+                    f"   CUDA Toolkit {self.toolkit_version[0]}.{self.toolkit_version[1]}, Driver {self.driver_version[0]}.{self.driver_version[1]}"
                 )
             else:
-
-
-                greeting.append("   CUDA devices not available")
-            else:
+                # briefly explain lack of CUDA devices
+                if not self.is_cuda_enabled:
                     # Warp was compiled without CUDA support
-                greeting.append("   CUDA
+                    greeting.append("   CUDA not enabled in this build")
+                elif self.driver_version < self.min_driver_version:
+                    # insufficient CUDA driver version
+                    greeting.append(
+                        f"   CUDA Toolkit {self.toolkit_version[0]}.{self.toolkit_version[1]}, Driver {self.driver_version[0]}.{self.driver_version[1]}"
+                        " (insufficient CUDA driver version!)"
+                    )
+                else:
+                    # CUDA is supported, but no devices are available
+                    greeting.append("   CUDA devices not available")
             greeting.append("   Devices:")
             alias_str = f'"{self.cpu_device.alias}"'
             name_str = f'"{self.cpu_device.name}"'

@@ -3151,41 +3283,44 @@ class Runtime:
             print("\n".join(greeting))

         if cuda_device_count > 0:
-            #
+            # ensure initialization did not change the initial context (e.g. querying available memory)
+            self.core.cuda_context_set_current(initial_context)
+
+            # detect possible misconfiguration of the system
+            devices_without_uva = []
+            devices_without_mempool = []
+            for cuda_device in self.cuda_primary_devices:
+                if not cuda_device.is_uva:
+                    devices_without_uva.append(cuda_device)
+                if not cuda_device.is_mempool_supported:
+                    devices_without_mempool.append(cuda_device)
+
             if devices_without_uva:
                 # This should not happen on any system officially supported by Warp. UVA is not available
                 # on 32-bit Windows, which we don't support. Nonetheless, we should check and report a
                 # warning out of abundance of caution. It may help with debugging a broken VM setup etc.
                 warp.utils.warn(
-                    f"Support for Unified Virtual Addressing (UVA) was not detected on devices {devices_without_uva}."
+                    f"\n   Support for Unified Virtual Addressing (UVA) was not detected on devices {devices_without_uva}."
                 )
             if devices_without_mempool:
                 warp.utils.warn(
-                    f"Support for CUDA memory pools was not detected on devices {devices_without_mempool}.
-                    "This prevents memory allocations in CUDA graphs and may result in poor performance.
-                    "Is the UVM driver enabled?"
+                    f"\n   Support for CUDA memory pools was not detected on devices {devices_without_mempool}."
+                    "\n   This prevents memory allocations in CUDA graphs and may result in poor performance."
+                    "\n   Is the UVM driver enabled?"
                 )

-
-            #
-            #
-            if self.driver_version < self.
-
-
-
-                "
-                "
-                "* Warp is not fully supported by the current driver.          *\n"
-                "* Some CUDA functionality may not work correctly!             *\n"
-                "* Update the driver or rebuild Warp without the --quick flag. *\n"
-                "******************************************************************\n"
+        elif self.is_cuda_enabled:
+            # Report a warning about insufficient driver version. The warning should appear even in quiet mode
+            # when the greeting message is suppressed. Also try to provide guidance for resolving the situation.
+            if self.driver_version < self.min_driver_version:
+                msg = []
+                msg.append("\n   Insufficient CUDA driver version.")
+                msg.append(
+                    f"The minimum required CUDA driver version is {self.min_driver_version[0]}.{self.min_driver_version[1]}, "
+                    f"but the installed CUDA driver version is {self.driver_version[0]}.{self.driver_version[1]}."
                 )
-
-
-            self.core.cuda_context_set_current(initial_context)
-
-            # global tape
-            self.tape = None
+                msg.append("Visit https://github.com/NVIDIA/warp/blob/main/README.md#installing for guidance.")
+                warp.utils.warn("\n   ".join(msg))

     def get_error_string(self):
         return self.core.get_error_string().decode("utf-8")
@@ -3208,17 +3343,20 @@ class Runtime:
         return dll

     def get_device(self, ident: Devicelike = None) -> Device:
-
+        # special cases
+        if type(ident) is Device:
             return ident
         elif ident is None:
             return self.default_device
-
-
-
-
-
-
-
+
+        # string lookup
+        device = self.device_map.get(ident)
+        if device is not None:
+            return device
+        elif ident == "cuda":
+            return self.get_current_cuda_device()
+
+        raise ValueError(f"Invalid device identifier: {ident}")

     def set_default_device(self, ident: Devicelike):
         self.default_device = self.get_device(ident)

@@ -3248,7 +3386,7 @@ class Runtime:
             return self.cuda_devices[0]
         else:
             # CUDA is not available
-            if not self.
+            if not self.is_cuda_enabled:
                 raise RuntimeError('"cuda" device requested but this build of Warp does not support CUDA')
             else:
                 raise RuntimeError('"cuda" device requested but CUDA is not supported by the hardware or driver')

@@ -3821,6 +3959,11 @@ class RegisteredGLBuffer:

     __fallback_warning_shown = False

+    def __new__(cls, *args, **kwargs):
+        instance = super(RegisteredGLBuffer, cls).__new__(cls)
+        instance.resource = None
+        return instance
+
     def __init__(self, gl_buffer_id: int, device: Devicelike = None, flags: int = NONE, fallback_to_copy: bool = True):
         """
         Args:
@@ -4230,6 +4373,10 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
             # allow for NULL arrays
             return arg_type.__ctype__()

+        elif isinstance(value, warp.types.array_t):
+            # accept array descriptors verbatim
+            return value
+
         else:
             # check for array type
             # - in forward passes, array types have to match

@@ -4240,6 +4387,32 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
             array_matches = type(value) is type(arg_type)

             if not array_matches:
+                # if a regular Warp array is required, try converting from __cuda_array_interface__ or __array_interface__
+                if isinstance(arg_type, warp.array):
+                    if device.is_cuda:
+                        # check for __cuda_array_interface__
+                        try:
+                            interface = value.__cuda_array_interface__
+                        except AttributeError:
+                            pass
+                        else:
+                            return warp.types.array_ctype_from_interface(interface, dtype=arg_type.dtype, owner=value)
+                    else:
+                        # check for __array_interface__
+                        try:
+                            interface = value.__array_interface__
+                        except AttributeError:
+                            pass
+                        else:
+                            return warp.types.array_ctype_from_interface(interface, dtype=arg_type.dtype, owner=value)
+                        # check for __array__() method, e.g. Torch CPU tensors
+                        try:
+                            interface = value.__array__().__array_interface__
+                        except AttributeError:
+                            pass
+                        else:
+                            return warp.types.array_ctype_from_interface(interface, dtype=arg_type.dtype, owner=value)
+
                 adj = "adjoint " if adjoint else ""
                 raise RuntimeError(
                     f"Error launching kernel '{kernel.key}', {adj}argument '{arg_name}' expects an array of type {type(arg_type)}, but passed value has type {type(value)}."
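Note: the interop path above probes each interchange protocol with try/except AttributeError before falling through to the next. The detection pattern on its own, using a NumPy array as the example producer:

```python
import numpy as np

def describe(value):
    for attr in ("__cuda_array_interface__", "__array_interface__"):
        try:
            interface = getattr(value, attr)
        except AttributeError:
            continue
        return attr, interface["shape"], interface["typestr"]
    return None

print(describe(np.zeros(4, dtype=np.float32)))  # ('__array_interface__', (4,), '<f4')
```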
@@ -4603,6 +4776,10 @@ def launch(
         caller = {"file": frame.f_code.co_filename, "lineno": frame.f_lineno, "func": frame.f_code.co_name}
         runtime.tape.record_launch(kernel, dim, max_blocks, inputs, outputs, device, metadata={"caller": caller})

+        # detect illegal inter-kernel read/write access patterns if verification flag is set
+        if warp.config.verify_autograd_array_access:
+            runtime.tape._check_kernel_array_access(kernel, fwd_args)
+

 def synchronize():
     """Manually synchronize the calling CPU thread with any outstanding CUDA work on all devices

@@ -4808,7 +4985,7 @@ def capture_begin(device: Devicelike = None, stream=None, force_module_load=None
     """

     if force_module_load is None:
-        if runtime.driver_version >=
+        if runtime.driver_version >= (12, 3):
            # Driver versions 12.3 and later can compile modules during graph capture
            force_module_load = False
        else:

@@ -5084,6 +5261,9 @@ def copy(
             ),
             arrays=[dest, src],
         )
+        if warp.config.verify_autograd_array_access:
+            dest.mark_write()
+            src.mark_read()


 def adj_copy(
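A hedged usage sketch for the verification hooks above: since the flag is read at launch and copy time (and influences whether modules are rebuilt), it should be set before any kernels run, assuming a standard Warp setup:

```python
import warp as wp

wp.config.verify_autograd_array_access = True  # enable access tracking
wp.init()
# subsequent wp.copy()/wp.launch() calls now record reads and writes
```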
@@ -5106,8 +5286,16 @@ def type_str(t):
         return "Any"
     elif t == Callable:
         return "Callable"
+    elif t == Tuple[int]:
+        return "Tuple[int]"
     elif t == Tuple[int, int]:
         return "Tuple[int, int]"
+    elif t == Tuple[int, int, int]:
+        return "Tuple[int, int, int]"
+    elif t == Tuple[int, int, int, int]:
+        return "Tuple[int, int, int, int]"
+    elif t == Tuple[int, ...]:
+        return "Tuple[int, ...]"
     elif isinstance(t, int):
         return str(t)
     elif isinstance(t, List):

@@ -5142,6 +5330,9 @@ def type_str(t):
             return f"Transformation[{type_str(t._wp_scalar_type_)}]"

         raise TypeError("Invalid vector or matrix dimensions")
+    elif typing.get_origin(t) in (List, Mapping, Sequence, Union, Tuple):
+        args_repr = ", ".join(type_str(x) for x in typing.get_args(t))
+        return f"{t.__name__}[{args_repr}]"

     return t.__name__
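Note: the new generic fallback leans on `typing` introspection; its two helpers behave like this:

```python
import typing
from typing import Tuple, Union

print(typing.get_origin(Tuple[int, ...]))    # tuple
print(typing.get_args(Tuple[int, str]))      # (<class 'int'>, <class 'str'>)
print(typing.get_origin(Union[int, float]))  # typing.Union
```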
@@ -5169,7 +5360,7 @@ def print_function(f, file, noentry=False):  # pragma: no cover
     try:
         # todo: construct a default value for each of the functions args
         # so we can generate the return type for overloaded functions
-        return_type = " -> " + type_str(f.value_func(None, None
+        return_type = " -> " + type_str(f.value_func(None, None))
     except Exception:
         pass

@@ -5232,14 +5423,6 @@ def export_functions_rst(file):  # pragma: no cover
     print(".. class:: Transformation", file=file)
     print(".. class:: Array", file=file)

-    print("\nQuery Types", file=file)
-    print("-------------", file=file)
-    print(".. autoclass:: bvh_query_t", file=file)
-    print(".. autoclass:: hash_grid_query_t", file=file)
-    print(".. autoclass:: mesh_query_aabb_t", file=file)
-    print(".. autoclass:: mesh_query_point_t", file=file)
-    print(".. autoclass:: mesh_query_ray_t", file=file)
-
     # build dictionary of all functions by group
     groups = {}

@@ -5252,8 +5435,17 @@ def export_functions_rst(file):  # pragma: no cover
         for o in f.overloads:
             groups[f.group].append(o)

-    # Keep track of what function
-    written_functions =
+    # Keep track of what function and query types have been written
+    written_functions = set()
+    written_query_types = set()
+
+    query_types = (
+        ("bvh_query", "BvhQuery"),
+        ("mesh_query_aabb", "MeshQueryAABB"),
+        ("mesh_query_point", "MeshQueryPoint"),
+        ("mesh_query_ray", "MeshQueryRay"),
+        ("hash_grid_query", "HashGridQuery"),
+    )

     for k, g in groups.items():
         print("\n", file=file)

@@ -5261,12 +5453,18 @@ def export_functions_rst(file):  # pragma: no cover
         print("---------------", file=file)

         for f in g:
+            for f_prefix, query_type in query_types:
+                if f.key.startswith(f_prefix) and query_type not in written_query_types:
+                    print(f".. autoclass:: {query_type}", file=file)
+                    written_query_types.add(query_type)
+                    break
+
             if f.key in written_functions:
                 # Add :noindex: + :nocontentsentry: since Sphinx gets confused
                 print_function(f, file=file, noentry=True)
             else:
                 if print_function(f, file=file):
-                    written_functions
+                    written_functions.add(f.key)

     # footnotes
     print(".. rubric:: Footnotes", file=file)
@@ -5327,7 +5525,7 @@ def export_stubs(file):  # pragma: no cover
     try:
         # todo: construct a default value for each of the functions args
         # so we can generate the return type for overloaded functions
-        return_type = f.value_func(None, None
+        return_type = f.value_func(None, None)
         if return_type:
             return_str = " -> " + type_str(return_type)

@@ -5373,21 +5571,25 @@ def export_builtins(file: io.TextIOBase):  # pragma: no cover
         if not f.is_simple():
             continue

-        args = ", ".join(f"{ctype_arg_str(v)} {k}" for k, v in f.input_types.items())
-        params = ", ".join(f.input_types.keys())
-
-        return_type = ""
-
         try:
             # todo: construct a default value for each of the functions args
             # so we can generate the return type for overloaded functions
-            return_type = ctype_ret_str(f.value_func(None, None
+            return_type = ctype_ret_str(f.value_func(None, None))
         except Exception:
             continue

         if return_type.startswith("Tuple"):
             continue

+        # Runtime arguments that are to be passed to the function, not its template signature.
+        if f.export_func is not None:
+            func_args = f.export_func(f.input_types)
+        else:
+            func_args = f.input_types
+
+        args = ", ".join(f"{ctype_arg_str(v)} {k}" for k, v in func_args.items())
+        params = ", ".join(func_args.keys())
+
         if args == "":
             file.write(f"WP_API void {f.mangled_name}({return_type}* ret) {{ *ret = wp::{f.key}({params}); }}\n")
         elif return_type == "None":