warp-lang 1.0.0b2-py3-none-win_amd64.whl → 1.0.0b6-py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (271)
  1. docs/conf.py +17 -5
  2. examples/env/env_ant.py +1 -1
  3. examples/env/env_cartpole.py +1 -1
  4. examples/env/env_humanoid.py +1 -1
  5. examples/env/env_usd.py +4 -1
  6. examples/env/environment.py +8 -9
  7. examples/example_dem.py +34 -33
  8. examples/example_diffray.py +364 -337
  9. examples/example_fluid.py +32 -23
  10. examples/example_jacobian_ik.py +97 -93
  11. examples/example_marching_cubes.py +6 -16
  12. examples/example_mesh.py +6 -16
  13. examples/example_mesh_intersect.py +16 -14
  14. examples/example_nvdb.py +14 -16
  15. examples/example_raycast.py +14 -13
  16. examples/example_raymarch.py +16 -23
  17. examples/example_render_opengl.py +19 -10
  18. examples/example_sim_cartpole.py +82 -78
  19. examples/example_sim_cloth.py +45 -48
  20. examples/example_sim_fk_grad.py +51 -44
  21. examples/example_sim_fk_grad_torch.py +47 -40
  22. examples/example_sim_grad_bounce.py +108 -133
  23. examples/example_sim_grad_cloth.py +99 -113
  24. examples/example_sim_granular.py +5 -6
  25. examples/{example_sim_sdf_shape.py → example_sim_granular_collision_sdf.py} +37 -26
  26. examples/example_sim_neo_hookean.py +51 -55
  27. examples/example_sim_particle_chain.py +4 -4
  28. examples/example_sim_quadruped.py +126 -81
  29. examples/example_sim_rigid_chain.py +54 -61
  30. examples/example_sim_rigid_contact.py +66 -70
  31. examples/example_sim_rigid_fem.py +3 -3
  32. examples/example_sim_rigid_force.py +1 -1
  33. examples/example_sim_rigid_gyroscopic.py +3 -4
  34. examples/example_sim_rigid_kinematics.py +28 -39
  35. examples/example_sim_trajopt.py +112 -110
  36. examples/example_sph.py +9 -8
  37. examples/example_wave.py +7 -7
  38. examples/fem/bsr_utils.py +30 -17
  39. examples/fem/example_apic_fluid.py +85 -69
  40. examples/fem/example_convection_diffusion.py +97 -93
  41. examples/fem/example_convection_diffusion_dg.py +142 -149
  42. examples/fem/example_convection_diffusion_dg0.py +141 -136
  43. examples/fem/example_deformed_geometry.py +146 -0
  44. examples/fem/example_diffusion.py +115 -84
  45. examples/fem/example_diffusion_3d.py +116 -86
  46. examples/fem/example_diffusion_mgpu.py +102 -79
  47. examples/fem/example_mixed_elasticity.py +139 -100
  48. examples/fem/example_navier_stokes.py +175 -162
  49. examples/fem/example_stokes.py +143 -111
  50. examples/fem/example_stokes_transfer.py +186 -157
  51. examples/fem/mesh_utils.py +59 -97
  52. examples/fem/plot_utils.py +138 -17
  53. tools/ci/publishing/build_nodes_info.py +54 -0
  54. warp/__init__.py +4 -3
  55. warp/__init__.pyi +1 -0
  56. warp/bin/warp-clang.dll +0 -0
  57. warp/bin/warp.dll +0 -0
  58. warp/build.py +5 -3
  59. warp/build_dll.py +29 -9
  60. warp/builtins.py +836 -492
  61. warp/codegen.py +864 -553
  62. warp/config.py +3 -1
  63. warp/context.py +389 -172
  64. warp/fem/__init__.py +24 -6
  65. warp/fem/cache.py +318 -25
  66. warp/fem/dirichlet.py +7 -3
  67. warp/fem/domain.py +14 -0
  68. warp/fem/field/__init__.py +30 -38
  69. warp/fem/field/field.py +149 -0
  70. warp/fem/field/nodal_field.py +244 -138
  71. warp/fem/field/restriction.py +8 -6
  72. warp/fem/field/test.py +127 -59
  73. warp/fem/field/trial.py +117 -60
  74. warp/fem/geometry/__init__.py +5 -1
  75. warp/fem/geometry/deformed_geometry.py +271 -0
  76. warp/fem/geometry/element.py +24 -1
  77. warp/fem/geometry/geometry.py +86 -14
  78. warp/fem/geometry/grid_2d.py +112 -54
  79. warp/fem/geometry/grid_3d.py +134 -65
  80. warp/fem/geometry/hexmesh.py +953 -0
  81. warp/fem/geometry/partition.py +85 -33
  82. warp/fem/geometry/quadmesh_2d.py +532 -0
  83. warp/fem/geometry/tetmesh.py +451 -115
  84. warp/fem/geometry/trimesh_2d.py +197 -92
  85. warp/fem/integrate.py +534 -268
  86. warp/fem/operator.py +58 -31
  87. warp/fem/polynomial.py +11 -0
  88. warp/fem/quadrature/__init__.py +1 -1
  89. warp/fem/quadrature/pic_quadrature.py +150 -58
  90. warp/fem/quadrature/quadrature.py +209 -57
  91. warp/fem/space/__init__.py +230 -53
  92. warp/fem/space/basis_space.py +489 -0
  93. warp/fem/space/collocated_function_space.py +105 -0
  94. warp/fem/space/dof_mapper.py +49 -2
  95. warp/fem/space/function_space.py +90 -39
  96. warp/fem/space/grid_2d_function_space.py +149 -496
  97. warp/fem/space/grid_3d_function_space.py +173 -538
  98. warp/fem/space/hexmesh_function_space.py +352 -0
  99. warp/fem/space/partition.py +129 -76
  100. warp/fem/space/quadmesh_2d_function_space.py +369 -0
  101. warp/fem/space/restriction.py +46 -34
  102. warp/fem/space/shape/__init__.py +15 -0
  103. warp/fem/space/shape/cube_shape_function.py +738 -0
  104. warp/fem/space/shape/shape_function.py +103 -0
  105. warp/fem/space/shape/square_shape_function.py +611 -0
  106. warp/fem/space/shape/tet_shape_function.py +567 -0
  107. warp/fem/space/shape/triangle_shape_function.py +429 -0
  108. warp/fem/space/tetmesh_function_space.py +132 -1039
  109. warp/fem/space/topology.py +295 -0
  110. warp/fem/space/trimesh_2d_function_space.py +104 -742
  111. warp/fem/types.py +13 -11
  112. warp/fem/utils.py +335 -60
  113. warp/native/array.h +120 -34
  114. warp/native/builtin.h +101 -72
  115. warp/native/bvh.cpp +73 -325
  116. warp/native/bvh.cu +406 -23
  117. warp/native/bvh.h +22 -40
  118. warp/native/clang/clang.cpp +1 -0
  119. warp/native/crt.h +2 -0
  120. warp/native/cuda_util.cpp +8 -3
  121. warp/native/cuda_util.h +1 -0
  122. warp/native/exports.h +1522 -1243
  123. warp/native/intersect.h +19 -4
  124. warp/native/intersect_adj.h +8 -8
  125. warp/native/mat.h +76 -17
  126. warp/native/mesh.cpp +33 -108
  127. warp/native/mesh.cu +114 -18
  128. warp/native/mesh.h +395 -40
  129. warp/native/noise.h +272 -329
  130. warp/native/quat.h +51 -8
  131. warp/native/rand.h +44 -34
  132. warp/native/reduce.cpp +1 -1
  133. warp/native/sparse.cpp +4 -4
  134. warp/native/sparse.cu +163 -155
  135. warp/native/spatial.h +2 -2
  136. warp/native/temp_buffer.h +18 -14
  137. warp/native/vec.h +103 -21
  138. warp/native/warp.cpp +2 -1
  139. warp/native/warp.cu +28 -3
  140. warp/native/warp.h +4 -3
  141. warp/render/render_opengl.py +261 -109
  142. warp/sim/__init__.py +1 -2
  143. warp/sim/articulation.py +385 -185
  144. warp/sim/import_mjcf.py +59 -48
  145. warp/sim/import_urdf.py +15 -15
  146. warp/sim/import_usd.py +174 -102
  147. warp/sim/inertia.py +17 -18
  148. warp/sim/integrator_xpbd.py +4 -3
  149. warp/sim/model.py +330 -250
  150. warp/sim/render.py +1 -1
  151. warp/sparse.py +625 -152
  152. warp/stubs.py +341 -309
  153. warp/tape.py +9 -6
  154. warp/tests/__main__.py +3 -6
  155. warp/tests/assets/curlnoise_golden.npy +0 -0
  156. warp/tests/assets/pnoise_golden.npy +0 -0
  157. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  158. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  159. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  160. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  161. warp/tests/aux_test_unresolved_func.py +14 -0
  162. warp/tests/aux_test_unresolved_symbol.py +14 -0
  163. warp/tests/disabled_kinematics.py +239 -0
  164. warp/tests/run_coverage_serial.py +31 -0
  165. warp/tests/test_adam.py +103 -106
  166. warp/tests/test_arithmetic.py +94 -74
  167. warp/tests/test_array.py +82 -101
  168. warp/tests/test_array_reduce.py +57 -23
  169. warp/tests/test_atomic.py +64 -28
  170. warp/tests/test_bool.py +22 -12
  171. warp/tests/test_builtins_resolution.py +1292 -0
  172. warp/tests/test_bvh.py +18 -18
  173. warp/tests/test_closest_point_edge_edge.py +54 -57
  174. warp/tests/test_codegen.py +165 -134
  175. warp/tests/test_compile_consts.py +28 -20
  176. warp/tests/test_conditional.py +108 -24
  177. warp/tests/test_copy.py +10 -12
  178. warp/tests/test_ctypes.py +112 -88
  179. warp/tests/test_dense.py +21 -14
  180. warp/tests/test_devices.py +98 -0
  181. warp/tests/test_dlpack.py +75 -75
  182. warp/tests/test_examples.py +237 -0
  183. warp/tests/test_fabricarray.py +22 -24
  184. warp/tests/test_fast_math.py +15 -11
  185. warp/tests/test_fem.py +1034 -124
  186. warp/tests/test_fp16.py +23 -16
  187. warp/tests/test_func.py +187 -86
  188. warp/tests/test_generics.py +194 -49
  189. warp/tests/test_grad.py +123 -181
  190. warp/tests/test_grad_customs.py +176 -0
  191. warp/tests/test_hash_grid.py +35 -34
  192. warp/tests/test_import.py +10 -23
  193. warp/tests/test_indexedarray.py +24 -25
  194. warp/tests/test_intersect.py +18 -9
  195. warp/tests/test_large.py +141 -0
  196. warp/tests/test_launch.py +14 -41
  197. warp/tests/test_lerp.py +64 -65
  198. warp/tests/test_lvalue.py +493 -0
  199. warp/tests/test_marching_cubes.py +12 -13
  200. warp/tests/test_mat.py +517 -2898
  201. warp/tests/test_mat_lite.py +115 -0
  202. warp/tests/test_mat_scalar_ops.py +2889 -0
  203. warp/tests/test_math.py +103 -9
  204. warp/tests/test_matmul.py +304 -69
  205. warp/tests/test_matmul_lite.py +410 -0
  206. warp/tests/test_mesh.py +60 -22
  207. warp/tests/test_mesh_query_aabb.py +21 -25
  208. warp/tests/test_mesh_query_point.py +111 -22
  209. warp/tests/test_mesh_query_ray.py +12 -24
  210. warp/tests/test_mlp.py +30 -22
  211. warp/tests/test_model.py +92 -89
  212. warp/tests/test_modules_lite.py +39 -0
  213. warp/tests/test_multigpu.py +88 -114
  214. warp/tests/test_noise.py +12 -11
  215. warp/tests/test_operators.py +16 -20
  216. warp/tests/test_options.py +11 -11
  217. warp/tests/test_pinned.py +17 -18
  218. warp/tests/test_print.py +32 -11
  219. warp/tests/test_quat.py +275 -129
  220. warp/tests/test_rand.py +18 -16
  221. warp/tests/test_reload.py +38 -34
  222. warp/tests/test_rounding.py +50 -43
  223. warp/tests/test_runlength_encode.py +168 -20
  224. warp/tests/test_smoothstep.py +9 -11
  225. warp/tests/test_snippet.py +143 -0
  226. warp/tests/test_sparse.py +261 -63
  227. warp/tests/test_spatial.py +276 -243
  228. warp/tests/test_streams.py +110 -85
  229. warp/tests/test_struct.py +268 -63
  230. warp/tests/test_tape.py +39 -21
  231. warp/tests/test_torch.py +90 -86
  232. warp/tests/test_transient_module.py +10 -12
  233. warp/tests/test_types.py +363 -0
  234. warp/tests/test_utils.py +451 -0
  235. warp/tests/test_vec.py +354 -2050
  236. warp/tests/test_vec_lite.py +73 -0
  237. warp/tests/test_vec_scalar_ops.py +2099 -0
  238. warp/tests/test_volume.py +418 -376
  239. warp/tests/test_volume_write.py +124 -134
  240. warp/tests/unittest_serial.py +35 -0
  241. warp/tests/unittest_suites.py +291 -0
  242. warp/tests/unittest_utils.py +342 -0
  243. warp/tests/{test_misc.py → unused_test_misc.py} +13 -5
  244. warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
  245. warp/thirdparty/appdirs.py +36 -45
  246. warp/thirdparty/unittest_parallel.py +589 -0
  247. warp/types.py +622 -211
  248. warp/utils.py +54 -393
  249. warp_lang-1.0.0b6.dist-info/METADATA +238 -0
  250. warp_lang-1.0.0b6.dist-info/RECORD +409 -0
  251. {warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/WHEEL +1 -1
  252. examples/example_cache_management.py +0 -40
  253. examples/example_multigpu.py +0 -54
  254. examples/example_struct.py +0 -65
  255. examples/fem/example_stokes_transfer_3d.py +0 -210
  256. warp/bin/warp-clang.so +0 -0
  257. warp/bin/warp.so +0 -0
  258. warp/fem/field/discrete_field.py +0 -80
  259. warp/fem/space/nodal_function_space.py +0 -233
  260. warp/tests/test_all.py +0 -223
  261. warp/tests/test_array_scan.py +0 -60
  262. warp/tests/test_base.py +0 -208
  263. warp/tests/test_unresolved_func.py +0 -7
  264. warp/tests/test_unresolved_symbol.py +0 -7
  265. warp_lang-1.0.0b2.dist-info/METADATA +0 -26
  266. warp_lang-1.0.0b2.dist-info/RECORD +0 -380
  267. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  268. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  269. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  270. {warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/LICENSE.md +0 -0
  271. {warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/top_level.txt +0 -0
warp/context.py CHANGED
@@ -7,8 +7,10 @@
 
 import ast
 import ctypes
+import gc
 import hashlib
 import inspect
+import io
 import os
 import platform
 import sys
@@ -68,6 +70,8 @@ class Function:
         native_func=None,
         defaults=None,
         custom_replay_func=None,
+        native_snippet=None,
+        adj_native_snippet=None,
         skip_forward_codegen=False,
         skip_reverse_codegen=False,
         custom_reverse_num_input_args=-1,
@@ -75,6 +79,7 @@ class Function:
         overloaded_annotations=None,
         code_transformers=[],
         skip_adding_overload=False,
+        require_original_output_arg=False,
     ):
         self.func = func  # points to Python function decorated with @wp.func, may be None for builtins
         self.key = key
@@ -90,7 +95,10 @@ class Function:
         self.defaults = defaults
         # Function instance for a custom implementation of the replay pass
         self.custom_replay_func = custom_replay_func
+        self.native_snippet = native_snippet
+        self.adj_native_snippet = adj_native_snippet
         self.custom_grad_func = None
+        self.require_original_output_arg = require_original_output_arg
 
         if initializer_list_func is None:
             self.initializer_list_func = lambda x, y: False
@@ -170,121 +178,24 @@ class Function:
         # from within a kernel (experimental).
 
         if self.is_builtin() and self.mangled_name:
-            # store last error during overload resolution
-            error = None
-
-            for f in self.overloads:
-                if f.generic:
+            # For each of this function's existing overloads, we attempt to pack
+            # the given arguments into the C types expected by the corresponding
+            # parameters, and we rinse and repeat until we get a match.
+            for overload in self.overloads:
+                if overload.generic:
                     continue
 
-                # try and find builtin in the warp.dll
-                if not hasattr(warp.context.runtime.core, f.mangled_name):
-                    raise RuntimeError(
-                        f"Couldn't find function {self.key} with mangled name {f.mangled_name} in the Warp native library"
-                    )
-
-                try:
-                    # try and pack args into what the function expects
-                    params = []
-                    for i, (arg_name, arg_type) in enumerate(f.input_types.items()):
-                        a = args[i]
-
-                        # try to convert to a value type (vec3, mat33, etc)
-                        if issubclass(arg_type, ctypes.Array):
-                            # wrap the arg_type (which is an ctypes.Array) in a structure
-                            # to ensure parameter is passed to the .dll by value rather than reference
-                            class ValueArg(ctypes.Structure):
-                                _fields_ = [("value", arg_type)]
-
-                            x = ValueArg()
-
-                            # force conversion to ndarray first (handles tuple / list, Gf.Vec3 case)
-                            if isinstance(a, ctypes.Array) is False:
-                                # assume you want the float32 version of the function so it doesn't just
-                                # grab an override for a random data type:
-                                if arg_type._type_ != ctypes.c_float:
-                                    raise RuntimeError(
-                                        f"Error calling function '{f.key}', parameter for argument '{arg_name}' does not have c_float type."
-                                    )
-
-                                a = np.array(a)
-
-                                # flatten to 1D array
-                                v = a.flatten()
-                                if len(v) != arg_type._length_:
-                                    raise RuntimeError(
-                                        f"Error calling function '{f.key}', parameter for argument '{arg_name}' has length {len(v)}, but expected {arg_type._length_}. Could not convert parameter to {arg_type}."
-                                    )
-
-                                for i in range(arg_type._length_):
-                                    x.value[i] = v[i]
-
-                            else:
-                                # already a built-in type, check it matches
-                                if not warp.types.types_equal(type(a), arg_type):
-                                    raise RuntimeError(
-                                        f"Error calling function '{f.key}', parameter for argument '{arg_name}' has type '{type(a)}' but expected '{arg_type}'"
-                                    )
-
-                                x.value = a
-
-                            params.append(x)
-
-                        else:
-                            try:
-                                # try to pack as a scalar type
-                                params.append(arg_type._type_(a))
-                            except Exception:
-                                raise RuntimeError(
-                                    f"Error calling function {f.key}, unable to pack function parameter type {type(a)} for param {arg_name}, expected {arg_type}"
-                                )
-
-                    # returns the corresponding ctype for a scalar or vector warp type
-                    def type_ctype(dtype):
-                        if dtype == float:
-                            return ctypes.c_float
-                        elif dtype == int:
-                            return ctypes.c_int32
-                        elif issubclass(dtype, ctypes.Array):
-                            return dtype
-                        elif issubclass(dtype, ctypes.Structure):
-                            return dtype
-                        else:
-                            # scalar type
-                            return dtype._type_
-
-                    value_type = type_ctype(f.value_func(None, None, None))
-
-                    # construct return value (passed by address)
-                    ret = value_type()
-                    ret_addr = ctypes.c_void_p(ctypes.addressof(ret))
-
-                    params.append(ret_addr)
-
-                    c_func = getattr(warp.context.runtime.core, f.mangled_name)
-                    c_func(*params)
-
-                    if issubclass(value_type, ctypes.Array) or issubclass(value_type, ctypes.Structure):
-                        # return vector types as ctypes
-                        return ret
-                    else:
-                        # return scalar types as int/float
-                        return ret.value
-
-                except Exception as e:
-                    # couldn't pack values to match this overload
-                    # store error and move onto the next one
-                    error = e
-                    continue
+                success, return_value = call_builtin(overload, *args)
+                if success:
+                    return return_value
 
             # overload resolution or call failed
-            # raise the last exception encountered
-            if error:
-                raise error
-            else:
-                raise RuntimeError(f"Error calling function '{f.key}'.")
+            raise RuntimeError(
+                f"Couldn't find a function '{self.key}' compatible with "
+                f"the arguments '{', '.join(type(x).__name__ for x in args)}'"
+            )
 
-        elif hasattr(self, "user_overloads") and len(self.user_overloads):
+        if hasattr(self, "user_overloads") and len(self.user_overloads):
             # user-defined function with overloads
 
             if len(kwargs):
@@ -293,28 +204,26 @@ class Function:
                 )
 
             # try and find a matching overload
-            for f in self.user_overloads.values():
-                if len(f.input_types) != len(args):
+            for overload in self.user_overloads.values():
+                if len(overload.input_types) != len(args):
                     continue
-                template_types = list(f.input_types.values())
-                arg_names = list(f.input_types.keys())
+                template_types = list(overload.input_types.values())
+                arg_names = list(overload.input_types.keys())
                 try:
                     # attempt to unify argument types with function template types
                     warp.types.infer_argument_types(args, template_types, arg_names)
-                    return f.func(*args)
+                    return overload.func(*args)
                 except Exception:
                     continue
 
             raise RuntimeError(f"Error calling function '{self.key}', no overload found for arguments {args}")
 
-        else:
-            # user-defined function with no overloads
-
-            if self.func is None:
-                raise RuntimeError(f"Error calling function '{self.key}', function is undefined")
+        # user-defined function with no overloads
+        if self.func is None:
+            raise RuntimeError(f"Error calling function '{self.key}', function is undefined")
 
-            # this function has no overloads, call it like a plain Python function
-            return self.func(*args, **kwargs)
+        # this function has no overloads, call it like a plain Python function
+        return self.func(*args, **kwargs)
 
     def is_builtin(self):
         return self.func is None
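With the `else:` branch flattened out above, a user-defined `@wp.func` with no overloads is simply invoked as a plain Python function. A minimal sketch (the function itself is illustrative, not from this diff):

    import warp as wp

    @wp.func
    def sqr(x: float):
        return x * x

    # no overloads registered, so Function.__call__ falls through to self.func
    print(sqr(3.0))  # 9.0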
@@ -427,10 +336,188 @@ class Function:
         return None
 
     def __repr__(self):
-        inputs_str = ", ".join([f"{k}: {v.__name__}" for k, v in self.input_types.items()])
+        inputs_str = ", ".join([f"{k}: {warp.types.type_repr(v)}" for k, v in self.input_types.items()])
         return f"<Function {self.key}({inputs_str})>"
 
 
+def call_builtin(func: Function, *params) -> Tuple[bool, Any]:
+    uses_non_warp_array_type = False
+
+    # Retrieve the built-in function from Warp's dll.
+    c_func = getattr(warp.context.runtime.core, func.mangled_name)
+
+    # Try gathering the parameters that the function expects and pack them
+    # into their corresponding C types.
+    c_params = []
+    for i, (_, arg_type) in enumerate(func.input_types.items()):
+        param = params[i]
+
+        try:
+            iter(param)
+        except TypeError:
+            is_array = False
+        else:
+            is_array = True
+
+        if is_array:
+            if not issubclass(arg_type, ctypes.Array):
+                return (False, None)
+
+            # The argument expects a built-in Warp type like a vector or a matrix.
+
+            c_param = None
+
+            if isinstance(param, ctypes.Array):
+                # The given parameter is also a built-in Warp type, so we only need
+                # to make sure that it matches with the argument.
+                if not warp.types.types_equal(type(param), arg_type):
+                    return (False, None)
+
+                if isinstance(param, arg_type):
+                    c_param = param
+                else:
+                    # Cast the value to its argument type to make sure that it
+                    # can be assigned to the field of the `Param` struct.
+                    # This could error otherwise when, for example, the field type
+                    # is set to `vec3i` while the value is of type `vector(length=3, dtype=int)`,
+                    # even though both types are semantically identical.
+                    c_param = arg_type(param)
+            else:
+                # Flatten the parameter values into a flat 1-D array.
+                arr = []
+                ndim = 1
+                stack = [(0, param)]
+                while stack:
+                    depth, elem = stack.pop(0)
+                    try:
+                        # If `elem` is a sequence, then it should be possible
+                        # to add its elements to the stack for later processing.
+                        stack.extend((depth + 1, x) for x in elem)
+                    except TypeError:
+                        # Since `elem` doesn't seem to be a sequence,
+                        # we must have a leaf value that we need to add to our
+                        # resulting array.
+                        arr.append(elem)
+                        ndim = max(depth, ndim)
+
+                assert ndim > 0
+
+                # Ensure that if the given parameter value is, say, a 2-D array,
+                # then we try to resolve it against a matrix argument rather than
+                # a vector.
+                if ndim > len(arg_type._shape_):
+                    return (False, None)
+
+                elem_count = len(arr)
+                if elem_count != arg_type._length_:
+                    return (False, None)
+
+                # Retrieve the element type of the sequence while ensuring
+                # that it's homogeneous.
+                elem_type = type(arr[0])
+                for i in range(1, elem_count):
+                    if type(arr[i]) is not elem_type:
+                        raise ValueError("All array elements must share the same type.")
+
+                expected_elem_type = arg_type._wp_scalar_type_
+                if not (
+                    elem_type is expected_elem_type
+                    or (elem_type is float and expected_elem_type is warp.types.float32)
+                    or (elem_type is int and expected_elem_type is warp.types.int32)
+                    or (
+                        issubclass(elem_type, np.number)
+                        and warp.types.np_dtype_to_warp_type[np.dtype(elem_type)] is expected_elem_type
+                    )
+                ):
+                    # The parameter value has a type not matching the type defined
+                    # for the corresponding argument.
+                    return (False, None)
+
+                if elem_type in warp.types.int_types:
+                    # Pass the value through the expected integer type
+                    # in order to evaluate any integer wrapping.
+                    # For example `uint8(-1)` should result in the value `-255`.
+                    arr = tuple(elem_type._type_(x.value).value for x in arr)
+                elif elem_type in warp.types.float_types:
+                    # Extract the floating-point values.
+                    arr = tuple(x.value for x in arr)
+
+                c_param = arg_type()
+                if warp.types.type_is_matrix(arg_type):
+                    rows, cols = arg_type._shape_
+                    for i in range(rows):
+                        idx_start = i * cols
+                        idx_end = idx_start + cols
+                        c_param[i] = arr[idx_start:idx_end]
+                else:
+                    c_param[:] = arr
+
+                uses_non_warp_array_type = True
+
+            c_params.append(ctypes.byref(c_param))
+        else:
+            if issubclass(arg_type, ctypes.Array):
+                return (False, None)
+
+            if not (
+                isinstance(param, arg_type)
+                or (type(param) is float and arg_type is warp.types.float32)
+                or (type(param) is int and arg_type is warp.types.int32)
+                or warp.types.np_dtype_to_warp_type.get(getattr(param, "dtype", None)) is arg_type
+            ):
+                return (False, None)
+
+            if type(param) in warp.types.scalar_types:
+                param = param.value
+
+            # try to pack as a scalar type
+            if arg_type == warp.types.float16:
+                c_params.append(arg_type._type_(warp.types.float_to_half_bits(param)))
+            else:
+                c_params.append(arg_type._type_(param))
+
+    # returns the corresponding ctype for a scalar or vector warp type
+    value_type = func.value_func(None, None, None)
+    if value_type == float:
+        value_ctype = ctypes.c_float
+    elif value_type == int:
+        value_ctype = ctypes.c_int32
+    elif issubclass(value_type, (ctypes.Array, ctypes.Structure)):
+        value_ctype = value_type
+    else:
+        # scalar type
+        value_ctype = value_type._type_
+
+    # construct return value (passed by address)
+    ret = value_ctype()
+    ret_addr = ctypes.c_void_p(ctypes.addressof(ret))
+    c_params.append(ret_addr)
+
+    # Call the built-in function from Warp's dll.
+    c_func(*c_params)
+
+    # TODO: uncomment when we have a way to print warning messages only once.
+    # if uses_non_warp_array_type:
+    #     warp.utils.warn(
+    #         "Support for built-in functions called with non-Warp array types, "
+    #         "such as lists, tuples, NumPy arrays, and others, will be dropped "
+    #         "in the future. Use a Warp type such as `wp.vec`, `wp.mat`, "
+    #         "`wp.quat`, or `wp.transform`.",
+    #         DeprecationWarning,
+    #         stacklevel=3
+    #     )
+
+    if issubclass(value_ctype, ctypes.Array) or issubclass(value_ctype, ctypes.Structure):
+        # return vector types as ctypes
+        return (True, ret)
+
+    if value_type == warp.types.float16:
+        return (True, warp.types.half_bits_to_float(ret.value))
+
+    # return scalar types as int/float
+    return (True, ret.value)
+
+
 class KernelHooks:
     def __init__(self, forward, backward):
         self.forward = forward
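The new `call_builtin()` helper is what makes builtins callable from the Python interpreter; a short sketch of both accepted forms (values are illustrative):

    import warp as wp

    wp.init()

    # exact overload match: the vec3 is packed by value, a float is returned
    print(wp.length(wp.vec3(3.0, 4.0, 0.0)))  # 5.0

    # plain sequences are flattened and matched against the vec3 overload;
    # per the TODO above, this path is slated for deprecation
    print(wp.length((3.0, 4.0, 0.0)))  # 5.0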
@@ -439,10 +526,20 @@ class KernelHooks:
 
 # caches source and compiled entry points for a kernel (will be populated after module loads)
 class Kernel:
-    def __init__(self, func, key, module, options=None, code_transformers=[]):
+    def __init__(self, func, key=None, module=None, options=None, code_transformers=[]):
         self.func = func
-        self.module = module
-        self.key = key
+
+        if module is None:
+            self.module = get_module(func.__module__)
+        else:
+            self.module = module
+
+        if key is None:
+            unique_key = self.module.generate_unique_kernel_key(func.__name__)
+            self.key = unique_key
+        else:
+            self.key = key
+
         self.options = {} if options is None else options
 
         self.adj = warp.codegen.Adjoint(func, transformers=code_transformers)
@@ -463,8 +560,8 @@ class Kernel:
         # argument indices by name
         self.arg_indices = dict((a.label, i) for i, a in enumerate(self.adj.args))
 
-        if module:
-            module.register_kernel(self)
+        if self.module:
+            self.module.register_kernel(self)
 
     def infer_argument_types(self, args):
         template_types = list(self.adj.arg_types.values())
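Since `key` and `module` are now optional, a `Kernel` can be built straight from an annotated Python function; a sketch, assuming `Kernel` is reached via `warp.context` (the kernel itself is illustrative):

    import warp as wp

    wp.init()

    def scale(x: wp.array(dtype=float), s: float):
        i = wp.tid()
        x[i] = x[i] * s

    # module inferred from scale.__module__, key auto-generated via
    # Module.generate_unique_kernel_key()
    kernel = wp.context.Kernel(func=scale)

    x = wp.array([1.0, 2.0, 3.0], dtype=float)
    wp.launch(kernel, dim=x.size, inputs=[x, 2.0])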
@@ -541,7 +638,7 @@ def func(f):
     name = warp.codegen.make_full_qualified_name(f)
 
     m = get_module(f.__module__)
-    func = Function(
+    Function(
         func=f, key=name, namespace="", module=m, value_func=None
     )  # value_type not known yet, will be inferred during Adjoint.build()
 
@@ -549,6 +646,24 @@ def func(f):
     return m.functions[name]
 
 
+def func_native(snippet, adj_snippet=None):
+    """
+    Decorator to register native code snippet, @func_native
+    """
+
+    def snippet_func(f):
+        name = warp.codegen.make_full_qualified_name(f)
+
+        m = get_module(f.__module__)
+        func = Function(
+            func=f, key=name, namespace="", module=m, native_snippet=snippet, adj_native_snippet=adj_snippet
+        )  # cuda snippets do not have a return value_type
+
+        return m.functions[name]
+
+    return snippet_func
+
+
 def func_grad(forward_fn):
     """
     Decorator to register a custom gradient function for a given forward function.
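The new `@wp.func_native` decorator above registers a raw C++/CUDA snippet as a callable function; a minimal sketch modeled on the new `warp/tests/test_snippet.py` added in this release (the snippet body and kernel are illustrative):

    import warp as wp

    wp.init()

    snippet = """
        out[tid] = a * x[tid] + y[tid];
    """

    @wp.func_native(snippet)
    def saxpy(
        a: wp.float32,
        x: wp.array(dtype=wp.float32),
        y: wp.array(dtype=wp.float32),
        out: wp.array(dtype=wp.float32),
        tid: int,
    ):
        ...

    @wp.kernel
    def saxpy_kernel(
        a: wp.float32,
        x: wp.array(dtype=wp.float32),
        y: wp.array(dtype=wp.float32),
        out: wp.array(dtype=wp.float32),
    ):
        tid = wp.tid()
        saxpy(a, x, y, out, tid)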
@@ -819,6 +934,7 @@ def add_builtin(
     missing_grad=False,
     native_func=None,
     defaults=None,
+    require_original_output_arg=False,
 ):
     # wrap simple single-type functions with a value_func()
     if value_func is None:
@@ -912,7 +1028,7 @@
             # on the generated argument list and skip generation if it fails.
             # This also gives us the return type, which we keep for later:
             try:
-                return_type = value_func([warp.codegen.Var("", t) for t in argtypes], {}, [])
+                return_type = value_func(argtypes, {}, [])
             except Exception:
                 continue
 
@@ -943,6 +1059,7 @@
                 hidden=True,
                 skip_replay=skip_replay,
                 missing_grad=missing_grad,
+                require_original_output_arg=require_original_output_arg,
             )
 
     func = Function(
@@ -963,6 +1080,7 @@
         generic=generic,
         native_func=native_func,
        defaults=defaults,
+        require_original_output_arg=require_original_output_arg,
     )
 
     if key in builtin_functions:
@@ -972,7 +1090,7 @@
 
     # export means the function will be added to the `warp` module namespace
     # so that users can call it directly from the Python interpreter
-    if export is True:
+    if export:
         if hasattr(warp, key):
             # check that we haven't already created something at this location
             # if it's just an overload stub for auto-complete then overwrite it
@@ -1057,8 +1175,7 @@ class ModuleBuilder:
         while stack:
             s = stack.pop()
 
-            if s not in structs:
-                structs.append(s)
+            structs.append(s)
 
             for var in s.vars.values():
                 if isinstance(var.type, warp.codegen.Struct):
@@ -1090,7 +1207,7 @@
         if not func.value_func:
 
            def wrap(adj):
-                def value_type(args, kwds, templates):
+                def value_type(arg_types, kwds, templates):
                     if adj.return_var is None or len(adj.return_var) == 0:
                         return None
                     if len(adj.return_var) == 1:
@@ -1114,9 +1231,14 @@
 
         # code-gen all imported functions
         for func in self.functions.keys():
-            source += warp.codegen.codegen_func(
-                func.adj, c_func_name=func.native_func, device=device, options=self.options
-            )
+            if func.native_snippet is None:
+                source += warp.codegen.codegen_func(
+                    func.adj, c_func_name=func.native_func, device=device, options=self.options
+                )
+            else:
+                source += warp.codegen.codegen_snippet(
+                    func.adj, name=func.key, snippet=func.native_snippet, adj_snippet=func.adj_native_snippet
+                )
 
         for kernel in self.module.kernels.values():
             # each kernel gets an entry point in the module
@@ -1196,6 +1318,10 @@ class Module:
 
         self.content_hash = None
 
+        # number of times module auto-generates kernel key for user
+        # used to ensure unique kernel keys
+        self.count = 0
+
     def register_struct(self, struct):
         self.structs[struct.key] = struct
 
@@ -1238,6 +1364,11 @@
         # for a reload of module on next launch
         self.unload()
 
+    def generate_unique_kernel_key(self, key):
+        unique_key = f"{key}_{self.count}"
+        self.count += 1
+        return unique_key
+
     # collect all referenced functions / structs
     # given the AST of a function or kernel
     def find_references(self, adj):
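The per-module counter above keeps auto-generated kernel keys unique; a quick sketch of its behavior:

    import warp as wp

    module = wp.context.get_module(__name__)
    print(module.generate_unique_kernel_key("scale"))  # scale_0
    print(module.generate_unique_kernel_key("scale"))  # scale_1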
@@ -1251,7 +1382,7 @@
             if isinstance(node, ast.Call):
                 try:
                     # try to resolve the function
-                    func, _ = adj.resolve_path(node.func)
+                    func, _ = adj.resolve_static_expression(node.func, eval_types=False)
 
                     # if this is a user-defined function, add a module reference
                     if isinstance(func, warp.context.Function) and func.module is not None:
@@ -1304,9 +1435,24 @@
             s = func.adj.source
             ch.update(bytes(s, "utf-8"))
 
+            if func.custom_grad_func:
+                s = func.custom_grad_func.adj.source
+                ch.update(bytes(s, "utf-8"))
+            if func.custom_replay_func:
+                s = func.custom_replay_func.adj.source
+
+            # cache func arg types
+            for arg, arg_type in func.adj.arg_types.items():
+                s = f"{arg}: {get_type_name(arg_type)}"
+                ch.update(bytes(s, "utf-8"))
+
        # kernel source
        for kernel in module.kernels.values():
            ch.update(bytes(kernel.adj.source, "utf-8"))
+            # cache kernel arg types
+            for arg, arg_type in kernel.adj.arg_types.items():
+                s = f"{arg}: {get_type_name(arg_type)}"
+                ch.update(bytes(s, "utf-8"))
            # for generic kernels the Python source is always the same,
            # but we hash the type signatures of all the overloads
            if kernel.is_generic:
@@ -1605,13 +1751,13 @@ class ContextGuard:
     def __enter__(self):
         if self.device.is_cuda:
             runtime.core.cuda_context_push_current(self.device.context)
-        elif is_cuda_available():
+        elif is_cuda_driver_initialized():
             self.saved_context = runtime.core.cuda_context_get_current()
 
     def __exit__(self, exc_type, exc_value, traceback):
         if self.device.is_cuda:
             runtime.core.cuda_context_pop_current()
-        elif is_cuda_available():
+        elif is_cuda_driver_initialized():
             runtime.core.cuda_context_set_current(self.saved_context)
 
 
@@ -1896,7 +2042,7 @@ class Runtime:
 
         self.core = self.load_dll(warp_lib)
 
-        if llvm_lib and os.path.exists(llvm_lib):
+        if os.path.exists(llvm_lib):
             self.llvm = self.load_dll(llvm_lib)
             # setup c-types for warp-clang.dll
             self.llvm.lookup.restype = ctypes.c_uint64
@@ -2262,6 +2408,8 @@ class Runtime:
         self.core.cuda_driver_version.restype = ctypes.c_int
         self.core.cuda_toolkit_version.argtypes = None
         self.core.cuda_toolkit_version.restype = ctypes.c_int
+        self.core.cuda_driver_is_initialized.argtypes = None
+        self.core.cuda_driver_is_initialized.restype = ctypes.c_bool
 
         self.core.nvrtc_supported_arch_count.argtypes = None
         self.core.nvrtc_supported_arch_count.restype = ctypes.c_int
@@ -2364,6 +2512,7 @@ class Runtime:
             ctypes.c_void_p,
             ctypes.c_void_p,
             ctypes.c_size_t,
+            ctypes.c_int,
             ctypes.POINTER(ctypes.c_void_p),
         ]
         self.core.cuda_launch_kernel.restype = ctypes.c_size_t
@@ -2484,8 +2633,15 @@ class Runtime:
                 dll = ctypes.CDLL(dll_path, winmode=0)
             else:
                 dll = ctypes.CDLL(dll_path)
-        except OSError:
-            raise RuntimeError(f"Failed to load the shared library '{dll_path}'")
+        except OSError as e:
+            if "GLIBCXX" in str(e):
+                raise RuntimeError(
+                    f"Failed to load the shared library '{dll_path}'.\n"
+                    "The execution environment's libstdc++ runtime is older than the version the Warp library was built for.\n"
+                    "See https://nvidia.github.io/warp/_build/html/installation.html#conda-environments for details."
+                ) from e
+            else:
+                raise RuntimeError(f"Failed to load the shared library '{dll_path}'") from e
         return dll
 
     def get_device(self, ident: Devicelike = None) -> Device:
@@ -2614,6 +2770,21 @@ def is_device_available(device):
     return device in get_devices()
 
 
+def is_cuda_driver_initialized() -> bool:
+    """Returns ``True`` if the CUDA driver is initialized.
+
+    This is a stricter test than ``is_cuda_available()`` since a CUDA driver
+    call to ``cuCtxGetCurrent`` is made, and the result is compared to
+    `CUDA_SUCCESS`. Note that `CUDA_SUCCESS` is returned by ``cuCtxGetCurrent``
+    even if there is no context bound to the calling CPU thread.
+
+    This can be helpful in cases in which ``cuInit()`` was called before a fork.
+    """
+    assert_initialized()
+
+    return runtime.core.cuda_driver_is_initialized()
+
+
 def get_devices() -> List[Device]:
     """Returns a list of devices supported in this environment."""
 
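A hypothetical usage sketch for the stricter check, e.g. guarding CUDA context work after a fork:

    import warp as wp

    wp.init()

    # unlike is_cuda_available(), this issues a real driver call, so it can
    # detect that cuInit() state from before a fork is not usable here
    if wp.context.is_cuda_driver_initialized():
        wp.synchronize()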
@@ -3090,9 +3261,9 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
             # - in forward passes, array types have to match
             # - in backward passes, indexed array gradients are regular arrays
             if adjoint:
-                array_matches = type(value) == warp.array
+                array_matches = isinstance(value, warp.array)
             else:
-                array_matches = type(value) == type(arg_type)
+                array_matches = type(value) is type(arg_type)
 
             if not array_matches:
                 adj = "adjoint " if adjoint else ""
@@ -3172,7 +3343,7 @@
 # represents all data required for a kernel launch
 # so that launches can be replayed quickly, use `wp.launch(..., record_cmd=True)`
 class Launch:
-    def __init__(self, kernel, device, hooks=None, params=None, params_addr=None, bounds=None):
+    def __init__(self, kernel, device, hooks=None, params=None, params_addr=None, bounds=None, max_blocks=0):
         # if not specified look up hooks
         if not hooks:
             module = kernel.module
@@ -3209,6 +3380,7 @@ class Launch:
         self.params_addr = params_addr
         self.device = device
         self.bounds = bounds
+        self.max_blocks = max_blocks
 
     def set_dim(self, dim):
         self.bounds = warp.types.launch_bounds_t(dim)
@@ -3274,7 +3446,9 @@ class Launch:
         if self.device.is_cpu:
             self.hooks.forward(*self.params)
         else:
-            runtime.core.cuda_launch_kernel(self.device.context, self.hooks.forward, self.bounds.size, self.params_addr)
+            runtime.core.cuda_launch_kernel(
+                self.device.context, self.hooks.forward, self.bounds.size, self.max_blocks, self.params_addr
+            )
 
 
 def launch(
@@ -3289,6 +3463,7 @@ def launch(
     adjoint=False,
     record_tape=True,
    record_cmd=False,
+    max_blocks=0,
 ):
     """Launch a Warp kernel on the target device
 
@@ -3306,6 +3481,8 @@ def launch(
         adjoint: Whether to run forward or backward pass (typically use False)
         record_tape: When true the launch will be recorded the global wp.Tape() object when present
         record_cmd: When True the launch will be returned as a ``Launch`` command object, the launch will not occur until the user calls ``cmd.launch()``
+        max_blocks: The maximum number of CUDA thread blocks to use. Only has an effect for CUDA kernel launches.
+            If negative or zero, the maximum hardware value will be used.
     """
 
     assert_initialized()
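A sketch of the new `max_blocks` argument in use (the kernel and sizes are illustrative):

    import warp as wp

    wp.init()

    @wp.kernel
    def fill(a: wp.array(dtype=float), value: float):
        i = wp.tid()
        a[i] = value

    a = wp.zeros(1_000_000, dtype=float, device="cuda:0")

    # cap the launch at 256 thread blocks; the default of 0 lets the runtime
    # use the maximum hardware value
    wp.launch(fill, dim=a.size, inputs=[a, 2.0], device="cuda:0", max_blocks=256)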
@@ -3317,7 +3494,7 @@ def launch(
     device = runtime.get_device(device)
 
     # check function is a Kernel
-    if isinstance(kernel, Kernel) is False:
+    if not isinstance(kernel, Kernel):
         raise RuntimeError("Error launching kernel, can only launch functions decorated with @wp.kernel.")
 
     # debugging aid
@@ -3399,7 +3576,9 @@ def launch(
                         f"Failed to find backward kernel '{kernel.key}' from module '{kernel.module.name}' for device '{device}'"
                     )
 
-                runtime.core.cuda_launch_kernel(device.context, hooks.backward, bounds.size, kernel_params)
+                runtime.core.cuda_launch_kernel(
+                    device.context, hooks.backward, bounds.size, max_blocks, kernel_params
+                )
 
             else:
                 if hooks.forward is None:
@@ -3420,7 +3599,9 @@ def launch(
 
                 else:
                     # launch
-                    runtime.core.cuda_launch_kernel(device.context, hooks.forward, bounds.size, kernel_params)
+                    runtime.core.cuda_launch_kernel(
+                        device.context, hooks.forward, bounds.size, max_blocks, kernel_params
+                    )
 
     try:
         runtime.verify_cuda_device(device)
@@ -3430,7 +3611,7 @@ def launch(
 
     # record on tape if one is active
     if runtime.tape and record_tape:
-        runtime.tape.record_launch(kernel, dim, inputs, outputs, device)
+        runtime.tape.record_launch(kernel, dim, max_blocks, inputs, outputs, device)
 
 
 def synchronize():
@@ -3440,7 +3621,7 @@ def synchronize():
     or memory copies have completed.
     """
 
-    if is_cuda_available():
+    if is_cuda_driver_initialized():
        # save the original context to avoid side effects
        saved_context = runtime.core.cuda_context_get_current()
 
@@ -3490,7 +3671,7 @@ def synchronize_stream(stream_or_device=None):
     runtime.core.cuda_stream_synchronize(stream.device.context, stream.cuda_stream)
 
 
-def force_load(device: Union[Device, str] = None, modules: List[Module] = None):
+def force_load(device: Union[Device, str, List[Device], List[str]] = None, modules: List[Module] = None):
     """Force user-defined kernels to be compiled and loaded
 
     Args:
@@ -3498,12 +3679,14 @@ def force_load(device: Union[Device, str] = None, modules: List[Module] = None):
         modules: List of modules to load. If None, load all imported modules.
     """
 
-    if is_cuda_available():
+    if is_cuda_driver_initialized():
         # save original context to avoid side effects
         saved_context = runtime.core.cuda_context_get_current()
 
     if device is None:
         devices = get_devices()
+    elif isinstance(device, list):
+        devices = [get_device(device_item) for device_item in device]
     else:
         devices = [get_device(device)]
 
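`force_load()` now also accepts a list of devices; a sketch:

    import warp as wp

    wp.init()

    # compile and load all imported modules for several devices at once
    wp.force_load(device=["cpu", "cuda:0"])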
@@ -3595,7 +3778,7 @@ def get_module_options(module: Optional[Any] = None) -> Dict[str, Any]:
     return get_module(m.__name__).options
 
 
-def capture_begin(device: Devicelike = None, stream=None, force_module_load=True):
+def capture_begin(device: Devicelike = None, stream=None, force_module_load=None):
     """Begin capture of a CUDA graph
 
     Captures all subsequent kernel launches and memory operations on CUDA devices.
@@ -3609,7 +3792,10 @@ def capture_begin(device: Devicelike = None, stream=None, force_module_load=None):
 
     """
 
-    if warp.config.verify_cuda is True:
+    if force_module_load is None:
+        force_module_load = warp.config.graph_capture_module_load_default
+
+    if warp.config.verify_cuda:
         raise RuntimeError("Cannot use CUDA error verification during graph capture")
 
     if stream is not None:
@@ -3624,6 +3810,9 @@ def capture_begin(device: Devicelike = None, stream=None, force_module_load=None):
 
     device.is_capturing = True
 
+    # disable garbage collection to avoid older allocations getting collected during graph capture
+    gc.disable()
+
     with warp.ScopedStream(stream):
         runtime.core.cuda_graph_begin_capture(device.context)
 
@@ -3647,6 +3836,9 @@ def capture_end(device: Devicelike = None, stream=None) -> Graph:
 
     device.is_capturing = False
 
+    # re-enable GC
+    gc.enable()
+
     if graph is None:
         raise RuntimeError(
             "Error occurred during CUDA graph capture. This could be due to an unintended allocation or CPU/GPU synchronization event."
@@ -3841,7 +4033,7 @@ def type_str(t):
         return t.__name__
 
 
-def print_function(f, file, noentry=False):
+def print_function(f, file, noentry=False):  # pragma: no cover
     """Writes a function definition to a file for use in reST documentation
 
     Args:
@@ -3886,7 +4078,7 @@ def print_function(f, file, noentry=False):
     return True
 
 
-def print_builtins(file):
+def export_functions_rst(file):  # pragma: no cover
     header = (
         "..\n"
         " Autogenerated File - Do not edit. Run build_docs.py to generate.\n"
@@ -3906,6 +4098,8 @@
 
     for t in warp.types.scalar_types:
         print(f".. class:: {t.__name__}", file=file)
+    # Manually add wp.bool since it's inconvenient to add to wp.types.scalar_types:
+    print(f".. class:: {warp.types.bool.__name__}", file=file)
 
     print("\n\nVector Types", file=file)
     print("------------", file=file)
@@ -3925,6 +4119,14 @@
     print(".. class:: Transformation", file=file)
     print(".. class:: Array", file=file)
 
+    print("\nQuery Types", file=file)
+    print("-------------", file=file)
+    print(".. autoclass:: bvh_query_t", file=file)
+    print(".. autoclass:: hash_grid_query_t", file=file)
+    print(".. autoclass:: mesh_query_aabb_t", file=file)
+    print(".. autoclass:: mesh_query_point_t", file=file)
+    print(".. autoclass:: mesh_query_ray_t", file=file)
+
     # build dictionary of all functions by group
     groups = {}
 
@@ -3958,7 +4160,7 @@
     print(".. [1] Note: function gradients not implemented for backpropagation.", file=file)
 
 
-def export_stubs(file):
+def export_stubs(file):  # pragma: no cover
     """Generates stub file for auto-complete of builtin functions"""
 
     import textwrap
@@ -3990,6 +4192,8 @@ def export_stubs(file):
     print("Quaternion = Generic[Float]", file=file)
     print("Transformation = Generic[Float]", file=file)
     print("Array = Generic[DType]", file=file)
+    print("FabricArray = Generic[DType]", file=file)
+    print("IndexedFabricArray = Generic[DType]", file=file)
 
     # prepend __init__.py
     with open(os.path.join(os.path.dirname(file.name), "__init__.py")) as header_file:
@@ -4006,7 +4210,7 @@ def export_stubs(file):
 
         return_str = ""
 
-        if f.export is False or f.hidden is True:  # or f.generic:
+        if not f.export or f.hidden:  # or f.generic:
             continue
 
         try:
@@ -4027,8 +4231,18 @@
         print(" ...\n\n", file=file)
 
 
-def export_builtins(file):
-    def ctype_str(t):
+def export_builtins(file: io.TextIOBase):  # pragma: no cover
+    def ctype_arg_str(t):
+        if isinstance(t, int):
+            return "int"
+        elif isinstance(t, float):
+            return "float"
+        elif t in warp.types.vector_types:
+            return f"{t.__name__}&"
+        else:
+            return t.__name__
+
+    def ctype_ret_str(t):
         if isinstance(t, int):
             return "int"
         elif isinstance(t, float):
@@ -4036,9 +4250,12 @@ def export_builtins(file):
         else:
             return t.__name__
 
+    file.write("namespace wp {\n\n")
+    file.write('extern "C" {\n\n')
+
     for k, g in builtin_functions.items():
         for f in g.overloads:
-            if f.export is False or f.generic:
+            if not f.export or f.generic:
                 continue
 
             simple = True
@@ -4052,7 +4269,7 @@ def export_builtins(file):
             if not simple or f.variadic:
                 continue
 
-            args = ", ".join(f"{ctype_str(v)} {k}" for k, v in f.input_types.items())
+            args = ", ".join(f"{ctype_arg_str(v)} {k}" for k, v in f.input_types.items())
             params = ", ".join(f.input_types.keys())
 
             return_type = ""
@@ -4060,7 +4277,7 @@ def export_builtins(file):
 
             try:
                 # todo: construct a default value for each of the functions args
                 # so we can generate the return type for overloaded functions
-                return_type = ctype_str(f.value_func(None, None, None))
+                return_type = ctype_ret_str(f.value_func(None, None, None))
             except Exception:
                 continue
@@ -4068,17 +4285,17 @@ def export_builtins(file):
                 continue
 
             if args == "":
-                print(
-                    f"WP_API void {f.mangled_name}({return_type}* ret) {{ *ret = wp::{f.key}({params}); }}", file=file
-                )
+                file.write(f"WP_API void {f.mangled_name}({return_type}* ret) {{ *ret = wp::{f.key}({params}); }}\n")
             elif return_type == "None":
-                print(f"WP_API void {f.mangled_name}({args}) {{ wp::{f.key}({params}); }}", file=file)
+                file.write(f"WP_API void {f.mangled_name}({args}) {{ wp::{f.key}({params}); }}\n")
             else:
-                print(
-                    f"WP_API void {f.mangled_name}({args}, {return_type}* ret) {{ *ret = wp::{f.key}({params}); }}",
-                    file=file,
+                file.write(
+                    f"WP_API void {f.mangled_name}({args}, {return_type}* ret) {{ *ret = wp::{f.key}({params}); }}\n"
                 )
 
+    file.write('\n} // extern "C"\n\n')
+    file.write("} // namespace wp\n")
+
 
 # initialize global runtime
 runtime = None
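`export_builtins()` now takes a text stream and wraps the generated wrappers in `namespace wp { extern "C" { ... } }` (presumably how `warp/native/exports.h` in this release was regenerated); a sketch of driving it by hand:

    import io

    import warp as wp

    wp.init()

    buf = io.StringIO()
    wp.context.export_builtins(buf)

    # the generated source now begins with the namespace prologue
    print(buf.getvalue().splitlines()[0])  # namespace wp {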