warp-lang 0.10.1__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (300)
  1. warp/__init__.py +10 -4
  2. warp/__init__.pyi +1 -0
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +5 -3
  6. warp/build_dll.py +29 -9
  7. warp/builtins.py +868 -507
  8. warp/codegen.py +1074 -638
  9. warp/config.py +3 -3
  10. warp/constants.py +6 -0
  11. warp/context.py +715 -222
  12. warp/fabric.py +326 -0
  13. warp/fem/__init__.py +27 -0
  14. warp/fem/cache.py +389 -0
  15. warp/fem/dirichlet.py +181 -0
  16. warp/fem/domain.py +263 -0
  17. warp/fem/field/__init__.py +101 -0
  18. warp/fem/field/field.py +149 -0
  19. warp/fem/field/nodal_field.py +299 -0
  20. warp/fem/field/restriction.py +21 -0
  21. warp/fem/field/test.py +181 -0
  22. warp/fem/field/trial.py +183 -0
  23. warp/fem/geometry/__init__.py +19 -0
  24. warp/fem/geometry/closest_point.py +70 -0
  25. warp/fem/geometry/deformed_geometry.py +271 -0
  26. warp/fem/geometry/element.py +744 -0
  27. warp/fem/geometry/geometry.py +186 -0
  28. warp/fem/geometry/grid_2d.py +373 -0
  29. warp/fem/geometry/grid_3d.py +435 -0
  30. warp/fem/geometry/hexmesh.py +953 -0
  31. warp/fem/geometry/partition.py +376 -0
  32. warp/fem/geometry/quadmesh_2d.py +532 -0
  33. warp/fem/geometry/tetmesh.py +840 -0
  34. warp/fem/geometry/trimesh_2d.py +577 -0
  35. warp/fem/integrate.py +1616 -0
  36. warp/fem/operator.py +191 -0
  37. warp/fem/polynomial.py +213 -0
  38. warp/fem/quadrature/__init__.py +2 -0
  39. warp/fem/quadrature/pic_quadrature.py +245 -0
  40. warp/fem/quadrature/quadrature.py +294 -0
  41. warp/fem/space/__init__.py +292 -0
  42. warp/fem/space/basis_space.py +489 -0
  43. warp/fem/space/collocated_function_space.py +105 -0
  44. warp/fem/space/dof_mapper.py +236 -0
  45. warp/fem/space/function_space.py +145 -0
  46. warp/fem/space/grid_2d_function_space.py +267 -0
  47. warp/fem/space/grid_3d_function_space.py +306 -0
  48. warp/fem/space/hexmesh_function_space.py +352 -0
  49. warp/fem/space/partition.py +350 -0
  50. warp/fem/space/quadmesh_2d_function_space.py +369 -0
  51. warp/fem/space/restriction.py +160 -0
  52. warp/fem/space/shape/__init__.py +15 -0
  53. warp/fem/space/shape/cube_shape_function.py +738 -0
  54. warp/fem/space/shape/shape_function.py +103 -0
  55. warp/fem/space/shape/square_shape_function.py +611 -0
  56. warp/fem/space/shape/tet_shape_function.py +567 -0
  57. warp/fem/space/shape/triangle_shape_function.py +429 -0
  58. warp/fem/space/tetmesh_function_space.py +292 -0
  59. warp/fem/space/topology.py +295 -0
  60. warp/fem/space/trimesh_2d_function_space.py +221 -0
  61. warp/fem/types.py +77 -0
  62. warp/fem/utils.py +495 -0
  63. warp/native/array.h +147 -44
  64. warp/native/builtin.h +122 -149
  65. warp/native/bvh.cpp +73 -325
  66. warp/native/bvh.cu +406 -23
  67. warp/native/bvh.h +34 -43
  68. warp/native/clang/clang.cpp +13 -8
  69. warp/native/crt.h +2 -0
  70. warp/native/cuda_crt.h +5 -0
  71. warp/native/cuda_util.cpp +15 -3
  72. warp/native/cuda_util.h +3 -1
  73. warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
  74. warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
  75. warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
  76. warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
  77. warp/native/cutlass/tools/library/scripts/library.py +799 -0
  78. warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
  79. warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
  80. warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
  81. warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
  82. warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
  83. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
  84. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
  85. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
  86. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
  87. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
  88. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
  89. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
  90. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
  91. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
  92. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
  93. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
  94. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
  95. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
  96. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
  97. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
  98. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
  99. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
  100. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
  101. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
  102. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
  103. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
  104. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
  105. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
  106. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
  107. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
  108. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
  109. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
  110. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
  111. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
  112. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
  113. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
  114. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
  115. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  116. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  117. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
  118. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
  119. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
  120. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  121. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  122. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
  123. warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
  124. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
  125. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
  126. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
  127. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
  128. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
  129. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
  130. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
  131. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
  132. warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
  133. warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
  134. warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
  135. warp/native/cutlass/tools/library/scripts/rt.py +796 -0
  136. warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
  137. warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
  138. warp/native/cutlass_gemm.cu +5 -3
  139. warp/native/exports.h +1240 -952
  140. warp/native/fabric.h +228 -0
  141. warp/native/hashgrid.cpp +4 -4
  142. warp/native/hashgrid.h +22 -2
  143. warp/native/intersect.h +22 -7
  144. warp/native/intersect_adj.h +8 -8
  145. warp/native/intersect_tri.h +1 -1
  146. warp/native/marching.cu +157 -161
  147. warp/native/mat.h +80 -19
  148. warp/native/matnn.h +2 -2
  149. warp/native/mesh.cpp +33 -108
  150. warp/native/mesh.cu +114 -23
  151. warp/native/mesh.h +446 -46
  152. warp/native/noise.h +272 -329
  153. warp/native/quat.h +51 -8
  154. warp/native/rand.h +45 -35
  155. warp/native/range.h +6 -2
  156. warp/native/reduce.cpp +1 -1
  157. warp/native/reduce.cu +10 -12
  158. warp/native/runlength_encode.cu +6 -10
  159. warp/native/scan.cu +8 -11
  160. warp/native/sparse.cpp +4 -4
  161. warp/native/sparse.cu +164 -154
  162. warp/native/spatial.h +2 -2
  163. warp/native/temp_buffer.h +14 -30
  164. warp/native/vec.h +107 -23
  165. warp/native/volume.h +120 -0
  166. warp/native/warp.cpp +560 -30
  167. warp/native/warp.cu +431 -44
  168. warp/native/warp.h +13 -4
  169. warp/optim/__init__.py +1 -0
  170. warp/optim/linear.py +922 -0
  171. warp/optim/sgd.py +92 -0
  172. warp/render/render_opengl.py +335 -119
  173. warp/render/render_usd.py +11 -11
  174. warp/sim/__init__.py +2 -2
  175. warp/sim/articulation.py +385 -185
  176. warp/sim/collide.py +8 -0
  177. warp/sim/import_mjcf.py +297 -106
  178. warp/sim/import_urdf.py +389 -210
  179. warp/sim/import_usd.py +198 -97
  180. warp/sim/inertia.py +17 -18
  181. warp/sim/integrator_euler.py +14 -8
  182. warp/sim/integrator_xpbd.py +158 -16
  183. warp/sim/model.py +795 -291
  184. warp/sim/render.py +3 -3
  185. warp/sim/utils.py +3 -0
  186. warp/sparse.py +640 -150
  187. warp/stubs.py +606 -267
  188. warp/tape.py +61 -10
  189. warp/tests/__main__.py +3 -6
  190. warp/tests/assets/curlnoise_golden.npy +0 -0
  191. warp/tests/assets/pnoise_golden.npy +0 -0
  192. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  193. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  194. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  195. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  196. warp/tests/aux_test_unresolved_func.py +14 -0
  197. warp/tests/aux_test_unresolved_symbol.py +14 -0
  198. warp/tests/disabled_kinematics.py +239 -0
  199. warp/tests/run_coverage_serial.py +31 -0
  200. warp/tests/test_adam.py +103 -106
  201. warp/tests/test_arithmetic.py +128 -74
  202. warp/tests/test_array.py +212 -97
  203. warp/tests/test_array_reduce.py +57 -23
  204. warp/tests/test_atomic.py +64 -28
  205. warp/tests/test_bool.py +99 -0
  206. warp/tests/test_builtins_resolution.py +1292 -0
  207. warp/tests/test_bvh.py +42 -18
  208. warp/tests/test_closest_point_edge_edge.py +54 -57
  209. warp/tests/test_codegen.py +208 -130
  210. warp/tests/test_compile_consts.py +28 -20
  211. warp/tests/test_conditional.py +108 -24
  212. warp/tests/test_copy.py +10 -12
  213. warp/tests/test_ctypes.py +112 -88
  214. warp/tests/test_dense.py +21 -14
  215. warp/tests/test_devices.py +98 -0
  216. warp/tests/test_dlpack.py +75 -75
  217. warp/tests/test_examples.py +277 -0
  218. warp/tests/test_fabricarray.py +955 -0
  219. warp/tests/test_fast_math.py +15 -11
  220. warp/tests/test_fem.py +1271 -0
  221. warp/tests/test_fp16.py +53 -19
  222. warp/tests/test_func.py +187 -86
  223. warp/tests/test_generics.py +194 -49
  224. warp/tests/test_grad.py +178 -109
  225. warp/tests/test_grad_customs.py +176 -0
  226. warp/tests/test_hash_grid.py +52 -37
  227. warp/tests/test_import.py +10 -23
  228. warp/tests/test_indexedarray.py +32 -31
  229. warp/tests/test_intersect.py +18 -9
  230. warp/tests/test_large.py +141 -0
  231. warp/tests/test_launch.py +14 -41
  232. warp/tests/test_lerp.py +64 -65
  233. warp/tests/test_linear_solvers.py +154 -0
  234. warp/tests/test_lvalue.py +493 -0
  235. warp/tests/test_marching_cubes.py +12 -13
  236. warp/tests/test_mat.py +517 -2898
  237. warp/tests/test_mat_lite.py +115 -0
  238. warp/tests/test_mat_scalar_ops.py +2889 -0
  239. warp/tests/test_math.py +103 -9
  240. warp/tests/test_matmul.py +305 -69
  241. warp/tests/test_matmul_lite.py +410 -0
  242. warp/tests/test_mesh.py +71 -14
  243. warp/tests/test_mesh_query_aabb.py +41 -25
  244. warp/tests/test_mesh_query_point.py +140 -22
  245. warp/tests/test_mesh_query_ray.py +39 -22
  246. warp/tests/test_mlp.py +30 -22
  247. warp/tests/test_model.py +92 -89
  248. warp/tests/test_modules_lite.py +39 -0
  249. warp/tests/test_multigpu.py +88 -114
  250. warp/tests/test_noise.py +12 -11
  251. warp/tests/test_operators.py +16 -20
  252. warp/tests/test_options.py +11 -11
  253. warp/tests/test_pinned.py +17 -18
  254. warp/tests/test_print.py +32 -11
  255. warp/tests/test_quat.py +275 -129
  256. warp/tests/test_rand.py +18 -16
  257. warp/tests/test_reload.py +38 -34
  258. warp/tests/test_rounding.py +50 -43
  259. warp/tests/test_runlength_encode.py +168 -20
  260. warp/tests/test_smoothstep.py +9 -11
  261. warp/tests/test_snippet.py +143 -0
  262. warp/tests/test_sparse.py +261 -63
  263. warp/tests/test_spatial.py +276 -243
  264. warp/tests/test_streams.py +110 -85
  265. warp/tests/test_struct.py +268 -63
  266. warp/tests/test_tape.py +39 -21
  267. warp/tests/test_torch.py +118 -89
  268. warp/tests/test_transient_module.py +12 -13
  269. warp/tests/test_types.py +614 -0
  270. warp/tests/test_utils.py +494 -0
  271. warp/tests/test_vec.py +354 -2050
  272. warp/tests/test_vec_lite.py +73 -0
  273. warp/tests/test_vec_scalar_ops.py +2099 -0
  274. warp/tests/test_volume.py +457 -293
  275. warp/tests/test_volume_write.py +124 -134
  276. warp/tests/unittest_serial.py +35 -0
  277. warp/tests/unittest_suites.py +341 -0
  278. warp/tests/unittest_utils.py +568 -0
  279. warp/tests/unused_test_misc.py +71 -0
  280. warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
  281. warp/thirdparty/appdirs.py +36 -45
  282. warp/thirdparty/unittest_parallel.py +549 -0
  283. warp/torch.py +9 -6
  284. warp/types.py +1089 -366
  285. warp/utils.py +93 -387
  286. warp_lang-0.11.0.dist-info/METADATA +238 -0
  287. warp_lang-0.11.0.dist-info/RECORD +332 -0
  288. {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
  289. warp/tests/test_all.py +0 -219
  290. warp/tests/test_array_scan.py +0 -60
  291. warp/tests/test_base.py +0 -208
  292. warp/tests/test_unresolved_func.py +0 -7
  293. warp/tests/test_unresolved_symbol.py +0 -7
  294. warp_lang-0.10.1.dist-info/METADATA +0 -21
  295. warp_lang-0.10.1.dist-info/RECORD +0 -188
  296. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  297. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  298. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  299. {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/LICENSE.md +0 -0
  300. {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0
warp/context.py CHANGED
@@ -5,36 +5,27 @@
  # distribution of this software and related documentation without an express
  # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
- import math
- import os
- import sys
- import hashlib
+ import ast
  import ctypes
+ import gc
+ import hashlib
+ import inspect
+ import io
+ import os
  import platform
- import ast
+ import sys
  import types
- import inspect
-
- from typing import Tuple
- from typing import List
- from typing import Dict
- from typing import Any
- from typing import Callable
- from typing import Union
- from typing import Mapping
- from typing import Optional
-
+ from copy import copy as shallowcopy
  from types import ModuleType
+ from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union
 
- from copy import copy as shallowcopy
+ import numpy as np
 
  import warp
- import warp.codegen
  import warp.build
+ import warp.codegen
  import warp.config
 
- import numpy as np
-
  # represents either a built-in or user-defined function
 
 
@@ -45,6 +36,18 @@ def create_value_func(type):
  return value_func
 
 
+ def get_function_args(func):
+ """Ensures that all function arguments are annotated and returns a dictionary mapping from argument name to its type."""
+ import inspect
+
+ argspec = inspect.getfullargspec(func)
+
+ # use source-level argument annotations
+ if len(argspec.annotations) < len(argspec.args):
+ raise RuntimeError(f"Incomplete argument annotations on function {func.__qualname__}")
+ return argspec.annotations
+
+
  class Function:
  def __init__(
  self,
@@ -66,8 +69,17 @@ class Function:
  generic=False,
  native_func=None,
  defaults=None,
+ custom_replay_func=None,
+ native_snippet=None,
+ adj_native_snippet=None,
+ skip_forward_codegen=False,
+ skip_reverse_codegen=False,
+ custom_reverse_num_input_args=-1,
+ custom_reverse_mode=False,
  overloaded_annotations=None,
  code_transformers=[],
+ skip_adding_overload=False,
+ require_original_output_arg=False,
  ):
  self.func = func # points to Python function decorated with @wp.func, may be None for builtins
  self.key = key
@@ -81,6 +93,12 @@ class Function:
  self.module = module
  self.variadic = variadic # function can take arbitrary number of inputs, e.g.: printf()
  self.defaults = defaults
+ # Function instance for a custom implementation of the replay pass
+ self.custom_replay_func = custom_replay_func
+ self.native_snippet = native_snippet
+ self.adj_native_snippet = adj_native_snippet
+ self.custom_grad_func = None
+ self.require_original_output_arg = require_original_output_arg
 
  if initializer_list_func is None:
  self.initializer_list_func = lambda x, y: False
@@ -110,7 +128,14 @@ class Function:
 
  # user defined (Python) function
  self.adj = warp.codegen.Adjoint(
- func, overload_annotations=overloaded_annotations, transformers=code_transformers
+ func,
+ is_user_function=True,
+ skip_forward_codegen=skip_forward_codegen,
+ skip_reverse_codegen=skip_reverse_codegen,
+ custom_reverse_num_input_args=custom_reverse_num_input_args,
+ custom_reverse_mode=custom_reverse_mode,
+ overload_annotations=overloaded_annotations,
+ transformers=code_transformers,
  )
 
  # record input types
@@ -139,11 +164,12 @@ class Function:
  else:
  self.mangled_name = None
 
- self.add_overload(self)
+ if not skip_adding_overload:
+ self.add_overload(self)
 
  # add to current module
  if module:
- module.register_function(self)
+ module.register_function(self, skip_adding_overload)
 
  def __call__(self, *args, **kwargs):
  # handles calling a builtin (native) function
@@ -152,121 +178,24 @@ class Function:
  # from within a kernel (experimental).
 
  if self.is_builtin() and self.mangled_name:
- # store last error during overload resolution
- error = None
-
- for f in self.overloads:
- if f.generic:
+ # For each of this function's existing overloads, we attempt to pack
+ # the given arguments into the C types expected by the corresponding
+ # parameters, and we rinse and repeat until we get a match.
+ for overload in self.overloads:
+ if overload.generic:
  continue
 
- # try and find builtin in the warp.dll
- if not hasattr(warp.context.runtime.core, f.mangled_name):
- raise RuntimeError(
- f"Couldn't find function {self.key} with mangled name {f.mangled_name} in the Warp native library"
- )
-
- try:
- # try and pack args into what the function expects
- params = []
- for i, (arg_name, arg_type) in enumerate(f.input_types.items()):
- a = args[i]
-
- # try to convert to a value type (vec3, mat33, etc)
- if issubclass(arg_type, ctypes.Array):
- # wrap the arg_type (which is an ctypes.Array) in a structure
- # to ensure parameter is passed to the .dll by value rather than reference
- class ValueArg(ctypes.Structure):
- _fields_ = [("value", arg_type)]
-
- x = ValueArg()
-
- # force conversion to ndarray first (handles tuple / list, Gf.Vec3 case)
- if isinstance(a, ctypes.Array) == False:
- # assume you want the float32 version of the function so it doesn't just
- # grab an override for a random data type:
- if arg_type._type_ != ctypes.c_float:
- raise RuntimeError(
- f"Error calling function '{f.key}', parameter for argument '{arg_name}' does not have c_float type."
- )
-
- a = np.array(a)
-
- # flatten to 1D array
- v = a.flatten()
- if len(v) != arg_type._length_:
- raise RuntimeError(
- f"Error calling function '{f.key}', parameter for argument '{arg_name}' has length {len(v)}, but expected {arg_type._length_}. Could not convert parameter to {arg_type}."
- )
-
- for i in range(arg_type._length_):
- x.value[i] = v[i]
-
- else:
- # already a built-in type, check it matches
- if not warp.types.types_equal(type(a), arg_type):
- raise RuntimeError(
- f"Error calling function '{f.key}', parameter for argument '{arg_name}' has type '{type(a)}' but expected '{arg_type}'"
- )
-
- x.value = a
-
- params.append(x)
-
- else:
- try:
- # try to pack as a scalar type
- params.append(arg_type._type_(a))
- except:
- raise RuntimeError(
- f"Error calling function {f.key}, unable to pack function parameter type {type(a)} for param {arg_name}, expected {arg_type}"
- )
-
- # returns the corresponding ctype for a scalar or vector warp type
- def type_ctype(dtype):
- if dtype == float:
- return ctypes.c_float
- elif dtype == int:
- return ctypes.c_int32
- elif issubclass(dtype, ctypes.Array):
- return dtype
- elif issubclass(dtype, ctypes.Structure):
- return dtype
- else:
- # scalar type
- return dtype._type_
-
- value_type = type_ctype(f.value_func(None, None, None))
-
- # construct return value (passed by address)
- ret = value_type()
- ret_addr = ctypes.c_void_p(ctypes.addressof(ret))
-
- params.append(ret_addr)
-
- c_func = getattr(warp.context.runtime.core, f.mangled_name)
- c_func(*params)
-
- if issubclass(value_type, ctypes.Array) or issubclass(value_type, ctypes.Structure):
- # return vector types as ctypes
- return ret
- else:
- # return scalar types as int/float
- return ret.value
-
- except Exception as e:
- # couldn't pack values to match this overload
- # store error and move onto the next one
- error = e
- continue
+ success, return_value = call_builtin(overload, *args)
+ if success:
+ return return_value
 
  # overload resolution or call failed
- # raise the last exception encountered
- if error:
- raise error
- else:
- raise RuntimeError(f"Error calling function '{f.key}'.")
+ raise RuntimeError(
+ f"Couldn't find a function '{self.key}' compatible with "
+ f"the arguments '{', '.join(type(x).__name__ for x in args)}'"
+ )
 
- elif hasattr(self, "user_overloads") and len(self.user_overloads):
+ if hasattr(self, "user_overloads") and len(self.user_overloads):
  # user-defined function with overloads
 
  if len(kwargs):
@@ -275,28 +204,26 @@ class Function:
  )
 
  # try and find a matching overload
- for f in self.user_overloads.values():
- if len(f.input_types) != len(args):
+ for overload in self.user_overloads.values():
+ if len(overload.input_types) != len(args):
  continue
- template_types = list(f.input_types.values())
- arg_names = list(f.input_types.keys())
+ template_types = list(overload.input_types.values())
+ arg_names = list(overload.input_types.keys())
  try:
  # attempt to unify argument types with function template types
  warp.types.infer_argument_types(args, template_types, arg_names)
- return f.func(*args)
+ return overload.func(*args)
  except Exception:
  continue
 
  raise RuntimeError(f"Error calling function '{self.key}', no overload found for arguments {args}")
 
- else:
- # user-defined function with no overloads
-
- if self.func is None:
- raise RuntimeError(f"Error calling function '{self.key}', function is undefined")
+ # user-defined function with no overloads
+ if self.func is None:
+ raise RuntimeError(f"Error calling function '{self.key}', function is undefined")
 
- # this function has no overloads, call it like a plain Python function
- return self.func(*args, **kwargs)
+ # this function has no overloads, call it like a plain Python function
+ return self.func(*args, **kwargs)
 
  def is_builtin(self):
  return self.func is None
@@ -316,7 +243,7 @@ class Function:
  # todo: construct a default value for each of the functions args
  # so we can generate the return type for overloaded functions
  return_type = type_str(self.value_func(None, None, None))
- except:
+ except Exception:
  return False
 
  if return_type.startswith("Tuple"):
@@ -409,10 +336,187 @@ class Function:
  return None
 
  def __repr__(self):
- inputs_str = ", ".join([f"{k}: {v.__name__}" for k, v in self.input_types.items()])
+ inputs_str = ", ".join([f"{k}: {warp.types.type_repr(v)}" for k, v in self.input_types.items()])
  return f"<Function {self.key}({inputs_str})>"
 
 
+ def call_builtin(func: Function, *params) -> Tuple[bool, Any]:
+ uses_non_warp_array_type = False
+
+ # Retrieve the built-in function from Warp's dll.
+ c_func = getattr(warp.context.runtime.core, func.mangled_name)
+
+ # Try gathering the parameters that the function expects and pack them
+ # into their corresponding C types.
+ c_params = []
+ for i, (_, arg_type) in enumerate(func.input_types.items()):
+ param = params[i]
+
+ try:
+ iter(param)
+ except TypeError:
+ is_array = False
+ else:
+ is_array = True
+
+ if is_array:
+ if not issubclass(arg_type, ctypes.Array):
+ return (False, None)
+
+ # The argument expects a built-in Warp type like a vector or a matrix.
+
+ c_param = None
+
+ if isinstance(param, ctypes.Array):
+ # The given parameter is also a built-in Warp type, so we only need
+ # to make sure that it matches with the argument.
+ if not warp.types.types_equal(type(param), arg_type):
+ return (False, None)
+
+ if isinstance(param, arg_type):
+ c_param = param
+ else:
+ # Cast the value to its argument type to make sure that it
+ # can be assigned to the field of the `Param` struct.
+ # This could error otherwise when, for example, the field type
+ # is set to `vec3i` while the value is of type `vector(length=3, dtype=int)`,
+ # even though both types are semantically identical.
+ c_param = arg_type(param)
+ else:
+ # Flatten the parameter values into a flat 1-D array.
+ arr = []
+ ndim = 1
+ stack = [(0, param)]
+ while stack:
+ depth, elem = stack.pop(0)
+ try:
+ # If `elem` is a sequence, then it should be possible
+ # to add its elements to the stack for later processing.
+ stack.extend((depth + 1, x) for x in elem)
+ except TypeError:
+ # Since `elem` doesn't seem to be a sequence,
+ # we must have a leaf value that we need to add to our
+ # resulting array.
+ arr.append(elem)
+ ndim = max(depth, ndim)
+
+ assert ndim > 0
+
+ # Ensure that if the given parameter value is, say, a 2-D array,
+ # then we try to resolve it against a matrix argument rather than
+ # a vector.
+ if ndim > len(arg_type._shape_):
+ return (False, None)
+
+ elem_count = len(arr)
+ if elem_count != arg_type._length_:
+ return (False, None)
+
+ # Retrieve the element type of the sequence while ensuring
+ # that it's homogeneous.
+ elem_type = type(arr[0])
+ for i in range(1, elem_count):
+ if type(arr[i]) is not elem_type:
+ raise ValueError("All array elements must share the same type.")
+
+ expected_elem_type = arg_type._wp_scalar_type_
+ if not (
+ elem_type is expected_elem_type
+ or (elem_type is float and expected_elem_type is warp.types.float32)
+ or (elem_type is int and expected_elem_type is warp.types.int32)
+ or (
+ issubclass(elem_type, np.number)
+ and warp.types.np_dtype_to_warp_type[np.dtype(elem_type)] is expected_elem_type
+ )
+ ):
+ # The parameter value has a type not matching the type defined
+ # for the corresponding argument.
+ return (False, None)
+
+ if elem_type in warp.types.int_types:
+ # Pass the value through the expected integer type
+ # in order to evaluate any integer wrapping.
+ # For example `uint8(-1)` should result in the value `-255`.
+ arr = tuple(elem_type._type_(x.value).value for x in arr)
+ elif elem_type in warp.types.float_types:
+ # Extract the floating-point values.
+ arr = tuple(x.value for x in arr)
+
+ c_param = arg_type()
+ if warp.types.type_is_matrix(arg_type):
+ rows, cols = arg_type._shape_
+ for i in range(rows):
+ idx_start = i * cols
+ idx_end = idx_start + cols
+ c_param[i] = arr[idx_start:idx_end]
+ else:
+ c_param[:] = arr
+
+ uses_non_warp_array_type = True
+
+ c_params.append(ctypes.byref(c_param))
+ else:
+ if issubclass(arg_type, ctypes.Array):
+ return (False, None)
+
+ if not (
+ isinstance(param, arg_type)
+ or (type(param) is float and arg_type is warp.types.float32)
+ or (type(param) is int and arg_type is warp.types.int32)
+ or warp.types.np_dtype_to_warp_type.get(getattr(param, "dtype", None)) is arg_type
+ ):
+ return (False, None)
+
+ if type(param) in warp.types.scalar_types:
+ param = param.value
+
+ # try to pack as a scalar type
+ if arg_type == warp.types.float16:
+ c_params.append(arg_type._type_(warp.types.float_to_half_bits(param)))
+ else:
+ c_params.append(arg_type._type_(param))
+
+ # returns the corresponding ctype for a scalar or vector warp type
+ value_type = func.value_func(None, None, None)
+ if value_type == float:
+ value_ctype = ctypes.c_float
+ elif value_type == int:
+ value_ctype = ctypes.c_int32
+ elif issubclass(value_type, (ctypes.Array, ctypes.Structure)):
+ value_ctype = value_type
+ else:
+ # scalar type
+ value_ctype = value_type._type_
+
+ # construct return value (passed by address)
+ ret = value_ctype()
+ ret_addr = ctypes.c_void_p(ctypes.addressof(ret))
+ c_params.append(ret_addr)
+
+ # Call the built-in function from Warp's dll.
+ c_func(*c_params)
+
+ if uses_non_warp_array_type:
+ warp.utils.warn(
+ "Support for built-in functions called with non-Warp array types, "
+ "such as lists, tuples, NumPy arrays, and others, will be dropped "
+ "in the future. Use a Warp type such as `wp.vec`, `wp.mat`, "
+ "`wp.quat`, or `wp.transform`.",
+ DeprecationWarning,
+ stacklevel=3,
+ )
+
+ if issubclass(value_ctype, ctypes.Array) or issubclass(value_ctype, ctypes.Structure):
+ # return vector types as ctypes
+ return (True, ret)
+
+ if value_type == warp.types.float16:
+ return (True, warp.types.half_bits_to_float(ret.value))
+
+ # return scalar types as int/float
+ return (True, ret.value)
+
+
  class KernelHooks:
  def __init__(self, forward, backward):
  self.forward = forward
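
As a quick illustration of the new resolution path: exported builtins remain directly callable from the Python interpreter, with call_builtin() now doing the per-overload argument packing. A minimal sketch, assuming wp.init() has been called and that length and dot are exported builtins (as in current Warp releases); printed values are approximate:

    import warp as wp

    wp.init()

    v = wp.vec3(1.0, 2.0, 3.0)
    print(wp.length(v))   # ~3.742 (sqrt(14))
    print(wp.dot(v, v))   # 14.0

    # Plain sequences are still accepted for vector/matrix arguments, but per the
    # code above they now trigger a DeprecationWarning asking for Warp types
    # such as wp.vec3 instead.
    print(wp.length((1.0, 2.0, 3.0)))
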
@@ -421,10 +525,20 @@ class KernelHooks:
 
  # caches source and compiled entry points for a kernel (will be populated after module loads)
  class Kernel:
- def __init__(self, func, key, module, options=None, code_transformers=[]):
+ def __init__(self, func, key=None, module=None, options=None, code_transformers=[]):
  self.func = func
- self.module = module
- self.key = key
+
+ if module is None:
+ self.module = get_module(func.__module__)
+ else:
+ self.module = module
+
+ if key is None:
+ unique_key = self.module.generate_unique_kernel_key(func.__name__)
+ self.key = unique_key
+ else:
+ self.key = key
+
  self.options = {} if options is None else options
 
  self.adj = warp.codegen.Adjoint(func, transformers=code_transformers)
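
With key and module now optional, a Kernel can be built straight from an annotated Python function: the module is inferred from func.__module__ and a unique key is generated automatically. A sketch of what this allows (illustrative only; the supported path remains the @wp.kernel decorator):

    import warp as wp

    wp.init()

    def scale(a: wp.array(dtype=float), s: float):
        tid = wp.tid()
        a[tid] = a[tid] * s

    k = wp.context.Kernel(scale)  # key and module arguments omitted

    a = wp.zeros(8, dtype=float)
    wp.launch(k, dim=8, inputs=[a, 2.0])
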
@@ -445,8 +559,8 @@ class Kernel:
  # argument indices by name
  self.arg_indices = dict((a.label, i) for i, a in enumerate(self.adj.args))
 
- if module:
- module.register_kernel(self)
+ if self.module:
+ self.module.register_kernel(self)
 
  def infer_argument_types(self, args):
  template_types = list(self.adj.arg_types.values())
@@ -523,7 +637,7 @@ def func(f):
  name = warp.codegen.make_full_qualified_name(f)
 
  m = get_module(f.__module__)
- func = Function(
+ Function(
  func=f, key=name, namespace="", module=m, value_func=None
  ) # value_type not known yet, will be inferred during Adjoint.build()
 
@@ -531,6 +645,167 @@ def func(f):
  return m.functions[name]
 
 
+ def func_native(snippet, adj_snippet=None):
+ """
+ Decorator to register native code snippet, @func_native
+ """
+
+ def snippet_func(f):
+ name = warp.codegen.make_full_qualified_name(f)
+
+ m = get_module(f.__module__)
+ func = Function(
+ func=f, key=name, namespace="", module=m, native_snippet=snippet, adj_native_snippet=adj_snippet
+ ) # cuda snippets do not have a return value_type
+
+ return m.functions[name]
+
+ return snippet_func
+
+
+ def func_grad(forward_fn):
+ """
+ Decorator to register a custom gradient function for a given forward function.
+ The function signature must correspond to one of the function overloads in the following way:
+ the first part of the input arguments are the original input variables with the same types as their
+ corresponding arguments in the original function, and the second part of the input arguments are the
+ adjoint variables of the output variables (if available) of the original function with the same types as the
+ output variables. The function must not return anything.
+ """
+
+ def wrapper(grad_fn):
+ generic = any(warp.types.type_is_generic(x) for x in forward_fn.input_types.values())
+ if generic:
+ raise RuntimeError(
+ f"Cannot define custom grad definition for {forward_fn.key} since functions with generic input arguments are not yet supported."
+ )
+
+ reverse_args = {}
+ reverse_args.update(forward_fn.input_types)
+
+ # create temporary Adjoint instance to analyze the function signature
+ adj = warp.codegen.Adjoint(
+ grad_fn, skip_forward_codegen=True, skip_reverse_codegen=False, transformers=forward_fn.adj.transformers
+ )
+
+ from warp.types import types_equal
+
+ grad_args = adj.args
+ grad_sig = warp.types.get_signature([arg.type for arg in grad_args], func_name=forward_fn.key)
+
+ generic = any(warp.types.type_is_generic(x.type) for x in grad_args)
+ if generic:
+ raise RuntimeError(
+ f"Cannot define custom grad definition for {forward_fn.key} since the provided grad function has generic input arguments."
+ )
+
+ def match_function(f):
+ # check whether the function overload f matches the signature of the provided gradient function
+ if not hasattr(f.adj, "return_var"):
+ f.adj.build(None)
+ expected_args = list(f.input_types.items())
+ if f.adj.return_var is not None:
+ expected_args += [(f"adj_ret_{var.label}", var.type) for var in f.adj.return_var]
+ if len(grad_args) != len(expected_args):
+ return False
+ if any(not types_equal(a.type, exp_type) for a, (_, exp_type) in zip(grad_args, expected_args)):
+ return False
+ return True
+
+ def add_custom_grad(f: Function):
+ # register custom gradient function
+ f.custom_grad_func = Function(
+ grad_fn,
+ key=f.key,
+ namespace=f.namespace,
+ input_types=reverse_args,
+ value_func=None,
+ module=f.module,
+ template_func=f.template_func,
+ skip_forward_codegen=True,
+ custom_reverse_mode=True,
+ custom_reverse_num_input_args=len(f.input_types),
+ skip_adding_overload=False,
+ code_transformers=f.adj.transformers,
+ )
+ f.adj.skip_reverse_codegen = True
+
+ if hasattr(forward_fn, "user_overloads") and len(forward_fn.user_overloads):
+ # find matching overload for which this grad function is defined
+ for sig, f in forward_fn.user_overloads.items():
+ if not grad_sig.startswith(sig):
+ continue
+ if match_function(f):
+ add_custom_grad(f)
+ return
+ raise RuntimeError(
+ f"No function overload found for gradient function {grad_fn.__qualname__} for function {forward_fn.key}"
+ )
+ else:
+ # resolve return variables
+ forward_fn.adj.build(None)
+
+ expected_args = list(forward_fn.input_types.items())
+ if forward_fn.adj.return_var is not None:
+ expected_args += [(f"adj_ret_{var.label}", var.type) for var in forward_fn.adj.return_var]
+
+ # check if the signature matches this function
+ if match_function(forward_fn):
+ add_custom_grad(forward_fn)
+ else:
+ raise RuntimeError(
+ f"Gradient function {grad_fn.__qualname__} for function {forward_fn.key} has an incorrect signature. The arguments must match the "
+ "forward function arguments plus the adjoint variables corresponding to the return variables:"
+ f"\n{', '.join(map(lambda nt: f'{nt[0]}: {nt[1].__name__}', expected_args))}"
+ )
+
+ return wrapper
+
+
+ def func_replay(forward_fn):
+ """
+ Decorator to register a custom replay function for a given forward function.
+ The replay function is the function version that is called in the forward phase of the backward pass (replay mode) and corresponds to the forward function by default.
+ The provided function has to match the signature of one of the original forward function overloads.
+ """
+
+ def wrapper(replay_fn):
+ generic = any(warp.types.type_is_generic(x) for x in forward_fn.input_types.values())
+ if generic:
+ raise RuntimeError(
+ f"Cannot define custom replay definition for {forward_fn.key} since functions with generic input arguments are not yet supported."
+ )
+
+ args = get_function_args(replay_fn)
+ arg_types = list(args.values())
+ generic = any(warp.types.type_is_generic(x) for x in arg_types)
+ if generic:
+ raise RuntimeError(
+ f"Cannot define custom replay definition for {forward_fn.key} since the provided replay function has generic input arguments."
+ )
+
+ f = forward_fn.get_overload(arg_types)
+ if f is None:
+ inputs_str = ", ".join([f"{k}: {v.__name__}" for k, v in args.items()])
+ raise RuntimeError(
+ f"Could not find forward definition of function {forward_fn.key} that matches custom replay definition with arguments:\n{inputs_str}"
+ )
+ f.custom_replay_func = Function(
+ replay_fn,
+ key=f"replay_{f.key}",
+ namespace=f.namespace,
+ input_types=f.input_types,
+ value_func=f.value_func,
+ module=f.module,
+ template_func=f.template_func,
+ skip_reverse_codegen=True,
+ skip_adding_overload=True,
+ code_transformers=f.adj.transformers,
+ )
+
+ return wrapper
+
+
  # decorator to register kernel, @kernel, custom_name may be a string
  # that creates a kernel with a different name from the actual function
  def kernel(f=None, *, enable_backward=None):
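
These decorators back the new user-facing hooks for differentiable and native code. Assuming they are re-exported at package level as wp.func_grad, wp.func_replay, and wp.func_native, a minimal custom-gradient and replay sketch, using the wp.adjoint accumulation idiom from the Warp documentation:

    import warp as wp

    wp.init()

    @wp.func
    def safe_sqrt(x: float) -> float:
        return wp.sqrt(x)

    # Custom gradient: arguments are the forward inputs followed by the adjoints
    # of the forward outputs; nothing is returned, gradients are accumulated
    # into wp.adjoint[...] instead.
    @wp.func_grad(safe_sqrt)
    def adj_safe_sqrt(x: float, adj_ret: float):
        if x > 0.0:
            wp.adjoint[x] += adj_ret / (2.0 * wp.sqrt(x))

    # Custom replay: must match a forward overload's signature; it is substituted
    # for the forward function when the tape re-runs the forward pass.
    @wp.func_replay(safe_sqrt)
    def replay_safe_sqrt(x: float) -> float:
        return wp.sqrt(x)

For native snippets, the Python body is only a signature stub; the string is spliced into the generated source by codegen_snippet(), with arguments visible by name inside the snippet. A sketch following the snippet style from the Warp documentation (names here are illustrative):

    snippet = "out[tid] = a * x[tid] + y[tid];"

    @wp.func_native(snippet)
    def saxpy(a: float, x: wp.array(dtype=float), y: wp.array(dtype=float),
              out: wp.array(dtype=float), tid: int):
        ...

    @wp.kernel
    def run_saxpy(a: float, x: wp.array(dtype=float), y: wp.array(dtype=float),
                  out: wp.array(dtype=float)):
        tid = wp.tid()
        saxpy(a, x, y, out, tid)
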
@@ -658,6 +933,7 @@ def add_builtin(
  missing_grad=False,
  native_func=None,
  defaults=None,
+ require_original_output_arg=False,
  ):
  # wrap simple single-type functions with a value_func()
  if value_func is None:
@@ -670,7 +946,7 @@
  def initializer_list_func(args, templates):
  return False
 
- if defaults == None:
+ if defaults is None:
  defaults = {}
 
  # Add specialized versions of this builtin if it's generic by matching arguments against
@@ -751,8 +1027,8 @@
  # on the generated argument list and skip generation if it fails.
  # This also gives us the return type, which we keep for later:
  try:
- return_type = value_func([warp.codegen.Var("", t) for t in argtypes], {}, [])
- except Exception as e:
+ return_type = value_func(argtypes, {}, [])
+ except Exception:
  continue
 
  # The return_type might just be vector_t(length=3,dtype=wp.float32), so we've got to match that
@@ -782,6 +1058,7 @@
  hidden=True,
  skip_replay=skip_replay,
  missing_grad=missing_grad,
+ require_original_output_arg=require_original_output_arg,
  )
 
  func = Function(
@@ -802,6 +1079,7 @@
  generic=generic,
  native_func=native_func,
  defaults=defaults,
+ require_original_output_arg=require_original_output_arg,
  )
 
  if key in builtin_functions:
@@ -811,7 +1089,7 @@
  # export means the function will be added to the `warp` module namespace
  # so that users can call it directly from the Python interpreter
- if export == True:
+ if export:
  if hasattr(warp, key):
  # check that we haven't already created something at this location
  # if it's just an overload stub for auto-complete then overwrite it
  # if it's just an overload stub for auto-complete then overwrite it
@@ -878,6 +1156,8 @@ class ModuleBuilder:
  for func in module.functions.values():
  for f in func.user_overloads.values():
  self.build_function(f)
+ if f.custom_replay_func is not None:
+ self.build_function(f.custom_replay_func)
 
  # build all kernel entry points
  for kernel in module.kernels.values():
@@ -894,8 +1174,7 @@
  while stack:
  s = stack.pop()
 
- if not s in structs:
- structs.append(s)
+ structs.append(s)
 
  for var in s.vars.values():
  if isinstance(var.type, warp.codegen.Struct):
@@ -927,7 +1206,7 @@
  if not func.value_func:
 
  def wrap(adj):
- def value_type(args, kwds, templates):
+ def value_type(arg_types, kwds, templates):
  if adj.return_var is None or len(adj.return_var) == 0:
  return None
  if len(adj.return_var) == 1:
@@ -951,7 +1230,14 @@
 
  # code-gen all imported functions
  for func in self.functions.keys():
- source += warp.codegen.codegen_func(func.adj, name=func.key, device=device, options=self.options)
+ if func.native_snippet is None:
+ source += warp.codegen.codegen_func(
+ func.adj, c_func_name=func.native_func, device=device, options=self.options
+ )
+ else:
+ source += warp.codegen.codegen_snippet(
+ func.adj, name=func.key, snippet=func.native_snippet, adj_snippet=func.adj_native_snippet
+ )
 
  for kernel in self.module.kernels.values():
  # each kernel gets an entry point in the module
@@ -1031,6 +1317,10 @@ class Module:
 
  self.content_hash = None
 
+ # number of times module auto-generates kernel key for user
+ # used to ensure unique kernel keys
+ self.count = 0
+
  def register_struct(self, struct):
  self.structs[struct.key] = struct
 
@@ -1045,7 +1335,7 @@
  # for a reload of module on next launch
  self.unload()
 
- def register_function(self, func):
+ def register_function(self, func, skip_adding_overload=False):
  if func.key not in self.functions:
  self.functions[func.key] = func
  else:
@@ -1065,7 +1355,7 @@
  )
  if sig == sig_existing:
  self.functions[func.key] = func
- else:
+ elif not skip_adding_overload:
  func_existing.add_overload(func)
 
  self.find_references(func.adj)
@@ -1073,6 +1363,11 @@
  # for a reload of module on next launch
  self.unload()
 
+ def generate_unique_kernel_key(self, key):
+ unique_key = f"{key}_{self.count}"
+ self.count += 1
+ return unique_key
+
  # collect all referenced functions / structs
  # given the AST of a function or kernel
  def find_references(self, adj):
@@ -1086,13 +1381,13 @@
  if isinstance(node, ast.Call):
  try:
  # try to resolve the function
- func, _ = adj.resolve_path(node.func)
+ func, _ = adj.resolve_static_expression(node.func, eval_types=False)
 
  # if this is a user-defined function, add a module reference
  if isinstance(func, warp.context.Function) and func.module is not None:
  add_ref(func.module)
 
- except:
+ except Exception:
  # Lookups may fail for builtins, but that's ok.
  # Lookups may also fail for functions in this module that haven't been imported yet,
  # and that's ok too (not an external reference).
@@ -1139,9 +1434,24 @@
  s = func.adj.source
  ch.update(bytes(s, "utf-8"))
 
+ if func.custom_grad_func:
+ s = func.custom_grad_func.adj.source
+ ch.update(bytes(s, "utf-8"))
+ if func.custom_replay_func:
+ s = func.custom_replay_func.adj.source
+
+ # cache func arg types
+ for arg, arg_type in func.adj.arg_types.items():
+ s = f"{arg}: {get_type_name(arg_type)}"
+ ch.update(bytes(s, "utf-8"))
+
  # kernel source
  for kernel in module.kernels.values():
  ch.update(bytes(kernel.adj.source, "utf-8"))
+ # cache kernel arg types
+ for arg, arg_type in kernel.adj.arg_types.items():
+ s = f"{arg}: {get_type_name(arg_type)}"
+ ch.update(bytes(s, "utf-8"))
  # for generic kernels the Python source is always the same,
  # but we hash the type signatures of all the overloads
  if kernel.is_generic:
@@ -1440,13 +1750,13 @@ class ContextGuard:
  def __enter__(self):
  if self.device.is_cuda:
  runtime.core.cuda_context_push_current(self.device.context)
- elif is_cuda_available():
+ elif is_cuda_driver_initialized():
  self.saved_context = runtime.core.cuda_context_get_current()
 
  def __exit__(self, exc_type, exc_value, traceback):
  if self.device.is_cuda:
  runtime.core.cuda_context_pop_current()
- elif is_cuda_available():
+ elif is_cuda_driver_initialized():
  runtime.core.cuda_context_set_current(self.saved_context)
 
 
@@ -1537,6 +1847,29 @@
 
 
  class Device:
+ """A device to allocate Warp arrays and to launch kernels on.
+
+ Attributes:
+ ordinal: A Warp-specific integer label for the device. ``-1`` for CPU devices.
+ name: A string label for the device. By default, CPU devices will be named according to the processor name,
+ or ``"CPU"`` if the processor name cannot be determined.
+ arch: An integer representing the compute capability version number calculated as
+ ``10 * major + minor``. ``0`` for CPU devices.
+ is_uva: A boolean indicating whether or not the device supports unified addressing.
+ ``False`` for CPU devices.
+ is_cubin_supported: A boolean indicating whether or not Warp's version of NVRTC can directly
+ generate CUDA binary files (cubin) for this device's architecture. ``False`` for CPU devices.
+ is_mempool_supported: A boolean indicating whether or not the device supports using the
+ ``cuMemAllocAsync`` and ``cuMemPool`` family of APIs for stream-ordered memory allocations. ``False`` for
+ CPU devices.
+ is_primary: A boolean indicating whether or not this device's CUDA context is also the
+ device's primary context.
+ uuid: A string representing the UUID of the CUDA device. The UUID is in the same format used by
+ ``nvidia-smi -L``. ``None`` for CPU devices.
+ pci_bus_id: A string identifier for the CUDA device in the format ``[domain]:[bus]:[device]``, in which
+ ``domain``, ``bus``, and ``device`` are all hexadecimal values. ``None`` for CPU devices.
+ """
+
  def __init__(self, runtime, alias, ordinal=-1, is_primary=False, context=None):
  self.runtime = runtime
  self.alias = alias
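
The newly documented Device attributes can be inspected directly; a small sketch, assuming a CUDA-capable machine (on CPU devices uuid and pci_bus_id are None and arch is 0):

    import warp as wp

    wp.init()

    d = wp.get_device("cuda:0")
    print(d.arch)                  # e.g. 86 for compute capability 8.6 (10 * major + minor)
    print(d.uuid)                  # "GPU-..." in the same format as `nvidia-smi -L`
    print(d.pci_bus_id)            # "[domain]:[bus]:[device]" as hexadecimal values
    print(d.is_mempool_supported)  # stream-ordered memory allocator support
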
@@ -1566,6 +1899,9 @@ class Device:
1566
1899
  self.arch = 0
1567
1900
  self.is_uva = False
1568
1901
  self.is_cubin_supported = False
1902
+ self.is_mempool_supported = False
1903
+ self.uuid = None
1904
+ self.pci_bus_id = None
1569
1905
 
1570
1906
  # TODO: add more device-specific dispatch functions
1571
1907
  self.memset = runtime.core.memset_host
@@ -1578,6 +1914,26 @@ class Device:
1578
1914
  self.is_uva = runtime.core.cuda_device_is_uva(ordinal)
1579
1915
  # check whether our NVRTC can generate CUBINs for this architecture
1580
1916
  self.is_cubin_supported = self.arch in runtime.nvrtc_supported_archs
1917
+ self.is_mempool_supported = runtime.core.cuda_device_is_memory_pool_supported(ordinal)
1918
+
1919
+ uuid_buffer = (ctypes.c_char * 16)()
1920
+ runtime.core.cuda_device_get_uuid(ordinal, uuid_buffer)
1921
+ uuid_byte_str = bytes(uuid_buffer).hex()
1922
+ self.uuid = f"GPU-{uuid_byte_str[0:8]}-{uuid_byte_str[8:12]}-{uuid_byte_str[12:16]}-{uuid_byte_str[16:20]}-{uuid_byte_str[20:]}"
1923
+
1924
+ pci_domain_id = runtime.core.cuda_device_get_pci_domain_id(ordinal)
1925
+ pci_bus_id = runtime.core.cuda_device_get_pci_bus_id(ordinal)
1926
+ pci_device_id = runtime.core.cuda_device_get_pci_device_id(ordinal)
1927
+ # This is (mis)named to correspond to the naming of cudaDeviceGetPCIBusId
1928
+ self.pci_bus_id = f"{pci_domain_id:08X}:{pci_bus_id:02X}:{pci_device_id:02X}"
1929
+
1930
+ # Warn the user of a possible misconfiguration of their system
1931
+ if not self.is_mempool_supported:
1932
+ warp.utils.warn(
1933
+ f"Support for stream ordered memory allocators was not detected on device {ordinal}. "
1934
+ "This can prevent the use of graphs and/or result in poor performance. "
1935
+ "Is the UVM driver enabled?"
1936
+ )
1581
1937
 
1582
1938
  # initialize streams unless context acquisition is postponed
1583
1939
  if self._context is not None:
@@ -1601,14 +1957,17 @@ class Device:
1601
1957
 
1602
1958
  @property
1603
1959
  def is_cpu(self):
1960
+ """A boolean indicating whether or not the device is a CPU device."""
1604
1961
  return self.ordinal < 0
1605
1962
 
1606
1963
  @property
1607
1964
  def is_cuda(self):
1965
+ """A boolean indicating whether or not the device is a CUDA device."""
1608
1966
  return self.ordinal >= 0
1609
1967
 
1610
1968
  @property
1611
1969
  def context(self):
1970
+ """The context associated with the device."""
1612
1971
  if self._context is not None:
1613
1972
  return self._context
1614
1973
  elif self.is_primary:
@@ -1623,10 +1982,16 @@ class Device:
1623
1982
 
1624
1983
  @property
1625
1984
  def has_context(self):
1985
+ """A boolean indicating whether or not the device has a CUDA context associated with it."""
1626
1986
  return self._context is not None
1627
1987
 
1628
1988
  @property
1629
1989
  def stream(self):
1990
+ """The stream associated with a CUDA device.
1991
+
1992
+ Raises:
1993
+ RuntimeError: The device is not a CUDA device.
1994
+ """
1630
1995
  if self.context:
1631
1996
  return self._stream
1632
1997
  else:
@@ -1644,6 +2009,7 @@ class Device:
1644
2009
 
1645
2010
  @property
1646
2011
  def has_stream(self):
2012
+ """A boolean indicating whether or not the device has a stream associated with it."""
1647
2013
  return self._stream is not None
1648
2014
 
1649
2015
  def __str__(self):
@@ -1721,7 +2087,7 @@ class Runtime:
1721
2087
 
1722
2088
  self.core = self.load_dll(warp_lib)
1723
2089
 
1724
- if llvm_lib and os.path.exists(llvm_lib):
2090
+ if os.path.exists(llvm_lib):
1725
2091
  self.llvm = self.load_dll(llvm_lib)
1726
2092
  # setup c-types for warp-clang.dll
1727
2093
  self.llvm.lookup.restype = ctypes.c_uint64
@@ -2087,6 +2453,8 @@ class Runtime:
2087
2453
  self.core.cuda_driver_version.restype = ctypes.c_int
2088
2454
  self.core.cuda_toolkit_version.argtypes = None
2089
2455
  self.core.cuda_toolkit_version.restype = ctypes.c_int
2456
+ self.core.cuda_driver_is_initialized.argtypes = None
2457
+ self.core.cuda_driver_is_initialized.restype = ctypes.c_bool
2090
2458
 
2091
2459
  self.core.nvrtc_supported_arch_count.argtypes = None
2092
2460
  self.core.nvrtc_supported_arch_count.restype = ctypes.c_int
@@ -2103,6 +2471,14 @@ class Runtime:
2103
2471
  self.core.cuda_device_get_arch.restype = ctypes.c_int
2104
2472
  self.core.cuda_device_is_uva.argtypes = [ctypes.c_int]
2105
2473
  self.core.cuda_device_is_uva.restype = ctypes.c_int
2474
+ self.core.cuda_device_get_uuid.argtypes = [ctypes.c_int, ctypes.c_char * 16]
2475
+ self.core.cuda_device_get_uuid.restype = None
2476
+ self.core.cuda_device_get_pci_domain_id.argtypes = [ctypes.c_int]
2477
+ self.core.cuda_device_get_pci_domain_id.restype = ctypes.c_int
2478
+ self.core.cuda_device_get_pci_bus_id.argtypes = [ctypes.c_int]
2479
+ self.core.cuda_device_get_pci_bus_id.restype = ctypes.c_int
2480
+ self.core.cuda_device_get_pci_device_id.argtypes = [ctypes.c_int]
2481
+ self.core.cuda_device_get_pci_device_id.restype = ctypes.c_int
2106
2482
 
2107
2483
  self.core.cuda_context_get_current.argtypes = None
2108
2484
  self.core.cuda_context_get_current.restype = ctypes.c_void_p
@@ -2189,6 +2565,7 @@ class Runtime:
2189
2565
  ctypes.c_void_p,
2190
2566
  ctypes.c_void_p,
2191
2567
  ctypes.c_size_t,
2568
+ ctypes.c_int,
2192
2569
  ctypes.POINTER(ctypes.c_void_p),
2193
2570
  ]
2194
2571
  self.core.cuda_launch_kernel.restype = ctypes.c_size_t
@@ -2309,8 +2686,15 @@ class Runtime:
2309
2686
  dll = ctypes.CDLL(dll_path, winmode=0)
2310
2687
  else:
2311
2688
  dll = ctypes.CDLL(dll_path)
2312
- except OSError:
2313
- raise RuntimeError(f"Failed to load the shared library '{dll_path}'")
2689
+ except OSError as e:
2690
+ if "GLIBCXX" in str(e):
2691
+ raise RuntimeError(
2692
+ f"Failed to load the shared library '{dll_path}'.\n"
2693
+ "The execution environment's libstdc++ runtime is older than the version the Warp library was built for.\n"
2694
+ "See https://nvidia.github.io/warp/_build/html/installation.html#conda-environments for details."
2695
+ ) from e
2696
+ else:
2697
+ raise RuntimeError(f"Failed to load the shared library '{dll_path}'") from e
2314
2698
  return dll
2315
2699
 
2316
2700
  def get_device(self, ident: Devicelike = None) -> Device:
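The GLIBCXX branch above only changes the message attached to the RuntimeError, so caller-side handling stays the same; a hedged sketch of surfacing it at initialization time:

    import warp as wp

    try:
        wp.init()
    except RuntimeError as e:
        # on conda environments with an outdated libstdc++ the message now
        # points at the installation docs instead of a bare load failure
        print(e)
        raise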
@@ -2439,6 +2823,21 @@ def is_device_available(device):
2439
2823
  return device in get_devices()
2440
2824
 
2441
2825
 
2826
+ def is_cuda_driver_initialized() -> bool:
2827
+ """Returns ``True`` if the CUDA driver is initialized.
2828
+
2829
+ This is a stricter test than ``is_cuda_available()`` since a CUDA driver
2830
+ call to ``cuCtxGetCurrent`` is made, and the result is compared to
2831
+ ``CUDA_SUCCESS``. Note that ``CUDA_SUCCESS`` is returned by ``cuCtxGetCurrent``
2832
+ even if there is no context bound to the calling CPU thread.
2833
+
2834
+ This can be helpful in cases in which ``cuInit()`` was called before a fork.
2835
+ """
2836
+ assert_initialized()
2837
+
2838
+ return runtime.core.cuda_driver_is_initialized()
2839
+
2840
+
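A minimal sketch contrasting the new check with ``is_cuda_available()``, e.g. in a process that inherited a CUDA context across a fork (assuming the helper is exported at the package level; otherwise it is reachable as warp.context.is_cuda_driver_initialized):

    import warp as wp

    wp.init()
    if wp.is_cuda_available() and not wp.is_cuda_driver_initialized():
        # cuInit() likely happened in the parent before fork(); avoid CUDA work here
        print("CUDA devices enumerated, but the driver is not usable in this process")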
2442
2841
  def get_devices() -> List[Device]:
2443
2842
  """Returns a list of devices supported in this environment."""
2444
2843
 
@@ -2749,7 +3148,7 @@ def full(
2749
3148
  elif na.ndim == 2:
2750
3149
  dtype = warp.types.matrix(na.shape, scalar_type)
2751
3150
  else:
2752
- raise ValueError(f"Values with more than two dimensions are not supported")
3151
+ raise ValueError("Values with more than two dimensions are not supported")
2753
3152
  else:
2754
3153
  raise ValueError(f"Invalid value type for Warp array: {value_type}")
2755
3154
 
@@ -2872,8 +3271,34 @@ def empty_like(
2872
3271
  return arr
2873
3272
 
2874
3273
 
2875
- def from_numpy(arr, dtype, device: Devicelike = None, requires_grad=False):
2876
- return warp.array(data=arr, dtype=dtype, device=device, requires_grad=requires_grad)
3274
+ def from_numpy(
3275
+ arr: np.ndarray,
3276
+ dtype: Optional[type] = None,
3277
+ shape: Optional[Sequence[int]] = None,
3278
+ device: Optional[Devicelike] = None,
3279
+ requires_grad: bool = False,
3280
+ ) -> warp.array:
3281
+ if dtype is None:
3282
+ base_type = warp.types.np_dtype_to_warp_type.get(arr.dtype)
3283
+ if base_type is None:
3284
+ raise RuntimeError("Unsupported NumPy data type '{}'.".format(arr.dtype))
3285
+
3286
+ dim_count = len(arr.shape)
3287
+ if dim_count == 2:
3288
+ dtype = warp.types.vector(length=arr.shape[1], dtype=base_type)
3289
+ elif dim_count == 3:
3290
+ dtype = warp.types.matrix(shape=(arr.shape[1], arr.shape[2]), dtype=base_type)
3291
+ else:
3292
+ dtype = base_type
3293
+
3294
+ return warp.array(
3295
+ data=arr,
3296
+ dtype=dtype,
3297
+ shape=shape,
3298
+ owner=False,
3299
+ device=device,
3300
+ requires_grad=requires_grad,
3301
+ )
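With the new signature, the dtype can be inferred from the NumPy array's shape: a 2-D array maps to vectors and a 3-D array to matrices. A minimal sketch (assuming the helper is exported as wp.from_numpy):

    import numpy as np
    import warp as wp

    wp.init()
    points = np.random.rand(1024, 3).astype(np.float32)
    a = wp.from_numpy(points)    # dtype inferred as a length-3 float32 vector
    print(a.dtype, a.shape)      # -> vec3f-like type, (1024,)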
2877
3302
 
2878
3303
 
2879
3304
  # given a kernel destination argument type and a value convert
@@ -2889,9 +3314,9 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
2889
3314
  # - in forward passes, array types have to match
2890
3315
  # - in backward passes, indexed array gradients are regular arrays
2891
3316
  if adjoint:
2892
- array_matches = type(value) == warp.array
3317
+ array_matches = isinstance(value, warp.array)
2893
3318
  else:
2894
- array_matches = type(value) == type(arg_type)
3319
+ array_matches = type(value) is type(arg_type)
2895
3320
 
2896
3321
  if not array_matches:
2897
3322
  adj = "adjoint " if adjoint else ""
@@ -2934,7 +3359,7 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
2934
3359
  # try constructing the required value from the argument (handles tuple / list, Gf.Vec3 case)
2935
3360
  try:
2936
3361
  return arg_type(value)
2937
- except:
3362
+ except Exception:
2938
3363
  raise ValueError(f"Failed to convert argument for param {arg_name} to {type_str(arg_type)}")
2939
3364
 
2940
3365
  elif isinstance(value, bool):
@@ -2943,27 +3368,35 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
2943
3368
  elif isinstance(value, arg_type):
2944
3369
  try:
2945
3370
  # try to pack as a scalar type
2946
- return arg_type._type_(value.value)
2947
- except:
3371
+ if arg_type is warp.types.float16:
3372
+ return arg_type._type_(warp.types.float_to_half_bits(value.value))
3373
+ else:
3374
+ return arg_type._type_(value.value)
3375
+ except Exception:
2948
3376
  raise RuntimeError(
2949
- f"Error launching kernel, unable to pack kernel parameter type {type(value)} for param {arg_name}, expected {arg_type}"
3377
+ "Error launching kernel, unable to pack kernel parameter type "
3378
+ f"{type(value)} for param {arg_name}, expected {arg_type}"
2950
3379
  )
2951
3380
 
2952
3381
  else:
2953
3382
  try:
2954
3383
  # try to pack as a scalar type
2955
- return arg_type._type_(value)
3384
+ if arg_type is warp.types.float16:
3385
+ return arg_type._type_(warp.types.float_to_half_bits(value))
3386
+ else:
3387
+ return arg_type._type_(value)
2956
3388
  except Exception as e:
2957
3389
  print(e)
2958
3390
  raise RuntimeError(
2959
- f"Error launching kernel, unable to pack kernel parameter type {type(value)} for param {arg_name}, expected {arg_type}"
3391
+ "Error launching kernel, unable to pack kernel parameter type "
3392
+ f"{type(value)} for param {arg_name}, expected {arg_type}"
2960
3393
  )
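With the float16 branches above, a plain Python float passed for a wp.float16 parameter is converted to half bits before the launch. A small sketch (hypothetical kernel, assuming float16 support on the target device):

    import warp as wp

    wp.init()

    @wp.kernel
    def fill_half(a: wp.array(dtype=wp.float16), s: wp.float16):
        i = wp.tid()
        a[i] = s

    a = wp.zeros(8, dtype=wp.float16)
    wp.launch(fill_half, dim=8, inputs=[a, 0.5])   # 0.5 is packed via float_to_half_bits()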
2961
3394
 
2962
3395
 
2963
3396
  # represents all data required for a kernel launch
2964
3397
  # so that launches can be replayed quickly, use `wp.launch(..., record_cmd=True)`
2965
3398
  class Launch:
2966
- def __init__(self, kernel, device, hooks=None, params=None, params_addr=None, bounds=None):
3399
+ def __init__(self, kernel, device, hooks=None, params=None, params_addr=None, bounds=None, max_blocks=0):
2967
3400
  # if not specified look up hooks
2968
3401
  if not hooks:
2969
3402
  module = kernel.module
@@ -3000,6 +3433,7 @@ class Launch:
3000
3433
  self.params_addr = params_addr
3001
3434
  self.device = device
3002
3435
  self.bounds = bounds
3436
+ self.max_blocks = max_blocks
3003
3437
 
3004
3438
  def set_dim(self, dim):
3005
3439
  self.bounds = warp.types.launch_bounds_t(dim)
@@ -3065,7 +3499,9 @@ class Launch:
3065
3499
  if self.device.is_cpu:
3066
3500
  self.hooks.forward(*self.params)
3067
3501
  else:
3068
- runtime.core.cuda_launch_kernel(self.device.context, self.hooks.forward, self.bounds.size, self.params_addr)
3502
+ runtime.core.cuda_launch_kernel(
3503
+ self.device.context, self.hooks.forward, self.bounds.size, self.max_blocks, self.params_addr
3504
+ )
3069
3505
 
3070
3506
 
3071
3507
  def launch(
@@ -3080,6 +3516,7 @@ def launch(
3080
3516
  adjoint=False,
3081
3517
  record_tape=True,
3082
3518
  record_cmd=False,
3519
+ max_blocks=0,
3083
3520
  ):
3084
3521
  """Launch a Warp kernel on the target device
3085
3522
 
@@ -3097,6 +3534,8 @@ def launch(
3097
3534
  adjoint: Whether to run forward or backward pass (typically use False)
3098
3535
  record_tape: When True the launch will be recorded onto the global wp.Tape() object when present
3099
3536
  record_cmd: When True the launch will be returned as a ``Launch`` command object; the launch will not occur until the user calls ``cmd.launch()``
3537
+ max_blocks: The maximum number of CUDA thread blocks to use. Only has an effect for CUDA kernel launches.
3538
+ If negative or zero, the maximum hardware value will be used.
3100
3539
  """
3101
3540
 
3102
3541
  assert_initialized()
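A brief sketch of the new max_blocks argument, together with the Launch object returned by record_cmd=True (hypothetical kernel and sizes, assuming a CUDA device):

    import warp as wp

    wp.init()

    @wp.kernel
    def inc(a: wp.array(dtype=float)):
        i = wp.tid()
        a[i] = a[i] + 1.0

    a = wp.zeros(1 << 20, dtype=float, device="cuda:0")

    # limit the launch to at most 256 thread blocks (0 means "use the hardware maximum")
    wp.launch(inc, dim=a.shape[0], inputs=[a], max_blocks=256)

    # record once, then replay without re-packing arguments
    cmd = wp.launch(inc, dim=a.shape[0], inputs=[a], record_cmd=True)
    cmd.launch()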
@@ -3108,7 +3547,7 @@ def launch(
3108
3547
  device = runtime.get_device(device)
3109
3548
 
3110
3549
  # check function is a Kernel
3111
- if isinstance(kernel, Kernel) == False:
3550
+ if not isinstance(kernel, Kernel):
3112
3551
  raise RuntimeError("Error launching kernel, can only launch functions decorated with @wp.kernel.")
3113
3552
 
3114
3553
  # debugging aid
@@ -3190,7 +3629,9 @@ def launch(
3190
3629
  f"Failed to find backward kernel '{kernel.key}' from module '{kernel.module.name}' for device '{device}'"
3191
3630
  )
3192
3631
 
3193
- runtime.core.cuda_launch_kernel(device.context, hooks.backward, bounds.size, kernel_params)
3632
+ runtime.core.cuda_launch_kernel(
3633
+ device.context, hooks.backward, bounds.size, max_blocks, kernel_params
3634
+ )
3194
3635
 
3195
3636
  else:
3196
3637
  if hooks.forward is None:
@@ -3211,7 +3652,9 @@ def launch(
3211
3652
 
3212
3653
  else:
3213
3654
  # launch
3214
- runtime.core.cuda_launch_kernel(device.context, hooks.forward, bounds.size, kernel_params)
3655
+ runtime.core.cuda_launch_kernel(
3656
+ device.context, hooks.forward, bounds.size, max_blocks, kernel_params
3657
+ )
3215
3658
 
3216
3659
  try:
3217
3660
  runtime.verify_cuda_device(device)
@@ -3221,7 +3664,7 @@ def launch(
3221
3664
 
3222
3665
  # record on tape if one is active
3223
3666
  if runtime.tape and record_tape:
3224
- runtime.tape.record_launch(kernel, dim, inputs, outputs, device)
3667
+ runtime.tape.record_launch(kernel, dim, max_blocks, inputs, outputs, device)
3225
3668
 
3226
3669
 
3227
3670
  def synchronize():
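record_launch() now stores max_blocks as well, so a tape replay uses the same block limit for the backward pass. The usual Tape pattern is unchanged; a minimal sketch:

    import warp as wp

    wp.init()

    @wp.kernel
    def square(x: wp.array(dtype=float), y: wp.array(dtype=float)):
        i = wp.tid()
        y[i] = x[i] * x[i]

    x = wp.array([3.0], dtype=float, requires_grad=True)
    y = wp.zeros(1, dtype=float, requires_grad=True)

    tape = wp.Tape()
    with tape:
        wp.launch(square, dim=1, inputs=[x], outputs=[y], max_blocks=0)

    tape.backward(grads={y: wp.array([1.0], dtype=float)})
    print(x.grad)   # gradient of x*x at x=3 is expected to be ~6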
@@ -3231,7 +3674,7 @@ def synchronize():
3231
3674
  or memory copies have completed.
3232
3675
  """
3233
3676
 
3234
- if is_cuda_available():
3677
+ if is_cuda_driver_initialized():
3235
3678
  # save the original context to avoid side effects
3236
3679
  saved_context = runtime.core.cuda_context_get_current()
3237
3680
 
@@ -3281,7 +3724,7 @@ def synchronize_stream(stream_or_device=None):
3281
3724
  runtime.core.cuda_stream_synchronize(stream.device.context, stream.cuda_stream)
3282
3725
 
3283
3726
 
3284
- def force_load(device: Union[Device, str] = None, modules: List[Module] = None):
3727
+ def force_load(device: Union[Device, str, List[Device], List[str]] = None, modules: List[Module] = None):
3285
3728
  """Force user-defined kernels to be compiled and loaded
3286
3729
 
3287
3730
  Args:
@@ -3289,12 +3732,14 @@ def force_load(device: Union[Device, str] = None, modules: List[Module] = None):
3289
3732
  modules: List of modules to load. If None, load all imported modules.
3290
3733
  """
3291
3734
 
3292
- if is_cuda_available():
3735
+ if is_cuda_driver_initialized():
3293
3736
  # save original context to avoid side effects
3294
3737
  saved_context = runtime.core.cuda_context_get_current()
3295
3738
 
3296
3739
  if device is None:
3297
3740
  devices = get_devices()
3741
+ elif isinstance(device, list):
3742
+ devices = [get_device(device_item) for device_item in device]
3298
3743
  else:
3299
3744
  devices = [get_device(device)]
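force_load() now also accepts a list of devices, per the isinstance(device, list) branch above; a short sketch (hypothetical device names):

    import warp as wp

    wp.init()
    # compile and load all imported modules for a chosen set of devices up front
    wp.force_load(device=["cpu", "cuda:0"])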
3300
3745
 
@@ -3386,7 +3831,7 @@ def get_module_options(module: Optional[Any] = None) -> Dict[str, Any]:
3386
3831
  return get_module(m.__name__).options
3387
3832
 
3388
3833
 
3389
- def capture_begin(device: Devicelike = None, stream=None, force_module_load=True):
3834
+ def capture_begin(device: Devicelike = None, stream=None, force_module_load=None):
3390
3835
  """Begin capture of a CUDA graph
3391
3836
 
3392
3837
  Captures all subsequent kernel launches and memory operations on CUDA devices.
@@ -3400,7 +3845,10 @@ def capture_begin(device: Devicelike = None, stream=None, force_module_load=True
3400
3845
 
3401
3846
  """
3402
3847
 
3403
- if warp.config.verify_cuda == True:
3848
+ if force_module_load is None:
3849
+ force_module_load = warp.config.graph_capture_module_load_default
3850
+
3851
+ if warp.config.verify_cuda:
3404
3852
  raise RuntimeError("Cannot use CUDA error verification during graph capture")
3405
3853
 
3406
3854
  if stream is not None:
@@ -3415,6 +3863,9 @@ def capture_begin(device: Devicelike = None, stream=None, force_module_load=True
3415
3863
 
3416
3864
  device.is_capturing = True
3417
3865
 
3866
+ # disable garbage collection to avoid older allocations getting collected during graph capture
3867
+ gc.disable()
3868
+
3418
3869
  with warp.ScopedStream(stream):
3419
3870
  runtime.core.cuda_graph_begin_capture(device.context)
3420
3871
 
@@ -3438,6 +3889,9 @@ def capture_end(device: Devicelike = None, stream=None) -> Graph:
3438
3889
 
3439
3890
  device.is_capturing = False
3440
3891
 
3892
+ # re-enable GC
3893
+ gc.enable()
3894
+
3441
3895
  if graph is None:
3442
3896
  raise RuntimeError(
3443
3897
  "Error occurred during CUDA graph capture. This could be due to an unintended allocation or CPU/GPU synchronization event."
@@ -3557,6 +4011,16 @@ def copy(
3557
4011
  if src_elem_size != dst_elem_size:
3558
4012
  raise RuntimeError("Incompatible array data types")
3559
4013
 
4014
+ # can't copy to/from fabric arrays of arrays, because they are jagged arrays of arbitrary lengths
4015
+ # TODO?
4016
+ if (
4017
+ isinstance(src, (warp.fabricarray, warp.indexedfabricarray))
4018
+ and src.ndim > 1
4019
+ or isinstance(dest, (warp.fabricarray, warp.indexedfabricarray))
4020
+ and dest.ndim > 1
4021
+ ):
4022
+ raise RuntimeError("Copying to/from Fabric arrays of arrays is not supported")
4023
+
3560
4024
  src_desc = src.__ctype__()
3561
4025
  dst_desc = dest.__ctype__()
3562
4026
  src_ptr = ctypes.pointer(src_desc)
@@ -3592,6 +4056,10 @@ def type_str(t):
3592
4056
  return f"Array[{type_str(t.dtype)}]"
3593
4057
  elif isinstance(t, warp.indexedarray):
3594
4058
  return f"IndexedArray[{type_str(t.dtype)}]"
4059
+ elif isinstance(t, warp.fabricarray):
4060
+ return f"FabricArray[{type_str(t.dtype)}]"
4061
+ elif isinstance(t, warp.indexedfabricarray):
4062
+ return f"IndexedFabricArray[{type_str(t.dtype)}]"
3595
4063
  elif hasattr(t, "_wp_generic_type_str_"):
3596
4064
  generic_type = t._wp_generic_type_str_
3597
4065
 
@@ -3618,7 +4086,7 @@ def type_str(t):
3618
4086
  return t.__name__
3619
4087
 
3620
4088
 
3621
- def print_function(f, file, noentry=False):
4089
+ def print_function(f, file, noentry=False): # pragma: no cover
3622
4090
  """Writes a function definition to a file for use in reST documentation
3623
4091
 
3624
4092
  Args:
@@ -3642,7 +4110,7 @@ def print_function(f, file, noentry=False):
3642
4110
  # todo: construct a default value for each of the functions args
3643
4111
  # so we can generate the return type for overloaded functions
3644
4112
  return_type = " -> " + type_str(f.value_func(None, None, None))
3645
- except:
4113
+ except Exception:
3646
4114
  pass
3647
4115
 
3648
4116
  print(f".. function:: {f.key}({args}){return_type}", file=file)
@@ -3663,7 +4131,7 @@ def print_function(f, file, noentry=False):
3663
4131
  return True
3664
4132
 
3665
4133
 
3666
- def print_builtins(file):
4134
+ def export_functions_rst(file): # pragma: no cover
3667
4135
  header = (
3668
4136
  "..\n"
3669
4137
  " Autogenerated File - Do not edit. Run build_docs.py to generate.\n"
@@ -3683,6 +4151,8 @@ def print_builtins(file):
3683
4151
 
3684
4152
  for t in warp.types.scalar_types:
3685
4153
  print(f".. class:: {t.__name__}", file=file)
4154
+ # Manually add wp.bool since it's inconvenient to add to wp.types.scalar_types:
4155
+ print(f".. class:: {warp.types.bool.__name__}", file=file)
3686
4156
 
3687
4157
  print("\n\nVector Types", file=file)
3688
4158
  print("------------", file=file)
@@ -3693,14 +4163,22 @@ def print_builtins(file):
3693
4163
  print("\nGeneric Types", file=file)
3694
4164
  print("-------------", file=file)
3695
4165
 
3696
- print(f".. class:: Int", file=file)
3697
- print(f".. class:: Float", file=file)
3698
- print(f".. class:: Scalar", file=file)
3699
- print(f".. class:: Vector", file=file)
3700
- print(f".. class:: Matrix", file=file)
3701
- print(f".. class:: Quaternion", file=file)
3702
- print(f".. class:: Transformation", file=file)
3703
- print(f".. class:: Array", file=file)
4166
+ print(".. class:: Int", file=file)
4167
+ print(".. class:: Float", file=file)
4168
+ print(".. class:: Scalar", file=file)
4169
+ print(".. class:: Vector", file=file)
4170
+ print(".. class:: Matrix", file=file)
4171
+ print(".. class:: Quaternion", file=file)
4172
+ print(".. class:: Transformation", file=file)
4173
+ print(".. class:: Array", file=file)
4174
+
4175
+ print("\nQuery Types", file=file)
4176
+ print("-------------", file=file)
4177
+ print(".. autoclass:: bvh_query_t", file=file)
4178
+ print(".. autoclass:: hash_grid_query_t", file=file)
4179
+ print(".. autoclass:: mesh_query_aabb_t", file=file)
4180
+ print(".. autoclass:: mesh_query_point_t", file=file)
4181
+ print(".. autoclass:: mesh_query_ray_t", file=file)
3704
4182
 
3705
4183
  # build dictionary of all functions by group
3706
4184
  groups = {}
@@ -3735,7 +4213,7 @@ def print_builtins(file):
3735
4213
  print(".. [1] Note: function gradients not implemented for backpropagation.", file=file)
3736
4214
 
3737
4215
 
3738
- def export_stubs(file):
4216
+ def export_stubs(file): # pragma: no cover
3739
4217
  """Generates stub file for auto-complete of builtin functions"""
3740
4218
 
3741
4219
  import textwrap
@@ -3767,6 +4245,8 @@ def export_stubs(file):
3767
4245
  print("Quaternion = Generic[Float]", file=file)
3768
4246
  print("Transformation = Generic[Float]", file=file)
3769
4247
  print("Array = Generic[DType]", file=file)
4248
+ print("FabricArray = Generic[DType]", file=file)
4249
+ print("IndexedFabricArray = Generic[DType]", file=file)
3770
4250
 
3771
4251
  # prepend __init__.py
3772
4252
  with open(os.path.join(os.path.dirname(file.name), "__init__.py")) as header_file:
@@ -3783,7 +4263,7 @@ def export_stubs(file):
3783
4263
 
3784
4264
  return_str = ""
3785
4265
 
3786
- if f.export == False or f.hidden == True: # or f.generic:
4266
+ if not f.export or f.hidden: # or f.generic:
3787
4267
  continue
3788
4268
 
3789
4269
  try:
@@ -3793,29 +4273,42 @@ def export_stubs(file):
3793
4273
  if return_type:
3794
4274
  return_str = " -> " + type_str(return_type)
3795
4275
 
3796
- except:
4276
+ except Exception:
3797
4277
  pass
3798
4278
 
3799
4279
  print("@over", file=file)
3800
4280
  print(f"def {f.key}({args}){return_str}:", file=file)
3801
- print(f' """', file=file)
4281
+ print(' """', file=file)
3802
4282
  print(textwrap.indent(text=f.doc, prefix=" "), file=file)
3803
- print(f' """', file=file)
3804
- print(f" ...\n\n", file=file)
4283
+ print(' """', file=file)
4284
+ print(" ...\n\n", file=file)
3805
4285
 
3806
4286
 
3807
- def export_builtins(file):
3808
- def ctype_str(t):
4287
+ def export_builtins(file: io.TextIOBase): # pragma: no cover
4288
+ def ctype_arg_str(t):
3809
4289
  if isinstance(t, int):
3810
4290
  return "int"
3811
4291
  elif isinstance(t, float):
3812
4292
  return "float"
4293
+ elif t in warp.types.vector_types:
4294
+ return f"{t.__name__}&"
3813
4295
  else:
3814
4296
  return t.__name__
3815
4297
 
4298
+ def ctype_ret_str(t):
4299
+ if isinstance(t, int):
4300
+ return "int"
4301
+ elif isinstance(t, float):
4302
+ return "float"
4303
+ else:
4304
+ return t.__name__
4305
+
4306
+ file.write("namespace wp {\n\n")
4307
+ file.write('extern "C" {\n\n')
4308
+
3816
4309
  for k, g in builtin_functions.items():
3817
4310
  for f in g.overloads:
3818
- if f.export == False or f.generic:
4311
+ if not f.export or f.generic:
3819
4312
  continue
3820
4313
 
3821
4314
  simple = True
@@ -3829,7 +4322,7 @@ def export_builtins(file):
3829
4322
  if not simple or f.variadic:
3830
4323
  continue
3831
4324
 
3832
- args = ", ".join(f"{ctype_str(v)} {k}" for k, v in f.input_types.items())
4325
+ args = ", ".join(f"{ctype_arg_str(v)} {k}" for k, v in f.input_types.items())
3833
4326
  params = ", ".join(f.input_types.keys())
3834
4327
 
3835
4328
  return_type = ""
@@ -3837,25 +4330,25 @@ def export_builtins(file):
3837
4330
  try:
3838
4331
  # todo: construct a default value for each of the functions args
3839
4332
  # so we can generate the return type for overloaded functions
3840
- return_type = ctype_str(f.value_func(None, None, None))
3841
- except:
4333
+ return_type = ctype_ret_str(f.value_func(None, None, None))
4334
+ except Exception:
3842
4335
  continue
3843
4336
 
3844
4337
  if return_type.startswith("Tuple"):
3845
4338
  continue
3846
4339
 
3847
4340
  if args == "":
3848
- print(
3849
- f"WP_API void {f.mangled_name}({return_type}* ret) {{ *ret = wp::{f.key}({params}); }}", file=file
3850
- )
4341
+ file.write(f"WP_API void {f.mangled_name}({return_type}* ret) {{ *ret = wp::{f.key}({params}); }}\n")
3851
4342
  elif return_type == "None":
3852
- print(f"WP_API void {f.mangled_name}({args}) {{ wp::{f.key}({params}); }}", file=file)
4343
+ file.write(f"WP_API void {f.mangled_name}({args}) {{ wp::{f.key}({params}); }}\n")
3853
4344
  else:
3854
- print(
3855
- f"WP_API void {f.mangled_name}({args}, {return_type}* ret) {{ *ret = wp::{f.key}({params}); }}",
3856
- file=file,
4345
+ file.write(
4346
+ f"WP_API void {f.mangled_name}({args}, {return_type}* ret) {{ *ret = wp::{f.key}({params}); }}\n"
3857
4347
  )
3858
4348
 
4349
+ file.write('\n} // extern "C"\n\n')
4350
+ file.write("} // namespace wp\n")
4351
+
3859
4352
 
3860
4353
  # initialize global runtime
3861
4354
  runtime = None