warp-lang 0.9.0-py3-none-win_amd64.whl → 0.11.0-py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of warp-lang might be problematic.

Files changed (315)
  1. warp/__init__.py +15 -7
  2. warp/__init__.pyi +1 -0
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +22 -443
  6. warp/build_dll.py +384 -0
  7. warp/builtins.py +998 -488
  8. warp/codegen.py +1307 -739
  9. warp/config.py +5 -3
  10. warp/constants.py +6 -0
  11. warp/context.py +1291 -548
  12. warp/dlpack.py +31 -31
  13. warp/fabric.py +326 -0
  14. warp/fem/__init__.py +27 -0
  15. warp/fem/cache.py +389 -0
  16. warp/fem/dirichlet.py +181 -0
  17. warp/fem/domain.py +263 -0
  18. warp/fem/field/__init__.py +101 -0
  19. warp/fem/field/field.py +149 -0
  20. warp/fem/field/nodal_field.py +299 -0
  21. warp/fem/field/restriction.py +21 -0
  22. warp/fem/field/test.py +181 -0
  23. warp/fem/field/trial.py +183 -0
  24. warp/fem/geometry/__init__.py +19 -0
  25. warp/fem/geometry/closest_point.py +70 -0
  26. warp/fem/geometry/deformed_geometry.py +271 -0
  27. warp/fem/geometry/element.py +744 -0
  28. warp/fem/geometry/geometry.py +186 -0
  29. warp/fem/geometry/grid_2d.py +373 -0
  30. warp/fem/geometry/grid_3d.py +435 -0
  31. warp/fem/geometry/hexmesh.py +953 -0
  32. warp/fem/geometry/partition.py +376 -0
  33. warp/fem/geometry/quadmesh_2d.py +532 -0
  34. warp/fem/geometry/tetmesh.py +840 -0
  35. warp/fem/geometry/trimesh_2d.py +577 -0
  36. warp/fem/integrate.py +1616 -0
  37. warp/fem/operator.py +191 -0
  38. warp/fem/polynomial.py +213 -0
  39. warp/fem/quadrature/__init__.py +2 -0
  40. warp/fem/quadrature/pic_quadrature.py +245 -0
  41. warp/fem/quadrature/quadrature.py +294 -0
  42. warp/fem/space/__init__.py +292 -0
  43. warp/fem/space/basis_space.py +489 -0
  44. warp/fem/space/collocated_function_space.py +105 -0
  45. warp/fem/space/dof_mapper.py +236 -0
  46. warp/fem/space/function_space.py +145 -0
  47. warp/fem/space/grid_2d_function_space.py +267 -0
  48. warp/fem/space/grid_3d_function_space.py +306 -0
  49. warp/fem/space/hexmesh_function_space.py +352 -0
  50. warp/fem/space/partition.py +350 -0
  51. warp/fem/space/quadmesh_2d_function_space.py +369 -0
  52. warp/fem/space/restriction.py +160 -0
  53. warp/fem/space/shape/__init__.py +15 -0
  54. warp/fem/space/shape/cube_shape_function.py +738 -0
  55. warp/fem/space/shape/shape_function.py +103 -0
  56. warp/fem/space/shape/square_shape_function.py +611 -0
  57. warp/fem/space/shape/tet_shape_function.py +567 -0
  58. warp/fem/space/shape/triangle_shape_function.py +429 -0
  59. warp/fem/space/tetmesh_function_space.py +292 -0
  60. warp/fem/space/topology.py +295 -0
  61. warp/fem/space/trimesh_2d_function_space.py +221 -0
  62. warp/fem/types.py +77 -0
  63. warp/fem/utils.py +495 -0
  64. warp/native/array.h +164 -55
  65. warp/native/builtin.h +150 -174
  66. warp/native/bvh.cpp +75 -328
  67. warp/native/bvh.cu +406 -23
  68. warp/native/bvh.h +37 -45
  69. warp/native/clang/clang.cpp +136 -24
  70. warp/native/crt.cpp +1 -76
  71. warp/native/crt.h +111 -104
  72. warp/native/cuda_crt.h +1049 -0
  73. warp/native/cuda_util.cpp +15 -3
  74. warp/native/cuda_util.h +3 -1
  75. warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
  76. warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
  77. warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
  78. warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
  79. warp/native/cutlass/tools/library/scripts/library.py +799 -0
  80. warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
  81. warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
  82. warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
  83. warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
  84. warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
  85. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
  86. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
  87. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
  88. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
  89. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
  90. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
  91. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
  92. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
  93. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
  94. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
  95. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
  96. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
  97. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
  98. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
  99. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
  100. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
  101. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
  102. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
  103. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
  104. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
  105. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
  106. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
  107. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
  108. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
  109. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
  110. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
  111. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
  112. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
  113. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
  114. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
  115. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
  116. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
  117. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  118. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  119. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
  120. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
  121. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
  122. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  123. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  124. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
  125. warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
  126. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
  127. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
  128. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
  129. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
  130. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
  131. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
  132. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
  133. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
  134. warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
  135. warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
  136. warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
  137. warp/native/cutlass/tools/library/scripts/rt.py +796 -0
  138. warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
  139. warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
  140. warp/native/cutlass_gemm.cu +5 -3
  141. warp/native/exports.h +1240 -949
  142. warp/native/fabric.h +228 -0
  143. warp/native/hashgrid.cpp +4 -4
  144. warp/native/hashgrid.h +22 -2
  145. warp/native/initializer_array.h +2 -2
  146. warp/native/intersect.h +22 -7
  147. warp/native/intersect_adj.h +8 -8
  148. warp/native/intersect_tri.h +13 -16
  149. warp/native/marching.cu +157 -161
  150. warp/native/mat.h +119 -19
  151. warp/native/matnn.h +2 -2
  152. warp/native/mesh.cpp +108 -83
  153. warp/native/mesh.cu +243 -6
  154. warp/native/mesh.h +1547 -458
  155. warp/native/nanovdb/NanoVDB.h +1 -1
  156. warp/native/noise.h +272 -329
  157. warp/native/quat.h +51 -8
  158. warp/native/rand.h +45 -35
  159. warp/native/range.h +6 -2
  160. warp/native/reduce.cpp +157 -0
  161. warp/native/reduce.cu +348 -0
  162. warp/native/runlength_encode.cpp +62 -0
  163. warp/native/runlength_encode.cu +46 -0
  164. warp/native/scan.cu +11 -13
  165. warp/native/scan.h +1 -0
  166. warp/native/solid_angle.h +442 -0
  167. warp/native/sort.cpp +13 -0
  168. warp/native/sort.cu +9 -1
  169. warp/native/sparse.cpp +338 -0
  170. warp/native/sparse.cu +545 -0
  171. warp/native/spatial.h +2 -2
  172. warp/native/temp_buffer.h +30 -0
  173. warp/native/vec.h +126 -24
  174. warp/native/volume.h +120 -0
  175. warp/native/warp.cpp +658 -53
  176. warp/native/warp.cu +660 -68
  177. warp/native/warp.h +112 -12
  178. warp/optim/__init__.py +1 -0
  179. warp/optim/linear.py +922 -0
  180. warp/optim/sgd.py +92 -0
  181. warp/render/render_opengl.py +392 -152
  182. warp/render/render_usd.py +11 -11
  183. warp/sim/__init__.py +2 -2
  184. warp/sim/articulation.py +385 -185
  185. warp/sim/collide.py +21 -8
  186. warp/sim/import_mjcf.py +297 -106
  187. warp/sim/import_urdf.py +389 -210
  188. warp/sim/import_usd.py +198 -97
  189. warp/sim/inertia.py +17 -18
  190. warp/sim/integrator_euler.py +14 -8
  191. warp/sim/integrator_xpbd.py +161 -19
  192. warp/sim/model.py +795 -291
  193. warp/sim/optimizer.py +2 -6
  194. warp/sim/render.py +65 -3
  195. warp/sim/utils.py +3 -0
  196. warp/sparse.py +1227 -0
  197. warp/stubs.py +665 -223
  198. warp/tape.py +66 -15
  199. warp/tests/__main__.py +3 -6
  200. warp/tests/assets/curlnoise_golden.npy +0 -0
  201. warp/tests/assets/pnoise_golden.npy +0 -0
  202. warp/tests/assets/torus.usda +105 -105
  203. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  204. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  205. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  206. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  207. warp/tests/aux_test_unresolved_func.py +14 -0
  208. warp/tests/aux_test_unresolved_symbol.py +14 -0
  209. warp/tests/disabled_kinematics.py +239 -0
  210. warp/tests/run_coverage_serial.py +31 -0
  211. warp/tests/test_adam.py +103 -106
  212. warp/tests/test_arithmetic.py +128 -74
  213. warp/tests/test_array.py +1497 -211
  214. warp/tests/test_array_reduce.py +150 -0
  215. warp/tests/test_atomic.py +64 -28
  216. warp/tests/test_bool.py +99 -0
  217. warp/tests/test_builtins_resolution.py +1292 -0
  218. warp/tests/test_bvh.py +75 -43
  219. warp/tests/test_closest_point_edge_edge.py +54 -57
  220. warp/tests/test_codegen.py +233 -128
  221. warp/tests/test_compile_consts.py +28 -20
  222. warp/tests/test_conditional.py +108 -24
  223. warp/tests/test_copy.py +10 -12
  224. warp/tests/test_ctypes.py +112 -88
  225. warp/tests/test_dense.py +21 -14
  226. warp/tests/test_devices.py +98 -0
  227. warp/tests/test_dlpack.py +136 -108
  228. warp/tests/test_examples.py +277 -0
  229. warp/tests/test_fabricarray.py +955 -0
  230. warp/tests/test_fast_math.py +15 -11
  231. warp/tests/test_fem.py +1271 -0
  232. warp/tests/test_fp16.py +53 -19
  233. warp/tests/test_func.py +187 -74
  234. warp/tests/test_generics.py +194 -49
  235. warp/tests/test_grad.py +180 -116
  236. warp/tests/test_grad_customs.py +176 -0
  237. warp/tests/test_hash_grid.py +52 -37
  238. warp/tests/test_import.py +10 -23
  239. warp/tests/test_indexedarray.py +577 -24
  240. warp/tests/test_intersect.py +18 -9
  241. warp/tests/test_large.py +141 -0
  242. warp/tests/test_launch.py +251 -15
  243. warp/tests/test_lerp.py +64 -65
  244. warp/tests/test_linear_solvers.py +154 -0
  245. warp/tests/test_lvalue.py +493 -0
  246. warp/tests/test_marching_cubes.py +12 -13
  247. warp/tests/test_mat.py +508 -2778
  248. warp/tests/test_mat_lite.py +115 -0
  249. warp/tests/test_mat_scalar_ops.py +2889 -0
  250. warp/tests/test_math.py +103 -9
  251. warp/tests/test_matmul.py +305 -69
  252. warp/tests/test_matmul_lite.py +410 -0
  253. warp/tests/test_mesh.py +71 -14
  254. warp/tests/test_mesh_query_aabb.py +41 -25
  255. warp/tests/test_mesh_query_point.py +325 -34
  256. warp/tests/test_mesh_query_ray.py +39 -22
  257. warp/tests/test_mlp.py +30 -22
  258. warp/tests/test_model.py +92 -89
  259. warp/tests/test_modules_lite.py +39 -0
  260. warp/tests/test_multigpu.py +88 -114
  261. warp/tests/test_noise.py +12 -11
  262. warp/tests/test_operators.py +16 -20
  263. warp/tests/test_options.py +11 -11
  264. warp/tests/test_pinned.py +17 -18
  265. warp/tests/test_print.py +32 -11
  266. warp/tests/test_quat.py +275 -129
  267. warp/tests/test_rand.py +18 -16
  268. warp/tests/test_reload.py +38 -34
  269. warp/tests/test_rounding.py +50 -43
  270. warp/tests/test_runlength_encode.py +190 -0
  271. warp/tests/test_smoothstep.py +9 -11
  272. warp/tests/test_snippet.py +143 -0
  273. warp/tests/test_sparse.py +460 -0
  274. warp/tests/test_spatial.py +276 -243
  275. warp/tests/test_streams.py +110 -85
  276. warp/tests/test_struct.py +331 -85
  277. warp/tests/test_tape.py +39 -21
  278. warp/tests/test_torch.py +118 -89
  279. warp/tests/test_transient_module.py +12 -13
  280. warp/tests/test_types.py +614 -0
  281. warp/tests/test_utils.py +494 -0
  282. warp/tests/test_vec.py +354 -1987
  283. warp/tests/test_vec_lite.py +73 -0
  284. warp/tests/test_vec_scalar_ops.py +2099 -0
  285. warp/tests/test_volume.py +457 -293
  286. warp/tests/test_volume_write.py +124 -134
  287. warp/tests/unittest_serial.py +35 -0
  288. warp/tests/unittest_suites.py +341 -0
  289. warp/tests/unittest_utils.py +568 -0
  290. warp/tests/unused_test_misc.py +71 -0
  291. warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
  292. warp/thirdparty/appdirs.py +36 -45
  293. warp/thirdparty/unittest_parallel.py +549 -0
  294. warp/torch.py +72 -30
  295. warp/types.py +1744 -713
  296. warp/utils.py +360 -350
  297. warp_lang-0.11.0.dist-info/LICENSE.md +36 -0
  298. warp_lang-0.11.0.dist-info/METADATA +238 -0
  299. warp_lang-0.11.0.dist-info/RECORD +332 -0
  300. {warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
  301. warp/bin/warp-clang.exp +0 -0
  302. warp/bin/warp-clang.lib +0 -0
  303. warp/bin/warp.exp +0 -0
  304. warp/bin/warp.lib +0 -0
  305. warp/tests/test_all.py +0 -215
  306. warp/tests/test_array_scan.py +0 -60
  307. warp/tests/test_base.py +0 -208
  308. warp/tests/test_unresolved_func.py +0 -7
  309. warp/tests/test_unresolved_symbol.py +0 -7
  310. warp_lang-0.9.0.dist-info/METADATA +0 -20
  311. warp_lang-0.9.0.dist-info/RECORD +0 -177
  312. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  313. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  314. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  315. {warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0
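
The file list alone shows where this release grew: a finite-element toolkit (warp/fem/), a sparse-matrix module (warp/sparse.py), an optimizers package (warp/optim/, with sgd.py and linear.py), Fabric interop (warp/fabric.py), and a vendored copy of the CUTLASS library scripts. A minimal smoke test, importing only top-level modules that appear in the list above (their contents are not shown in this diff, so nothing beyond the imports is assumed):

import warp as wp
import warp.fem     # finite-element toolkit added under warp/fem/
import warp.sparse  # sparse-matrix module added as warp/sparse.py
import warp.optim   # optimizers package added under warp/optim/

wp.init()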
warp/codegen.py CHANGED
@@ -7,23 +7,40 @@
 
 from __future__ import annotations
 
-import re
-import sys
 import ast
-import inspect
+import builtins
 import ctypes
+import inspect
+import math
+import re
+import sys
 import textwrap
 import types
+from typing import Any, Callable, Mapping
 
-import numpy as np
+import warp.config
+from warp.types import *
 
-from typing import Any
-from typing import Callable
-from typing import Mapping
-from typing import Union
 
-from warp.types import *
-import warp.config
+class WarpCodegenError(RuntimeError):
+    def __init__(self, message):
+        super().__init__(message)
+
+
+class WarpCodegenTypeError(TypeError):
+    def __init__(self, message):
+        super().__init__(message)
+
+
+class WarpCodegenAttributeError(AttributeError):
+    def __init__(self, message):
+        super().__init__(message)
+
+
+class WarpCodegenKeyError(KeyError):
+    def __init__(self, message):
+        super().__init__(message)
+
 
 # map operator to function name
 builtin_operators = {}
@@ -57,6 +74,19 @@ builtin_operators[ast.Invert] = "invert"
 builtin_operators[ast.LShift] = "lshift"
 builtin_operators[ast.RShift] = "rshift"
 
+comparison_chain_strings = [
+    builtin_operators[ast.Gt],
+    builtin_operators[ast.Lt],
+    builtin_operators[ast.LtE],
+    builtin_operators[ast.GtE],
+    builtin_operators[ast.Eq],
+    builtin_operators[ast.NotEq],
+]
+
+
+def op_str_is_chainable(op: str) -> builtins.bool:
+    return op in comparison_chain_strings
+
 
 def get_annotations(obj: Any) -> Mapping[str, Any]:
     """Alternative to `inspect.get_annotations()` for Python 3.9 and older."""
@@ -67,97 +97,156 @@ def get_annotations(obj: Any) -> Mapping[str, Any]:
     return getattr(obj, "__annotations__", {})
 
 
-def _get_struct_instance_ctype(
-    inst: StructInstance,
-    parent_ctype: Union[StructInstance, None],
-    parent_field: Union[str, None],
-) -> ctypes.Structure:
-    if inst._struct_.ctype._fields_ == [("_dummy_", ctypes.c_int)]:
-        return inst._struct_.ctype()
+def struct_instance_repr_recursive(inst: StructInstance, depth: int) -> str:
+    indent = "\t"
+
+    # handle empty structs
+    if len(inst._cls.vars) == 0:
+        return f"{inst._cls.key}()"
 
-    if parent_ctype is None:
-        inst_ctype = inst._struct_.ctype()
-    else:
-        inst_ctype = getattr(parent_ctype, parent_field)
+    lines = []
+    lines.append(f"{inst._cls.key}(")
+
+    for field_name, _ in inst._cls.ctype._fields_:
+        field_value = getattr(inst, field_name, None)
+
+        if isinstance(field_value, StructInstance):
+            field_value = struct_instance_repr_recursive(field_value, depth + 1)
+
+        lines.append(f"{indent * (depth + 1)}{field_name}={field_value},")
 
-    for field_name, _ in inst_ctype._fields_:
-        value = getattr(inst, field_name, None)
+    lines.append(f"{indent * depth})")
+    return "\n".join(lines)
+
+
+class StructInstance:
+    def __init__(self, cls: Struct, ctype):
+        super().__setattr__("_cls", cls)
+
+        # maintain a c-types object for the top-level instance the struct
+        if not ctype:
+            super().__setattr__("_ctype", cls.ctype())
+        else:
+            super().__setattr__("_ctype", ctype)
+
+        # create Python attributes for each of the struct's variables
+        for field, var in cls.vars.items():
+            if isinstance(var.type, warp.codegen.Struct):
+                self.__dict__[field] = StructInstance(var.type, getattr(self._ctype, field))
+            elif isinstance(var.type, warp.types.array):
+                self.__dict__[field] = None
+            else:
+                self.__dict__[field] = var.type()
 
-        var_type = inst._struct_.vars[field_name].type
-        if isinstance(var_type, array):
+    def __setattr__(self, name, value):
+        if name not in self._cls.vars:
+            raise RuntimeError(f"Trying to set Warp struct attribute that does not exist {name}")
+
+        var = self._cls.vars[name]
+
+        # update our ctype flat copy
+        if isinstance(var.type, array):
             if value is None:
                 # create array with null pointer
-                setattr(inst_ctype, field_name, array_t())
+                setattr(self._ctype, name, array_t())
             else:
                 # wp.array
                 assert isinstance(value, array)
-                assert (
-                    value.dtype == var_type.dtype
-                ), "assign to struct member variable {} failed, expected type {}, got type {}".format(
-                    field_name, var_type.dtype, value.dtype
+                assert types_equal(
+                    value.dtype, var.type.dtype
+                ), f"assign to struct member variable {name} failed, expected type {type_repr(var.type.dtype)}, got type {type_repr(value.dtype)}"
+                setattr(self._ctype, name, value.__ctype__())
+
+        elif isinstance(var.type, Struct):
+            # assign structs by-value, otherwise we would have problematic cases transferring ownership
+            # of the underlying ctypes data between shared Python struct instances
+
+            if not isinstance(value, StructInstance):
+                raise RuntimeError(
+                    f"Trying to assign a non-structure value to a struct attribute with type: {self._cls.key}"
                 )
-                setattr(inst_ctype, field_name, value.__ctype__())
-        elif isinstance(var_type, Struct):
-            if value is None:
-                _get_struct_instance_ctype(StructInstance(var_type), inst_ctype, field_name)
-            else:
-                _get_struct_instance_ctype(value, inst_ctype, field_name)
-        elif issubclass(var_type, ctypes.Array):
+
+            # destination attribution on self
+            dest = getattr(self, name)
+
+            if dest._cls.key is not value._cls.key:
+                raise RuntimeError(
+                    f"Trying to assign a structure of type {value._cls.key} to an attribute of {self._cls.key}"
+                )
+
+            # update all nested ctype vars by deep copy
+            for n in dest._cls.vars:
+                setattr(dest, n, getattr(value, n))
+
+            # early return to avoid updating our Python StructInstance
+            return
+
+        elif issubclass(var.type, ctypes.Array):
             # vector/matrix type, e.g. vec3
             if value is None:
-                setattr(inst_ctype, field_name, var_type())
-            elif types_equal(type(value), var_type):
-                setattr(inst_ctype, field_name, value)
+                setattr(self._ctype, name, var.type())
+            elif types_equal(type(value), var.type):
+                setattr(self._ctype, name, value)
             else:
                 # conversion from list/tuple, ndarray, etc.
-                setattr(inst_ctype, field_name, var_type(value))
+                setattr(self._ctype, name, var.type(value))
+
         else:
             # primitive type
             if value is None:
-                setattr(inst_ctype, field_name, var_type._type_())
+                # zero initialize
+                setattr(self._ctype, name, var.type._type_())
             else:
-                setattr(inst_ctype, field_name, var_type._type_(value))
-
-    return inst_ctype
-
-
-def _fmt_struct_instance_repr(inst: StructInstance, depth: int) -> str:
-    indent = "\t"
-
-    if inst._struct_.ctype._fields_ == [("_dummy_", ctypes.c_int)]:
-        return f"{inst._struct_.key}()"
-
-    lines = []
-    lines.append(f"{inst._struct_.key}(")
-
-    for field_name, _ in inst._struct_.ctype._fields_:
-        if field_name == "_dummy_":
-            continue
-
-        field_value = getattr(inst, field_name, None)
-
-        if isinstance(field_value, StructInstance):
-            field_value = _fmt_struct_instance_repr(field_value, depth + 1)
+                if hasattr(value, "_type_"):
+                    # assigning warp type value (e.g.: wp.float32)
+                    value = value.value
+                # float16 needs conversion to uint16 bits
+                if var.type == warp.float16:
+                    setattr(self._ctype, name, float_to_half_bits(value))
+                else:
+                    setattr(self._ctype, name, value)
 
-        lines.append(f"{indent * (depth + 1)}{field_name}={field_value},")
+        # update Python instance
+        super().__setattr__(name, value)
 
-    lines.append(f"{indent * depth})")
-    return "\n".join(lines)
+    def __ctype__(self):
+        return self._ctype
 
+    def __repr__(self):
+        return struct_instance_repr_recursive(self, 0)
 
-class StructInstance:
-    def __init__(self, struct: Struct):
-        self.__dict__["_struct_"] = struct
+    # type description used in numpy structured arrays
+    def numpy_dtype(self):
+        return self._cls.numpy_dtype()
 
-    def __setattr__(self, name, value):
-        assert name in self._struct_.vars, "invalid struct member variable {}".format(name)
-        super().__setattr__(name, value)
+    # value usable in numpy structured arrays of .numpy_dtype(), e.g. (42, 13.37, [1.0, 2.0, 3.0])
+    def numpy_value(self):
+        npvalue = []
+        for name, var in self._cls.vars.items():
+            # get the attribute value
+            value = getattr(self._ctype, name)
 
-    def __ctype__(self):
-        return _get_struct_instance_ctype(self, None, None)
+            if isinstance(var.type, array):
+                # array_t
+                npvalue.append(value.numpy_value())
+            elif isinstance(var.type, Struct):
+                # nested struct
+                npvalue.append(value.numpy_value())
+            elif issubclass(var.type, ctypes.Array):
+                if len(var.type._shape_) == 1:
+                    # vector
+                    npvalue.append(list(value))
+                else:
+                    # matrix
+                    npvalue.append([list(row) for row in value])
+            else:
+                # scalar
+                if var.type == warp.float16:
+                    npvalue.append(half_bits_to_float(value))
+                else:
+                    npvalue.append(value)
 
-    def __repr__(self):
-        return _fmt_struct_instance_repr(self, 0)
+        return tuple(npvalue)
 
 
 class Struct:
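
The rewritten StructInstance keeps a ctypes mirror (_ctype) synchronized on every attribute write, and the Struct branch above assigns nested structs by value via a member-wise deep copy instead of sharing the underlying ctypes buffer. A small sketch of the resulting Python-side semantics (hypothetical struct names):

import warp as wp

@wp.struct
class Inner:
    v: float

@wp.struct
class Outer:
    inner: Inner

a = Inner()
a.v = 1.0

o = Outer()
o.inner = a       # copied field-by-field into o's own ctypes storage
a.v = 2.0         # later writes to a do not leak into o
print(o.inner.v)  # 1.0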
@@ -184,7 +273,7 @@ class Struct:
 
         class StructType(ctypes.Structure):
             # if struct is empty, add a dummy field to avoid launch errors on CPU device ("ffi_prep_cif failed")
-            _fields_ = fields or [("_dummy_", ctypes.c_int)]
+            _fields_ = fields or [("_dummy_", ctypes.c_byte)]
 
         self.ctype = StructType
 
@@ -235,29 +324,108 @@ class Struct:
 
         class NewStructInstance(self.cls, StructInstance):
             def __init__(inst):
-                StructInstance.__init__(inst, self)
+                StructInstance.__init__(inst, self, None)
 
         return NewStructInstance()
 
     def initializer(self):
        return self.default_constructor
 
+    # return structured NumPy dtype, including field names, formats, and offsets
+    def numpy_dtype(self):
+        names = []
+        formats = []
+        offsets = []
+        for name, var in self.vars.items():
+            names.append(name)
+            offsets.append(getattr(self.ctype, name).offset)
+            if isinstance(var.type, array):
+                # array_t
+                formats.append(array_t.numpy_dtype())
+            elif isinstance(var.type, Struct):
+                # nested struct
+                formats.append(var.type.numpy_dtype())
+            elif issubclass(var.type, ctypes.Array):
+                scalar_typestr = type_typestr(var.type._wp_scalar_type_)
+                if len(var.type._shape_) == 1:
+                    # vector
+                    formats.append(f"{var.type._length_}{scalar_typestr}")
+                else:
+                    # matrix
+                    formats.append(f"{var.type._shape_}{scalar_typestr}")
+            else:
+                # scalar
+                formats.append(type_typestr(var.type))
+
+        return {"names": names, "formats": formats, "offsets": offsets, "itemsize": ctypes.sizeof(self.ctype)}
+
+    # constructs a Warp struct instance from a pointer to the ctype
+    def from_ptr(self, ptr):
+        if not ptr:
+            raise RuntimeError("NULL pointer exception")
+
+        # create a new struct instance
+        instance = self()
+
+        for name, var in self.vars.items():
+            offset = getattr(self.ctype, name).offset
+            if isinstance(var.type, array):
+                # We could reconstruct wp.array from array_t, but it's problematic.
+                # There's no guarantee that the original wp.array is still allocated and
+                # no easy way to make a backref.
+                # Instead, we just create a stub annotation, which is not a fully usable array object.
+                setattr(instance, name, array(dtype=var.type.dtype, ndim=var.type.ndim))
+            elif isinstance(var.type, Struct):
+                # nested struct
+                value = var.type.from_ptr(ptr + offset)
+                setattr(instance, name, value)
+            elif issubclass(var.type, ctypes.Array):
+                # vector/matrix
+                value = var.type.from_ptr(ptr + offset)
+                setattr(instance, name, value)
+            else:
+                # scalar
+                cvalue = ctypes.cast(ptr + offset, ctypes.POINTER(var.type._type_)).contents
+                if var.type == warp.float16:
+                    setattr(instance, name, half_bits_to_float(cvalue))
+                else:
+                    setattr(instance, name, cvalue.value)
+
+        return instance
+
+
+class Reference:
+    def __init__(self, value_type):
+        self.value_type = value_type
+
+
+def is_reference(type):
+    return isinstance(type, Reference)
+
+
+def strip_reference(arg):
+    if is_reference(arg):
+        return arg.value_type
+    else:
+        return arg
+
 
 def compute_type_str(base_name, template_params):
-    if template_params is None or len(template_params) == 0:
+    if not template_params:
         return base_name
-    else:
 
-        def param2str(p):
-            if isinstance(p, int):
-                return str(p)
-            return p.__name__
+    def param2str(p):
+        if isinstance(p, int):
+            return str(p)
+        elif hasattr(p, "_type_"):
+            return f"wp::{p.__name__}"
+        return p.__name__
 
-        return f"{base_name}<{','.join(map(param2str, template_params))}>"
+    return f"{base_name}<{','.join(map(param2str, template_params))}>"
 
 
 class Var:
-    def __init__(self, label, type, requires_grad=False, constant=None):
+    def __init__(self, label, type, requires_grad=False, constant=None, prefix=True):
         # convert built-in types to wp types
         if type == float:
             type = float32
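
Struct.numpy_dtype() above mirrors the ctype layout (names, formats, offsets, itemsize), and StructInstance.numpy_value() from the previous hunk emits a matching tuple, so struct data can round-trip through NumPy structured arrays. A minimal sketch (hypothetical struct; assumes only the two methods defined above):

import numpy as np
import warp as wp

@wp.struct
class Particle:
    mass: float
    pos: wp.vec3

p = Particle()
p.mass = 0.5
p.pos = wp.vec3(1.0, 2.0, 3.0)

# structured array whose layout matches the struct's ctype
arr = np.empty(8, dtype=np.dtype(Particle.numpy_dtype()))
arr[0] = p.numpy_value()  # e.g. (0.5, [1.0, 2.0, 3.0])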
@@ -268,26 +436,49 @@ class Var:
         self.type = type
         self.requires_grad = requires_grad
         self.constant = constant
+        self.prefix = prefix
 
     def __str__(self):
         return self.label
 
-    def ctype(self):
-        if is_array(self.type):
-            if hasattr(self.type.dtype, "_wp_generic_type_str_"):
-                dtypestr = compute_type_str(self.type.dtype._wp_generic_type_str_, self.type.dtype._wp_type_params_)
-            elif isinstance(self.type.dtype, Struct):
-                dtypestr = make_full_qualified_name(self.type.dtype.cls)
+    @staticmethod
+    def type_to_ctype(t, value_type=False):
+        if is_array(t):
+            if hasattr(t.dtype, "_wp_generic_type_str_"):
+                dtypestr = compute_type_str(f"wp::{t.dtype._wp_generic_type_str_}", t.dtype._wp_type_params_)
+            elif isinstance(t.dtype, Struct):
+                dtypestr = make_full_qualified_name(t.dtype.cls)
+            elif t.dtype.__name__ in ("bool", "int", "float"):
+                dtypestr = t.dtype.__name__
             else:
-                dtypestr = str(self.type.dtype.__name__)
-            classstr = type(self.type).__name__
+                dtypestr = f"wp::{t.dtype.__name__}"
+            classstr = f"wp::{type(t).__name__}"
             return f"{classstr}_t<{dtypestr}>"
-        elif isinstance(self.type, Struct):
-            return make_full_qualified_name(self.type.cls)
-        elif hasattr(self.type, "_wp_generic_type_str_"):
-            return compute_type_str(self.type._wp_generic_type_str_, self.type._wp_type_params_)
+        elif isinstance(t, Struct):
+            return make_full_qualified_name(t.cls)
+        elif is_reference(t):
+            if not value_type:
+                return Var.type_to_ctype(t.value_type) + "*"
+            else:
+                return Var.type_to_ctype(t.value_type)
+        elif hasattr(t, "_wp_generic_type_str_"):
+            return compute_type_str(f"wp::{t._wp_generic_type_str_}", t._wp_type_params_)
+        elif t.__name__ in ("bool", "int", "float"):
+            return t.__name__
+        else:
+            return f"wp::{t.__name__}"
+
+    def ctype(self, value_type=False):
+        return Var.type_to_ctype(self.type, value_type)
+
+    def emit(self, prefix: str = "var"):
+        if self.prefix:
+            return f"{prefix}_{self.label}"
         else:
-            return str(self.type.__name__)
+            return self.label
+
+    def emit_adj(self):
+        return self.emit("adj")
 
 
 class Block:
@@ -304,33 +495,65 @@ class Block:
         self.vars = []
 
 
+def is_local_value(value) -> bool:
+    """Check whether a variable is defined inside a kernel."""
+    return isinstance(value, (warp.context.Function, Var))
+
+
 class Adjoint:
     # Source code transformer, this class takes a Python function and
     # generates forward and backward SSA forms of the function instructions
 
-    def __init__(adj, func, overload_annotations=None):
+    def __init__(
+        adj,
+        func,
+        overload_annotations=None,
+        is_user_function=False,
+        skip_forward_codegen=False,
+        skip_reverse_codegen=False,
+        custom_reverse_mode=False,
+        custom_reverse_num_input_args=-1,
+        transformers: List[ast.NodeTransformer] = [],
+    ):
         adj.func = func
 
-        # build AST from function object
-        adj.source = inspect.getsource(func)
+        adj.is_user_function = is_user_function
 
-        # get source code lines and line number where function starts
-        adj.raw_source, adj.fun_lineno = inspect.getsourcelines(func)
+        # whether the generation of the forward code is skipped for this function
+        adj.skip_forward_codegen = skip_forward_codegen
+        # whether the generation of the adjoint code is skipped for this function
+        adj.skip_reverse_codegen = skip_reverse_codegen
 
-        # keep track of line number in function code
-        adj.lineno = None
+        # extract name of source file
+        adj.filename = inspect.getsourcefile(func) or "unknown source file"
+        # get source file line number where function starts
+        _, adj.fun_lineno = inspect.getsourcelines(func)
 
+        # get function source code
+        adj.source = inspect.getsource(func)
         # ensures that indented class methods can be parsed as kernels
         adj.source = textwrap.dedent(adj.source)
 
-        # extract name of source file
-        adj.filename = inspect.getsourcefile(func) or "unknown source file"
+        adj.source_lines = adj.source.splitlines()
 
-        # build AST
+        # build AST and apply node transformers
         adj.tree = ast.parse(adj.source)
+        adj.transformers = transformers
+        for transformer in transformers:
+            adj.tree = transformer.visit(adj.tree)
 
         adj.fun_name = adj.tree.body[0].name
 
+        # for keeping track of line number in function code
+        adj.lineno = None
+
+        # whether the forward code shall be used for the reverse pass and a custom
+        # function signature is applied to the reverse version of the function
+        adj.custom_reverse_mode = custom_reverse_mode
+        # the number of function arguments that pertain to the forward function
+        # input arguments (i.e. the number of arguments that are not adjoint arguments)
+        adj.custom_reverse_num_input_args = custom_reverse_num_input_args
+
         # parse argument types
         argspec = inspect.getfullargspec(func)
 
@@ -338,16 +561,17 @@
         if overload_annotations is None:
             # use source-level argument annotations
             if len(argspec.annotations) < len(argspec.args):
-                raise RuntimeError(f"Incomplete argument annotations on function {adj.fun_name}")
+                raise WarpCodegenError(f"Incomplete argument annotations on function {adj.fun_name}")
             adj.arg_types = argspec.annotations
         else:
             # use overload argument annotations
             for arg_name in argspec.args:
                 if arg_name not in overload_annotations:
-                    raise RuntimeError(f"Incomplete overload annotations for function {adj.fun_name}")
+                    raise WarpCodegenError(f"Incomplete overload annotations for function {adj.fun_name}")
             adj.arg_types = overload_annotations.copy()
 
         adj.args = []
+        adj.symbols = {}
 
         for name, type in adj.arg_types.items():
             # skip return hint
@@ -358,8 +582,23 @@
             arg = Var(name, type, False)
             adj.args.append(arg)
 
+            # pre-populate symbol dictionary with function argument names
+            # this is to avoid registering false references to overshadowed modules
+            adj.symbols[name] = arg
+
+        # There are cases where a same module might be rebuilt multiple times,
+        # for example when kernels are nested inside of functions, or when
+        # a kernel's launch raises an exception. Ideally we'd always want to
+        # avoid rebuilding kernels but some corner cases seem to depend on it,
+        # so we only avoid rebuilding kernels that errored out to give a chance
+        # for unit testing errors being spit out from kernels.
+        adj.skip_build = False
+
     # generate function ssa form and adjoint
     def build(adj, builder):
+        if adj.skip_build:
+            return
+
         adj.builder = builder
 
         adj.symbols = {}  # map from symbols to adjoint variables
@@ -373,7 +612,7 @@
         adj.loop_blocks = []
 
         # holds current indent level
-        adj.prefix = ""
+        adj.indentation = ""
 
         # used to generate new label indices
         adj.label_count = 0
@@ -387,20 +626,25 @@
             adj.eval(adj.tree.body[0])
         except Exception as e:
             try:
+                if isinstance(e, KeyError) and getattr(e.args[0], "__module__", None) == "ast":
+                    msg = f'Syntax error: unsupported construct "ast.{e.args[0].__name__}"'
+                else:
+                    msg = "Error"
                 lineno = adj.lineno + adj.fun_lineno
-                line = adj.source.splitlines()[adj.lineno]
-                msg = f'Error while parsing function "{adj.fun_name}" at {adj.filename}:{lineno}:\n{line}\n'
+                line = adj.source_lines[adj.lineno]
+                msg += f' while parsing function "{adj.fun_name}" at {adj.filename}:{lineno}:\n{line}\n'
                 ex, data, traceback = sys.exc_info()
-                e = ex("".join([msg] + list(data.args))).with_traceback(traceback)
+                e = ex(";".join([msg] + [str(a) for a in data.args])).with_traceback(traceback)
             finally:
+                adj.skip_build = True
                 raise e
 
-        for a in adj.args:
-            if isinstance(a.type, Struct):
-                builder.build_struct_recursive(a.type)
-            elif isinstance(a.type, warp.types.array) and isinstance(a.type.dtype, Struct):
-                builder.build_struct_recursive(a.type.dtype)
-
+        if builder is not None:
+            for a in adj.args:
+                if isinstance(a.type, Struct):
+                    builder.build_struct_recursive(a.type)
+                elif isinstance(a.type, warp.types.array) and isinstance(a.type.dtype, Struct):
+                    builder.build_struct_recursive(a.type.dtype)
 
     # code generation methods
     def format_template(adj, template, input_vars, output_var):
@@ -415,44 +659,56 @@
         arg_strs = []
 
         for a in args:
-            if type(a) == warp.context.Function:
+            if isinstance(a, warp.context.Function):
                 # functions don't have a var_ prefix so strip it off here
-                if prefix == "var_":
+                if prefix == "var":
                     arg_strs.append(a.key)
                 else:
-                    arg_strs.append(prefix + a.key)
-
+                    arg_strs.append(f"{prefix}_{a.key}")
+            elif is_reference(a.type):
+                arg_strs.append(f"{prefix}_{a}")
+            elif isinstance(a, Var):
+                arg_strs.append(a.emit(prefix))
             else:
-                arg_strs.append(prefix + str(a))
+                raise WarpCodegenTypeError(f"Arguments must be variables or functions, got {type(a)}")
 
         return arg_strs
 
     # generates argument string for a forward function call
     def format_forward_call_args(adj, args, use_initializer_list):
-        arg_str = ", ".join(adj.format_args("var_", args))
+        arg_str = ", ".join(adj.format_args("var", args))
         if use_initializer_list:
-            return "{{{}}}".format(arg_str)
+            return f"{{{arg_str}}}"
         return arg_str
 
     # generates argument string for a reverse function call
-    def format_reverse_call_args(adj, args, args_out, non_adjoint_args, non_adjoint_outputs, use_initializer_list):
-        formatted_var = adj.format_args("var_", args)
+    def format_reverse_call_args(
+        adj,
+        args_var,
+        args,
+        args_out,
+        use_initializer_list,
+        has_output_args=True,
+        require_original_output_arg=False,
+    ):
+        formatted_var = adj.format_args("var", args_var)
         formatted_out = []
-        if len(args_out) > 1:
-            formatted_out = adj.format_args("var_", args_out)
+        if has_output_args and (require_original_output_arg or len(args_out) > 1):
+            formatted_out = adj.format_args("var", args_out)
         formatted_var_adj = adj.format_args(
-            "&adj_" if use_initializer_list else "adj_", [a for i, a in enumerate(args) if i not in non_adjoint_args]
+            "&adj" if use_initializer_list else "adj",
+            args,
         )
-        formatted_out_adj = adj.format_args("adj_", [a for i, a in enumerate(args_out) if i not in non_adjoint_outputs])
+        formatted_out_adj = adj.format_args("adj", args_out)
 
         if len(formatted_var_adj) == 0 and len(formatted_out_adj) == 0:
             # there are no adjoint arguments, so we don't need to call the reverse function
             return None
 
         if use_initializer_list:
-            var_str = "{{{}}}".format(", ".join(formatted_var))
-            out_str = "{{{}}}".format(", ".join(formatted_out))
-            adj_str = "{{{}}}".format(", ".join(formatted_var_adj))
+            var_str = f"{{{', '.join(formatted_var)}}}"
+            out_str = f"{{{', '.join(formatted_out)}}}"
+            adj_str = f"{{{', '.join(formatted_var_adj)}}}"
             out_adj_str = ", ".join(formatted_out_adj)
             if len(args_out) > 1:
                 arg_str = ", ".join([var_str, out_str, adj_str, out_adj_str])
@@ -463,10 +719,10 @@
         return arg_str
 
     def indent(adj):
-        adj.prefix = adj.prefix + "\t"
+        adj.indentation = adj.indentation + "    "
 
     def dedent(adj):
-        adj.prefix = adj.prefix[0:-1]
+        adj.indentation = adj.indentation[:-4]
 
     def begin_block(adj):
         b = Block()
@@ -481,10 +737,9 @@
     def end_block(adj):
         return adj.blocks.pop()
 
-    def add_var(adj, type=None, constant=None, name=None):
-        if name is None:
-            index = len(adj.variables)
-            name = str(index)
+    def add_var(adj, type=None, constant=None):
+        index = len(adj.variables)
+        name = str(index)
 
         # allocate new variable
         v = Var(name, type=type, constant=constant)
@@ -497,30 +752,54 @@
 
     # append a statement to the forward pass
     def add_forward(adj, statement, replay=None, skip_replay=False):
-        adj.blocks[-1].body_forward.append(adj.prefix + statement)
+        adj.blocks[-1].body_forward.append(adj.indentation + statement)
 
         if not skip_replay:
             if replay:
                 # if custom replay specified then output it
-                adj.blocks[-1].body_replay.append(adj.prefix + replay)
+                adj.blocks[-1].body_replay.append(adj.indentation + replay)
             else:
                 # by default just replay the original statement
-                adj.blocks[-1].body_replay.append(adj.prefix + statement)
+                adj.blocks[-1].body_replay.append(adj.indentation + statement)
 
     # append a statement to the reverse pass
     def add_reverse(adj, statement):
-        adj.blocks[-1].body_reverse.append(adj.prefix + statement)
+        adj.blocks[-1].body_reverse.append(adj.indentation + statement)
 
     def add_constant(adj, n):
         output = adj.add_var(type=type(n), constant=n)
         return output
 
+    def load(adj, var):
+        if is_reference(var.type):
+            var = adj.add_builtin_call("load", [var])
+        return var
+
     def add_comp(adj, op_strings, left, comps):
-        output = adj.add_var(bool)
+        output = adj.add_var(builtins.bool)
+
+        left = adj.load(left)
+        s = output.emit() + " = " + ("(" * len(comps)) + left.emit() + " "
+
+        prev_comp = None
 
-        s = "var_" + str(output) + " = " + ("(" * len(comps)) + "var_" + str(left) + " "
         for op, comp in zip(op_strings, comps):
-            s += op + " var_" + str(comp) + ") "
+            comp_chainable = op_str_is_chainable(op)
+            if comp_chainable and prev_comp:
+                # We restrict chaining to operands of the same type
+                if prev_comp.type is comp.type:
+                    prev_comp = adj.load(prev_comp)
+                    comp = adj.load(comp)
+                    s += "&& (" + prev_comp.emit() + " " + op + " " + comp.emit() + ")) "
+                else:
+                    raise WarpCodegenTypeError(
+                        f"Cannot chain comparisons of unequal types: {prev_comp.type} {op} {comp.type}."
+                    )
+            else:
+                comp = adj.load(comp)
+                s += op + " " + comp.emit() + ") "
+
+            prev_comp = comp
 
         s = s.rstrip() + ";"
 
@@ -529,109 +808,106 @@ class Adjoint:
         return output
 
     def add_bool_op(adj, op_string, exprs):
-        output = adj.add_var(bool)
-        command = (
-            "var_" + str(output) + " = " + (" " + op_string + " ").join(["var_" + str(expr) for expr in exprs]) + ";"
-        )
+        exprs = [adj.load(expr) for expr in exprs]
+        output = adj.add_var(builtins.bool)
+        command = output.emit() + " = " + (" " + op_string + " ").join([expr.emit() for expr in exprs]) + ";"
         adj.add_forward(command)
 
         return output
 
-    def add_call(adj, func, args, min_outputs=None, templates=[], kwds=None):
-        # if func is overloaded then perform overload resolution here
-        # we validate argument types before they go to generated native code
-        resolved_func = None
+    def resolve_func(adj, func, args, min_outputs, templates, kwds):
+        arg_types = [strip_reference(a.type) for a in args if not isinstance(a, warp.context.Function)]
 
-        if func.is_builtin():
+        if not func.is_builtin():
+            # user-defined function
+            overload = func.get_overload(arg_types)
+            if overload is not None:
+                return overload
+        else:
+            # if func is overloaded then perform overload resolution here
+            # we validate argument types before they go to generated native code
             for f in func.overloads:
-                match = True
-
                 # skip type checking for variadic functions
                 if not f.variadic:
                     # check argument counts match are compatible (may be some default args)
                     if len(f.input_types) < len(args):
-                        match = False
                         continue
 
-                    # check argument types equal
-                    for i, (arg_name, arg_type) in enumerate(f.input_types.items()):
-                        # if arg type registered as Any, treat as
-                        # template allowing any type to match
-                        if arg_type == Any:
-                            continue
-
-                        # handle function refs as a special case
-                        if arg_type == Callable and type(args[i]) is warp.context.Function:
-                            continue
-
-                        # look for default values for missing args
-                        if i >= len(args):
-                            if arg_name not in f.defaults:
-                                match = False
-                                break
-                        else:
-                            # otherwise check arg type matches input variable type
-                            if not types_equal(arg_type, args[i].type, match_generic=True):
-                                match = False
-                                break
+                    def match_args(args, f):
+                        # check argument types equal
+                        for i, (arg_name, arg_type) in enumerate(f.input_types.items()):
+                            # if arg type registered as Any, treat as
+                            # template allowing any type to match
+                            if arg_type == Any:
+                                continue
+
+                            # handle function refs as a special case
+                            if arg_type == Callable and type(args[i]) is warp.context.Function:
+                                continue
+
+                            if arg_type == Reference and is_reference(args[i].type):
+                                continue
+
+                            # look for default values for missing args
+                            if i >= len(args):
+                                if arg_name not in f.defaults:
+                                    return False
+                            else:
+                                # otherwise check arg type matches input variable type
+                                if not types_equal(arg_type, strip_reference(args[i].type), match_generic=True):
+                                    return False
+
+                        return True
+
+                    if not match_args(args, f):
+                        continue
 
                 # check output dimensions match expectations
                 if min_outputs:
                     try:
                         value_type = f.value_func(args, kwds, templates)
-                        if len(value_type) != min_outputs:
-                            match = False
+                        if not hasattr(value_type, "__len__") or len(value_type) != min_outputs:
                             continue
                     except Exception:
                         # value func may fail if the user has given
                         # incorrect args, so we need to catch this
-                        match = False
                         continue
 
                 # found a match, use it
-                if match:
-                    resolved_func = f
-                    break
-        else:
-            # user-defined function
-            arg_types = [a.type for a in args]
-            resolved_func = func.get_overload(arg_types)
-
-        if resolved_func is None:
-            arg_types = []
-
-            for x in args:
-                if isinstance(x, Var):
-                    # shorten Warp primitive type names
-                    if isinstance(x.type, list):
-                        if len(x.type) != 1:
-                            raise Exception("Argument must not be the result from a multi-valued function")
-                        arg_type = x.type[0]
-                    else:
-                        arg_type = x.type
-                    if arg_type.__module__ == "warp.types":
-                        arg_types.append(arg_type.__name__)
-                    else:
-                        arg_types.append(arg_type.__module__ + "." + arg_type.__name__)
-
-                if isinstance(x, warp.context.Function):
-                    arg_types.append("function")
-
-            raise Exception(
-                f"Couldn't find function overload for '{func.key}' that matched inputs with types: [{', '.join(arg_types)}]"
-            )
+                return f
+
+        # unresolved function, report error
+        arg_types = []
+
+        for x in args:
+            if isinstance(x, Var):
+                # shorten Warp primitive type names
+                if isinstance(x.type, list):
+                    if len(x.type) != 1:
+                        raise WarpCodegenError("Argument must not be the result from a multi-valued function")
+                    arg_type = x.type[0]
+                else:
+                    arg_type = x.type
 
-        else:
-            func = resolved_func
+                arg_types.append(type_repr(arg_type))
+
+            if isinstance(x, warp.context.Function):
+                arg_types.append("function")
+
+        raise WarpCodegenError(
+            f"Couldn't find function overload for '{func.key}' that matched inputs with types: [{', '.join(arg_types)}]"
+        )
+
+    def add_call(adj, func, args, min_outputs=None, templates=[], kwds=None):
+        func = adj.resolve_func(func, args, min_outputs, templates, kwds)
 
         # push any default values onto args
         for i, (arg_name, arg_type) in enumerate(func.input_types.items()):
             if i >= len(args):
-                if arg_name in f.defaults:
+                if arg_name in func.defaults:
                     const = adj.add_constant(func.defaults[arg_name])
                     args.append(const)
                 else:
-                    match = False
                     break
 
         # if it is a user-function then build it recursively
@@ -639,93 +915,105 @@ class Adjoint:
639
915
  adj.builder.build_function(func)
640
916
 
641
917
  # evaluate the function type based on inputs
642
- value_type = func.value_func(args, kwds, templates)
918
+ arg_types = [strip_reference(a.type) for a in args if not isinstance(a, warp.context.Function)]
919
+ return_type = func.value_func(arg_types, kwds, templates)
643
920
 
644
921
  func_name = compute_type_str(func.native_func, templates)
922
+ param_types = list(func.input_types.values())
645
923
 
646
924
  use_initializer_list = func.initializer_list_func(args, templates)
647
925
 
648
- if value_type is None:
649
- # handles expression (zero output) functions, e.g.: void do_something();
650
-
651
- forward_call = "{}{}({});".format(
652
- func.namespace, func_name, adj.format_forward_call_args(args, use_initializer_list)
653
- )
654
- if func.skip_replay:
655
- adj.add_forward(forward_call, replay="//" + forward_call)
656
- else:
657
- adj.add_forward(forward_call)
658
-
659
- if not func.missing_grad and len(args):
660
- arg_str = adj.format_reverse_call_args(args, [], {}, {}, use_initializer_list)
661
- if arg_str is not None:
662
- reverse_call = "{}adj_{}({});".format(func.namespace, func.native_func, arg_str)
663
- adj.add_reverse(reverse_call)
926
+ args_var = [
927
+ adj.load(a)
928
+ if not ((param_types[i] == Reference or param_types[i] == Callable) if i < len(param_types) else False)
929
+ else a
930
+ for i, a in enumerate(args)
931
+ ]
664
932
 
665
- return None
933
+ if return_type is None:
934
+ # handles expression (zero output) functions, e.g.: void do_something();
666
935
 
667
- elif not isinstance(value_type, list) or len(value_type) == 1:
668
- # handle simple function (one output)
936
+ output = None
937
+ output_list = []
669
938
 
670
- if isinstance(value_type, list):
671
- value_type = value_type[0]
672
- output = adj.add_var(value_type)
673
- forward_call = "var_{} = {}{}({});".format(
674
- output, func.namespace, func_name, adj.format_forward_call_args(args, use_initializer_list)
939
+ forward_call = (
940
+ f"{func.namespace}{func_name}({adj.format_forward_call_args(args_var, use_initializer_list)});"
675
941
  )
942
+ replay_call = forward_call
943
+ if func.custom_replay_func is not None:
944
+ replay_call = f"{func.namespace}replay_{func_name}({adj.format_forward_call_args(args_var, use_initializer_list)});"
676
945
 
677
- if func.skip_replay:
678
- adj.add_forward(forward_call, replay="//" + forward_call)
679
- else:
680
- adj.add_forward(forward_call)
946
+ elif not isinstance(return_type, list) or len(return_type) == 1:
947
+ # handle simple function (one output)
681
948
 
682
- if not func.missing_grad and len(args):
683
- arg_str = adj.format_reverse_call_args(args, [output], {}, {}, use_initializer_list)
684
- if arg_str is not None:
685
- reverse_call = "{}adj_{}({});".format(func.namespace, func.native_func, arg_str)
686
- adj.add_reverse(reverse_call)
949
+ if isinstance(return_type, list):
950
+ return_type = return_type[0]
951
+ output = adj.add_var(return_type)
952
+ output_list = [output]
687
953
 
688
- return output
954
+ forward_call = f"var_{output} = {func.namespace}{func_name}({adj.format_forward_call_args(args_var, use_initializer_list)});"
955
+ replay_call = forward_call
956
+ if func.custom_replay_func is not None:
957
+ replay_call = f"var_{output} = {func.namespace}replay_{func_name}({adj.format_forward_call_args(args_var, use_initializer_list)});"
689
958
 
690
959
  else:
691
960
  # handle multiple value functions
692
961
 
693
- output = [adj.add_var(v) for v in value_type]
694
- forward_call = "{}{}({});".format(
695
- func.namespace, func_name, adj.format_forward_call_args(args + output, use_initializer_list)
962
+ output = [adj.add_var(v) for v in return_type]
963
+ output_list = output
964
+
965
+ forward_call = (
966
+ f"{func.namespace}{func_name}({adj.format_forward_call_args(args_var + output, use_initializer_list)});"
696
967
  )
697
- adj.add_forward(forward_call)
968
+ replay_call = forward_call
698
969
 
699
- if not func.missing_grad and len(args):
700
- arg_str = adj.format_reverse_call_args(args, output, {}, {}, use_initializer_list)
701
- if arg_str is not None:
702
- reverse_call = "{}adj_{}({});".format(func.namespace, func.native_func, arg_str)
703
- adj.add_reverse(reverse_call)
970
+ if func.skip_replay:
971
+ adj.add_forward(forward_call, replay="// " + replay_call)
972
+ else:
973
+ adj.add_forward(forward_call, replay=replay_call)
974
+
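The replay argument recorded above is what the backward pass re-executes to rebuild intermediate values before the adjoint statements run. A minimal sketch of the three cases handled here, with a purely illustrative statement string:

    # given: call = "var_3 = wp::noise(var_1);"     (hypothetical generated line)
    adj.add_forward(call, replay="// " + call)      # skip_replay: not re-executed
    adj.add_forward(call, replay=replay_call)       # custom_replay_func: replay_ variant
    adj.add_forward(call, replay=call)              # default: replayed verbatim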
975
+ if not func.missing_grad and len(args):
976
+ reverse_has_output_args = (
977
+ func.require_original_output_arg or len(output_list) > 1
978
+ ) and func.custom_grad_func is None
979
+ arg_str = adj.format_reverse_call_args(
980
+ args_var,
981
+ args,
982
+ output_list,
983
+ use_initializer_list,
984
+ has_output_args=reverse_has_output_args,
985
+ require_original_output_arg=func.require_original_output_arg,
986
+ )
987
+ if arg_str is not None:
988
+ reverse_call = f"{func.namespace}adj_{func.native_func}({arg_str});"
989
+ adj.add_reverse(reverse_call)
704
990
 
705
- if len(output) == 1:
706
- return output[0]
991
+ return output
707
992
 
708
- return output
993
+ def add_builtin_call(adj, func_name, args, min_outputs=None, templates=[], kwds=None):
994
+ func = warp.context.builtin_functions[func_name]
995
+ return adj.add_call(func, args, min_outputs, templates, kwds)
709
996
 
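The new add_builtin_call helper folds the repeated dictionary lookup into one place; the before/after pattern, taken from the surrounding hunks:

    # before: look up the builtin by hand, then emit the call
    out = adj.add_call(warp.context.builtin_functions["select"], [cond, var1, var2])

    # after: the lookup is part of the helper
    out = adj.add_builtin_call("select", [cond, var1, var2])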
710
997
  def add_return(adj, var):
711
998
  if var is None or len(var) == 0:
712
- adj.add_forward("return;", "goto label{};".format(adj.label_count))
999
+ adj.add_forward("return;", f"goto label{adj.label_count};")
713
1000
  elif len(var) == 1:
714
- adj.add_forward("return var_{};".format(var[0]), "goto label{};".format(adj.label_count))
1001
+ adj.add_forward(f"return {var[0].emit()};", f"goto label{adj.label_count};")
715
1002
  adj.add_reverse("adj_" + str(var[0]) + " += adj_ret;")
716
1003
  else:
717
1004
  for i, v in enumerate(var):
718
- adj.add_forward("ret_{} = var_{};".format(i, v))
719
- adj.add_reverse("adj_{} += adj_ret_{};".format(v, i))
720
- adj.add_forward("return;", "goto label{};".format(adj.label_count))
1005
+ adj.add_forward(f"ret_{i} = {v.emit()};")
1006
+ adj.add_reverse(f"adj_{v} += adj_ret_{i};")
1007
+ adj.add_forward("return;", f"goto label{adj.label_count};")
721
1008
 
722
- adj.add_reverse("label{}:;".format(adj.label_count))
1009
+ adj.add_reverse(f"label{adj.label_count}:;")
723
1010
 
724
1011
  adj.label_count += 1
725
1012
 
726
1013
  # define an if statement
727
1014
  def begin_if(adj, cond):
728
- adj.add_forward("if (var_{}) {{".format(cond))
1015
+ cond = adj.load(cond)
1016
+ adj.add_forward(f"if ({cond.emit()}) {{")
729
1017
  adj.add_reverse("}")
730
1018
 
731
1019
  adj.indent()
@@ -734,10 +1022,12 @@ class Adjoint:
734
1022
  adj.dedent()
735
1023
 
736
1024
  adj.add_forward("}")
737
- adj.add_reverse(f"if (var_{cond}) {{")
1025
+ cond = adj.load(cond)
1026
+ adj.add_reverse(f"if ({cond.emit()}) {{")
738
1027
 
739
1028
  def begin_else(adj, cond):
740
- adj.add_forward(f"if (!var_{cond}) {{")
1029
+ cond = adj.load(cond)
1030
+ adj.add_forward(f"if (!{cond.emit()}) {{")
741
1031
  adj.add_reverse("}")
742
1032
 
743
1033
  adj.indent()
@@ -746,7 +1036,8 @@ class Adjoint:
746
1036
  adj.dedent()
747
1037
 
748
1038
  adj.add_forward("}")
749
- adj.add_reverse(f"if (!var_{cond}) {{")
1039
+ cond = adj.load(cond)
1040
+ adj.add_reverse(f"if (!{cond.emit()}) {{")
750
1041
 
751
1042
  # define a for-loop
752
1043
  def begin_for(adj, iter):
@@ -756,10 +1047,10 @@ class Adjoint:
756
1047
  adj.indent()
757
1048
 
758
1049
  # evaluate cond
759
- adj.add_forward(f"if (iter_cmp(var_{iter}) == 0) goto for_end_{cond_block.label};")
1050
+ adj.add_forward(f"if (iter_cmp({iter.emit()}) == 0) goto for_end_{cond_block.label};")
760
1051
 
761
1052
  # evaluate iter
762
- val = adj.add_call(warp.context.builtin_functions["iter_next"], [iter])
1053
+ val = adj.add_builtin_call("iter_next", [iter])
763
1054
 
764
1055
  adj.begin_block()
765
1056
 
@@ -790,17 +1081,14 @@ class Adjoint:
790
1081
  reverse = []
791
1082
 
792
1083
  # reverse iterator
793
- reverse.append(adj.prefix + f"var_{iter} = wp::iter_reverse(var_{iter});")
1084
+ reverse.append(adj.indentation + f"{iter.emit()} = wp::iter_reverse({iter.emit()});")
794
1085
 
795
1086
  for i in cond_block.body_forward:
796
1087
  reverse.append(i)
797
1088
 
798
1089
  # zero adjoints
799
1090
  for i in body_block.vars:
800
- if isinstance(i.type, Struct):
801
- reverse.append(adj.prefix + f"\tadj_{i} = {i.ctype()}{{}};")
802
- else:
803
- reverse.append(adj.prefix + f"\tadj_{i} = {i.ctype()}(0);")
1091
+ reverse.append(adj.indentation + f"\t{i.emit_adj()} = {{}};")
804
1092
 
805
1093
  # replay
806
1094
  for i in body_block.body_replay:
@@ -810,14 +1098,14 @@ class Adjoint:
810
1098
  for i in reversed(body_block.body_reverse):
811
1099
  reverse.append(i)
812
1100
 
813
- reverse.append(adj.prefix + f"\tgoto for_start_{cond_block.label};")
814
- reverse.append(adj.prefix + f"for_end_{cond_block.label}:;")
1101
+ reverse.append(adj.indentation + f"\tgoto for_start_{cond_block.label};")
1102
+ reverse.append(adj.indentation + f"for_end_{cond_block.label}:;")
815
1103
 
816
1104
  adj.blocks[-1].body_reverse.extend(reversed(reverse))
817
1105
 
818
1106
  # define a while loop
819
1107
  def begin_while(adj, cond):
820
- # evaulate condition in its own block
1108
+ # evaluate condition in its own block
821
1109
  # so we can control replay
822
1110
  cond_block = adj.begin_block()
823
1111
  adj.loop_blocks.append(cond_block)
@@ -825,7 +1113,7 @@ class Adjoint:
825
1113
 
826
1114
  c = adj.eval(cond)
827
1115
 
828
- cond_block.body_forward.append(f"if ((var_{c}) == false) goto while_end_{cond_block.label};")
1116
+ cond_block.body_forward.append(f"if (({c.emit()}) == false) goto while_end_{cond_block.label};")
829
1117
 
830
1118
  # begin block around loop
831
1119
  adj.begin_block()
@@ -859,10 +1147,7 @@ class Adjoint:
859
1147
 
860
1148
  # zero adjoints of local vars
861
1149
  for i in body_block.vars:
862
- if isinstance(i.type, Struct):
863
- reverse.append(f"adj_{i} = {i.ctype()}{{}};")
864
- else:
865
- reverse.append(f"adj_{i} = {i.ctype()}(0);")
1150
+ reverse.append(f"{i.emit_adj()} = {{}};")
866
1151
 
867
1152
  # replay
868
1153
  for i in body_block.body_replay:
@@ -882,6 +1167,10 @@ class Adjoint:
882
1167
  for f in node.body:
883
1168
  adj.eval(f)
884
1169
 
1170
+ if adj.return_var is not None and len(adj.return_var) == 1:
1171
+ if not isinstance(node.body[-1], ast.Return):
1172
+ adj.add_forward("return {};", skip_replay=True)
1173
+
885
1174
  def emit_If(adj, node):
886
1175
  if len(node.body) == 0:
887
1176
  return None
@@ -909,7 +1198,7 @@ class Adjoint:
909
1198
 
910
1199
  if var1 != var2:
911
1200
  # insert a phi function that selects var1, var2 based on cond
912
- out = adj.add_call(warp.context.builtin_functions["select"], [cond, var1, var2])
1201
+ out = adj.add_builtin_call("select", [cond, var1, var2])
913
1202
  adj.symbols[sym] = out
914
1203
 
915
1204
  symbols_prev = adj.symbols.copy()
@@ -933,7 +1222,7 @@ class Adjoint:
933
1222
  if var1 != var2:
934
1223
  # insert a phi function that selects var1, var2 based on cond
935
1224
  # note the reversed order of vars since we want to use !cond as our select
936
- out = adj.add_call(warp.context.builtin_functions["select"], [cond, var2, var1])
1225
+ out = adj.add_builtin_call("select", [cond, var2, var1])
937
1226
  adj.symbols[sym] = out
938
1227
 
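Both branches funnel conflicting definitions through the select() builtin, which acts as a phi node. A small kernel sketch that exercises this path (assumed @wp.kernel usage; only the branch assignment matters):

    import warp as wp

    @wp.kernel
    def branchy(a: wp.array(dtype=float)):
        x = 0.0
        if a[0] > 1.0:
            x = 2.0      # conflicting definition of x inside the branch
        # codegen merges the two x's here via select(cond, x_outer, x_inner)
        a[0] = x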
939
1228
  def emit_Compare(adj, node):
@@ -955,7 +1244,7 @@ class Adjoint:
955
1244
  elif isinstance(op, ast.Or):
956
1245
  func = "||"
957
1246
  else:
958
- raise KeyError("Op {} is not supported".format(op))
1247
+ raise WarpCodegenKeyError(f"Op {op} is not supported")
959
1248
 
960
1249
  return adj.add_bool_op(func, [adj.eval(expr) for expr in node.values])
961
1250
 
@@ -975,7 +1264,7 @@ class Adjoint:
975
1264
  obj = capturedvars.get(str(node.id), None)
976
1265
 
977
1266
  if obj is None:
978
- raise KeyError("Referencing undefined symbol: " + str(node.id))
1267
+ raise WarpCodegenKeyError("Referencing undefined symbol: " + str(node.id))
979
1268
 
980
1269
  if warp.types.is_value(obj):
981
1270
  # evaluate constant
@@ -987,26 +1276,96 @@ class Adjoint:
987
1276
  # pass it back to the caller for processing
988
1277
  return obj
989
1278
 
1279
+ @staticmethod
1280
+ def resolve_type_attribute(var_type: type, attr: str):
1281
+ if isinstance(var_type, type) and type_is_value(var_type):
1282
+ if attr == "dtype":
1283
+ return type_scalar_type(var_type)
1284
+ elif attr == "length":
1285
+ return type_length(var_type)
1286
+
1287
+ return getattr(var_type, attr, None)
1288
+
1289
+ def vector_component_index(adj, component, vector_type):
1290
+ if len(component) != 1:
1291
+ raise WarpCodegenAttributeError(f"Vector swizzle must be single character, got .{component}")
1292
+
1293
+ dim = vector_type._shape_[0]
1294
+ swizzles = "xyzw"[0:dim]
1295
+ if component not in swizzles:
1296
+ raise WarpCodegenAttributeError(
1297
+ f"Vector swizzle for {vector_type} must be one of {swizzles}, got {component}"
1298
+ )
1299
+
1300
+ index = swizzles.index(component)
1301
+ index = adj.add_constant(index)
1302
+ return index
1303
+
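A standalone sketch of the mapping implemented above: one character of "xyzw", bounded by the vector's dimension, becomes a constant index (the helper name here is hypothetical):

    def component_index(component: str, dim: int) -> int:
        swizzles = "xyzw"[0:dim]                  # e.g. "xyz" for a vec3
        if len(component) != 1 or component not in swizzles:
            raise AttributeError(f"bad swizzle .{component} for dim {dim}")
        return swizzles.index(component)

    assert component_index("y", 3) == 1           # v.y -> extract(v, 1)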
1304
+ @staticmethod
1305
+ def is_differentiable_value_type(var_type):
1306
+ # checks that the argument type is a value type (i.e., not an array)
1307
+ # possibly holding differentiable values (for which gradients must be accumulated)
1308
+ return type_scalar_type(var_type) in float_types or isinstance(var_type, Struct)
1309
+
990
1310
  def emit_Attribute(adj, node):
991
- try:
992
- val = adj.eval(node.value)
1311
+ if hasattr(node, "is_adjoint"):
1312
+ node.value.is_adjoint = True
1313
+
1314
+ aggregate = adj.eval(node.value)
993
1315
 
994
- if isinstance(val, types.ModuleType) or isinstance(val, type):
995
- out = getattr(val, node.attr)
1316
+ try:
1317
+ if isinstance(aggregate, types.ModuleType) or isinstance(aggregate, type):
1318
+ out = getattr(aggregate, node.attr)
996
1319
 
997
1320
  if warp.types.is_value(out):
998
1321
  return adj.add_constant(out)
999
1322
 
1000
1323
  return out
1001
1324
 
1002
- # create a Var that points to the struct attribute, i.e.: directly generates `struct.attr` when used
1003
- attr_name = val.label + "." + node.attr
1004
- attr_type = val.type.vars[node.attr].type
1325
+ if hasattr(node, "is_adjoint"):
1326
+ # create a Var that points to the struct attribute, i.e.: directly generates `struct.attr` when used
1327
+ attr_name = aggregate.label + "." + node.attr
1328
+ attr_type = aggregate.type.vars[node.attr].type
1329
+
1330
+ return Var(attr_name, attr_type)
1331
+
1332
+ aggregate_type = strip_reference(aggregate.type)
1333
+
1334
+ # reading a vector component
1335
+ if type_is_vector(aggregate_type):
1336
+ index = adj.vector_component_index(node.attr, aggregate_type)
1337
+
1338
+ return adj.add_builtin_call("extract", [aggregate, index])
1339
+
1340
+ else:
1341
+ attr_type = Reference(aggregate_type.vars[node.attr].type)
1342
+ attr = adj.add_var(attr_type)
1343
+
1344
+ if is_reference(aggregate.type):
1345
+ adj.add_forward(f"{attr.emit()} = &({aggregate.emit()}->{node.attr});")
1346
+ else:
1347
+ adj.add_forward(f"{attr.emit()} = &({aggregate.emit()}.{node.attr});")
1348
+
1349
+ if adj.is_differentiable_value_type(strip_reference(attr_type)):
1350
+ adj.add_reverse(f"{aggregate.emit_adj()}.{node.attr} += {attr.emit_adj()};")
1351
+ else:
1352
+ adj.add_reverse(f"{aggregate.emit_adj()}.{node.attr} = {attr.emit_adj()};")
1353
+
1354
+ return attr
1355
+
1356
+ except (KeyError, AttributeError):
1357
+ # Try resolving as type attribute
1358
+ aggregate_type = strip_reference(aggregate.type) if isinstance(aggregate, Var) else aggregate
1005
1359
 
1006
- return Var(attr_name, attr_type)
1360
+ type_attribute = adj.resolve_type_attribute(aggregate_type, node.attr)
1361
+ if type_attribute is not None:
1362
+ return type_attribute
1007
1363
 
1008
- except KeyError:
1009
- raise RuntimeError(f"Error, `{node.attr}` is not an attribute of '{val.label}' ({val.type})")
1364
+ if isinstance(aggregate, Var):
1365
+ raise WarpCodegenAttributeError(
1366
+ f"Error, `{node.attr}` is not an attribute of '{node.value.id}' ({type_repr(aggregate.type)})"
1367
+ )
1368
+ raise WarpCodegenAttributeError(f"Error, `{node.attr}` is not an attribute of '{aggregate}'")
1010
1369
 
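The attribute paths above (module/type member, vector component, struct field reference) can all be reached from a small kernel; a sketch under assumed wp.struct usage:

    import warp as wp

    @wp.struct
    class Particle:
        pos: wp.vec3

    @wp.kernel
    def shift(ps: wp.array(dtype=Particle)):
        p = ps[0]
        y = p.pos.y                       # vector component read -> extract()
        p.pos = wp.vec3(0.0, y, 0.0)      # struct field write via the reference path
        ps[0] = p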
1011
1370
  def emit_String(adj, node):
1012
1371
  # string constant
@@ -1023,19 +1382,25 @@ class Adjoint:
1023
1382
  adj.symbols[key] = out
1024
1383
  return out
1025
1384
 
1385
+ def emit_Ellipsis(adj, node):
1386
+ # stubbed @wp.native_func
1387
+ return
1388
+
1026
1389
  def emit_NameConstant(adj, node):
1027
- if node.value == True:
1390
+ if node.value:
1028
1391
  return adj.add_constant(True)
1029
- elif node.value == False:
1030
- return adj.add_constant(False)
1031
1392
  elif node.value is None:
1032
- raise TypeError("None type unsupported")
1393
+ raise WarpCodegenTypeError("None type unsupported")
1394
+ else:
1395
+ return adj.add_constant(False)
1033
1396
 
1034
1397
  def emit_Constant(adj, node):
1035
1398
  if isinstance(node, ast.Str):
1036
1399
  return adj.emit_String(node)
1037
1400
  elif isinstance(node, ast.Num):
1038
1401
  return adj.emit_Num(node)
1402
+ elif isinstance(node, ast.Ellipsis):
1403
+ return adj.emit_Ellipsis(node)
1039
1404
  else:
1040
1405
  assert isinstance(node, ast.NameConstant)
1041
1406
  return adj.emit_NameConstant(node)
@@ -1046,18 +1411,16 @@ class Adjoint:
1046
1411
  right = adj.eval(node.right)
1047
1412
 
1048
1413
  name = builtin_operators[type(node.op)]
1049
- func = warp.context.builtin_functions[name]
1050
1414
 
1051
- return adj.add_call(func, [left, right])
1415
+ return adj.add_builtin_call(name, [left, right])
1052
1416
 
1053
1417
  def emit_UnaryOp(adj, node):
1054
1418
  # evaluate unary op arguments
1055
1419
  arg = adj.eval(node.operand)
1056
1420
 
1057
1421
  name = builtin_operators[type(node.op)]
1058
- func = warp.context.builtin_functions[name]
1059
1422
 
1060
- return adj.add_call(func, [arg])
1423
+ return adj.add_builtin_call(name, [arg])
1061
1424
 
1062
1425
  def materialize_redefinitions(adj, symbols):
1063
1426
  # detect symbols with conflicting definitions (assigned inside the for loop)
@@ -1067,21 +1430,19 @@ class Adjoint:
1067
1430
  var2 = adj.symbols[sym]
1068
1431
 
1069
1432
  if var1 != var2:
1070
- if warp.config.verbose:
1433
+ if warp.config.verbose and not adj.custom_reverse_mode:
1071
1434
  lineno = adj.lineno + adj.fun_lineno
1072
- line = adj.source.splitlines()[adj.lineno]
1073
- msg = f'Warning: detected mutated variable {sym} during a dynamic for-loop in function "{adj.fun_name}" at {adj.filename}:{lineno}: this is a non-differentiable operation.\n{line}\n'
1435
+ line = adj.source_lines[adj.lineno]
1436
+ msg = f'Warning: detected mutated variable {sym} during a dynamic for-loop in function "{adj.fun_name}" at {adj.filename}:{lineno}: this may not be a differentiable operation.\n{line}\n'
1074
1437
  print(msg)
1075
1438
 
1076
1439
  if var1.constant is not None:
1077
- raise Exception(
1078
- "Error mutating a constant {} inside a dynamic loop, use the following syntax: pi = float(3.141) to declare a dynamic variable".format(
1079
- sym
1080
- )
1440
+ raise WarpCodegenError(
1441
+ f"Error mutating a constant {sym} inside a dynamic loop, use the following syntax: pi = float(3.141) to declare a dynamic variable"
1081
1442
  )
1082
1443
 
1083
1444
  # overwrite the old variable value (violates SSA)
1084
- adj.add_call(warp.context.builtin_functions["copy"], [var1, var2])
1445
+ adj.add_builtin_call("assign", [var1, var2])
1085
1446
 
1086
1447
  # reset the symbol to point to the original variable
1087
1448
  adj.symbols[sym] = var1
@@ -1100,95 +1461,132 @@ class Adjoint:
1100
1461
 
1101
1462
  adj.end_while()
1102
1463
 
1103
- def is_num(adj, a):
1104
- # simple constant
1464
+ def eval_num(adj, a):
1105
1465
  if isinstance(a, ast.Num):
1106
- return True
1107
- # expression of form -constant
1108
- elif isinstance(a, ast.UnaryOp) and isinstance(a.op, ast.USub) and isinstance(a.operand, ast.Num):
1109
- return True
1110
- else:
1111
- # try and resolve the expression to an object
1112
- # e.g.: wp.constant in the globals scope
1113
- obj, path = adj.resolve_path(a)
1114
- if warp.types.is_int(obj):
1466
+ return True, a.n
1467
+ if isinstance(a, ast.UnaryOp) and isinstance(a.op, ast.USub) and isinstance(a.operand, ast.Num):
1468
+ return True, -a.operand.n
1469
+
1470
+ # try and resolve the expression to an object
1471
+ # e.g.: wp.constant in the globals scope
1472
+ obj, _ = adj.resolve_static_expression(a)
1473
+
1474
+ if isinstance(obj, Var) and obj.constant is not None:
1475
+ obj = obj.constant
1476
+
1477
+ return warp.types.is_int(obj), obj
1478
+
1479
+ # detects whether a loop contains a break (or continue) statement
1480
+ def contains_break(adj, body):
1481
+ for s in body:
1482
+ if isinstance(s, ast.Break):
1115
1483
  return True
1484
+ elif isinstance(s, ast.Continue):
1485
+ return True
1486
+ elif isinstance(s, ast.If):
1487
+ if adj.contains_break(s.body):
1488
+ return True
1489
+ if adj.contains_break(s.orelse):
1490
+ return True
1116
1491
  else:
1117
- return False
1492
+ # note that nested for or while loops containing a break statement
1493
+ # do not affect the current loop
1494
+ pass
1495
+
1496
+ return False
1497
+
1498
+ # returns a constant range() if unrollable, otherwise None
1499
+ def get_unroll_range(adj, loop):
1500
+ if (
1501
+ not isinstance(loop.iter, ast.Call)
1502
+ or not isinstance(loop.iter.func, ast.Name)
1503
+ or loop.iter.func.id != "range"
1504
+ or len(loop.iter.args) == 0
1505
+ or len(loop.iter.args) > 3
1506
+ ):
1507
+ return None
1118
1508
 
1119
- def eval_num(adj, a):
1120
- if isinstance(a, ast.Num):
1121
- return a.n
1122
- elif isinstance(a, ast.UnaryOp) and isinstance(a.op, ast.USub) and isinstance(a.operand, ast.Num):
1123
- return -a.operand.n
1124
- else:
1125
- # try and resolve the expression to an object
1126
- # e.g.: wp.constant in the globals scope
1127
- obj, path = adj.resolve_path(a)
1128
- if warp.types.is_int(obj):
1129
- return obj
1130
- else:
1131
- return False
1509
+ # if all range() arguments are numeric constants we will unroll
1510
+ # note that this only handles trivial constants, it will not unroll
1511
+ # constant compile-time expressions e.g.: range(0, 3*2)
1512
+
1513
+ # Evaluate the arguments and check that they are numeric constants
1514
+ # It is important to do this in one pass, so that if evaluating these arguments has side effects
1515
+ # the code does not get generated more than once
1516
+ range_args = [adj.eval_num(arg) for arg in loop.iter.args]
1517
+ arg_is_numeric, arg_values = zip(*range_args)
1518
+
1519
+ if all(arg_is_numeric):
1520
+ # All arguments are numeric constants
1521
+
1522
+ # range(end)
1523
+ if len(loop.iter.args) == 1:
1524
+ start = 0
1525
+ end = arg_values[0]
1526
+ step = 1
1527
+
1528
+ # range(start, end)
1529
+ elif len(loop.iter.args) == 2:
1530
+ start = arg_values[0]
1531
+ end = arg_values[1]
1532
+ step = 1
1533
+
1534
+ # range(start, end, step)
1535
+ elif len(loop.iter.args) == 3:
1536
+ start = arg_values[0]
1537
+ end = arg_values[1]
1538
+ step = arg_values[2]
1539
+
1540
+ # test if we're above max unroll count
1541
+ max_iters = abs(end - start) // abs(step)
1542
+ max_unroll = adj.builder.options["max_unroll"]
1543
+
1544
+ ok_to_unroll = True
1545
+
1546
+ if max_iters > max_unroll:
1547
+ if warp.config.verbose:
1548
+ print(
1549
+ f"Warning: fixed-size loop count of {max_iters} is larger than the module 'max_unroll' limit of {max_unroll}, will generate dynamic loop."
1550
+ )
1551
+ ok_to_unroll = False
1552
+
1553
+ elif adj.contains_break(loop.body):
1554
+ if warp.config.verbose:
1555
+ print("Warning: 'break' or 'continue' found in loop body, will generate dynamic loop.")
1556
+ ok_to_unroll = False
1557
+
1558
+ if ok_to_unroll:
1559
+ return range(start, end, step)
1560
+
1561
+ # Unroll is not possible, the range needs to be evaluated dynamically
1562
+ range_call = adj.add_builtin_call(
1563
+ "range",
1564
+ [adj.add_constant(val) if is_numeric else val for is_numeric, val in range_args],
1565
+ )
1566
+ return range_call
1132
1567
 
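Which loops unroll follows directly from the checks above; a kernel sketch (assuming the default max_unroll limit) showing the outcomes:

    import warp as wp

    N = wp.constant(4)                 # resolvable by eval_num

    @wp.kernel
    def loops(out: wp.array(dtype=float)):
        for i in range(3):             # literal bounds: unrolled
            out[i] = 1.0
        for i in range(N):             # wp.constant bound: unrolled
            out[i] = 2.0
        n = 3
        for i in range(n):             # runtime bound: dynamic loop
            out[i] = 3.0
        for i in range(3):             # contains break: dynamic loop
            if out[i] > 0.0:
                break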
1133
1568
  def emit_For(adj, node):
1134
1569
  # try and unroll simple range() statements that use constant args
1135
- unrolled = False
1136
-
1137
- if isinstance(node.iter, ast.Call) and node.iter.func.id == "range":
1138
- is_constant = True
1139
- for a in node.iter.args:
1140
- # if all range() arguments are numeric constants we will unroll
1141
- # note that this only handles trivial constants, it will not unroll
1142
- # constant compile-time expressions e.g.: range(0, 3*2)
1143
- if not adj.is_num(a):
1144
- is_constant = False
1145
- break
1146
-
1147
- if is_constant:
1148
- # range(end)
1149
- if len(node.iter.args) == 1:
1150
- start = 0
1151
- end = adj.eval_num(node.iter.args[0])
1152
- step = 1
1153
-
1154
- # range(start, end)
1155
- elif len(node.iter.args) == 2:
1156
- start = adj.eval_num(node.iter.args[0])
1157
- end = adj.eval_num(node.iter.args[1])
1158
- step = 1
1159
-
1160
- # range(start, end, step)
1161
- elif len(node.iter.args) == 3:
1162
- start = adj.eval_num(node.iter.args[0])
1163
- end = adj.eval_num(node.iter.args[1])
1164
- step = adj.eval_num(node.iter.args[2])
1165
-
1166
- # test if we're above max unroll count
1167
- max_iters = abs(end - start) // abs(step)
1168
- max_unroll = adj.builder.options["max_unroll"]
1169
-
1170
- if max_iters > max_unroll:
1171
- if warp.config.verbose:
1172
- print(
1173
- f"Warning: fixed-size loop count of {max_iters} is larger than the module 'max_unroll' limit of {max_unroll}, will generate dynamic loop."
1174
- )
1175
- else:
1176
- # unroll
1177
- for i in range(start, end, step):
1178
- const_iter = adj.add_constant(i)
1179
- var_iter = adj.add_call(warp.context.builtin_functions["int"], [const_iter])
1180
- adj.symbols[node.target.id] = var_iter
1570
+ unroll_range = adj.get_unroll_range(node)
1181
1571
 
1182
- # eval body
1183
- for s in node.body:
1184
- adj.eval(s)
1572
+ if isinstance(unroll_range, range):
1573
+ for i in unroll_range:
1574
+ const_iter = adj.add_constant(i)
1575
+ var_iter = adj.add_builtin_call("int", [const_iter])
1576
+ adj.symbols[node.target.id] = var_iter
1185
1577
 
1186
- unrolled = True
1578
+ # eval body
1579
+ for s in node.body:
1580
+ adj.eval(s)
1187
1581
 
1188
- # couldn't unroll so generate a dynamic loop
1189
- if not unrolled:
1190
- # evaluate the Iterable
1191
- iter = adj.eval(node.iter)
1582
+ # otherwise generate a dynamic loop
1583
+ else:
1584
+ # evaluate the Iterable -- only if not previously evaluated when trying to unroll
1585
+ if unroll_range is not None:
1586
+ # Range has already been evaluated when trying to unroll, do not re-evaluate
1587
+ iter = unroll_range
1588
+ else:
1589
+ iter = adj.eval(node.iter)
1192
1590
 
1193
1591
  adj.symbols[node.target.id] = adj.begin_for(iter)
1194
1592
 
@@ -1217,15 +1615,28 @@ class Adjoint:
1217
1615
  def emit_Expr(adj, node):
1218
1616
  return adj.eval(node.value)
1219
1617
 
1618
+ def check_tid_in_func_error(adj, node):
1619
+ if adj.is_user_function:
1620
+ if hasattr(node.func, "attr") and node.func.attr == "tid":
1621
+ lineno = adj.lineno + adj.fun_lineno
1622
+ line = adj.source_lines[adj.lineno]
1623
+ raise WarpCodegenError(
1624
+ "tid() may only be called from a Warp kernel, not a Warp function. "
1625
+ "Instead, obtain the indices from a @wp.kernel and pass them as "
1626
+ f"arguments to the function {adj.fun_name}, {adj.filename}:{lineno}:\n{line}\n"
1627
+ )
1628
+
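The pattern this new check rejects, and the supported alternative (assumed usage):

    import warp as wp

    # Rejected by the check above: calling tid() inside a @wp.func
    #   @wp.func
    #   def bad() -> int:
    #       return wp.tid()            # -> WarpCodegenError
    #
    # Supported: obtain the index in the kernel and pass it down.

    @wp.func
    def write(i: int, out: wp.array(dtype=int)):
        out[i] = i

    @wp.kernel
    def k(out: wp.array(dtype=int)):
        i = wp.tid()
        write(i, out)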
1220
1629
  def emit_Call(adj, node):
1630
+ adj.check_tid_in_func_error(node)
1631
+
1221
1632
  # try and lookup function in globals by
1222
1633
  # resolving path (e.g.: module.submodule.attr)
1223
- func, path = adj.resolve_path(node.func)
1634
+ func, path = adj.resolve_static_expression(node.func)
1224
1635
  templates = []
1225
1636
 
1226
- if isinstance(func, warp.context.Function) == False:
1637
+ if not isinstance(func, warp.context.Function):
1227
1638
  if len(path) == 0:
1228
- raise RuntimeError(f"Unrecognized syntax for function call, path not valid: '{node.func}'")
1639
+ raise WarpCodegenError(f"Unknown function or operator: '{node.func.func.id}'")
1229
1640
 
1230
1641
  attr = path[-1]
1231
1642
  caller = func
@@ -1250,7 +1661,7 @@ class Adjoint:
1250
1661
  func = caller.initializer()
1251
1662
 
1252
1663
  if func is None:
1253
- raise RuntimeError(
1664
+ raise WarpCodegenError(
1254
1665
  f"Could not find function {'.'.join(path)} as a built-in or user-defined function. Note that user functions must be annotated with a @wp.func decorator to be called from a kernel."
1255
1666
  )
1256
1667
 
@@ -1259,16 +1670,25 @@ class Adjoint:
1259
1670
  # eval all arguments
1260
1671
  for arg in node.args:
1261
1672
  var = adj.eval(arg)
1673
+ if not is_local_value(var):
1674
+ raise RuntimeError(
1675
+ "Cannot reference a global variable from a kernel unless `wp.constant()` is being used"
1676
+ )
1262
1677
  args.append(var)
1263
1678
 
1264
- # eval all keyword ags
1679
+ # eval all keyword args
1265
1680
  def kwval(kw):
1266
1681
  if isinstance(kw.value, ast.Num):
1267
1682
  return kw.value.n
1268
1683
  elif isinstance(kw.value, ast.Tuple):
1269
- return tuple(adj.eval_num(e) for e in kw.value.elts)
1684
+ arg_is_numeric, arg_values = zip(*(adj.eval_num(e) for e in kw.value.elts))
1685
+ if not all(arg_is_numeric):
1686
+ raise WarpCodegenError(
1687
+ f"All elements of the tuple keyword argument '{kw.name}' must be numeric constants, got '{arg_values}'"
1688
+ )
1689
+ return arg_values
1270
1690
  else:
1271
- return adj.resolve_path(kw.value)[0]
1691
+ return adj.resolve_static_expression(kw.value)[0]
1272
1692
 
1273
1693
  kwds = {kw.arg: kwval(kw) for kw in node.keywords}
1274
1694
 
@@ -1285,10 +1705,26 @@ class Adjoint:
1285
1705
  # the ast.Index node appears in 3.7 versions
1286
1706
  # when performing array slices, e.g.: x = arr[i]
1287
1707
  # but in version 3.8 and higher it does not appear
1708
+
1709
+ if hasattr(node, "is_adjoint"):
1710
+ node.value.is_adjoint = True
1711
+
1288
1712
  return adj.eval(node.value)
1289
1713
 
1290
1714
  def emit_Subscript(adj, node):
1715
+ if hasattr(node.value, "attr") and node.value.attr == "adjoint":
1716
+ # handle adjoint of a variable, i.e. wp.adjoint[var]
1717
+ node.slice.is_adjoint = True
1718
+ var = adj.eval(node.slice)
1719
+ var_name = var.label
1720
+ var = Var(f"adj_{var_name}", type=var.type, constant=None, prefix=False)
1721
+ return var
1722
+
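wp.adjoint[x] simply renames the variable to adj_x in the generated code; it is intended for custom gradient functions, a sketch assuming the custom-gradient decorator this diff wires up via custom_grad_func (wp.func_grad in the released API):

    import warp as wp

    @wp.func
    def square(x: float) -> float:
        return x * x

    @wp.func_grad(square)
    def adj_square(x: float, adj_ret: float):
        wp.adjoint[x] += 2.0 * x * adj_ret    # reads/writes adj_x directly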
1291
1723
  target = adj.eval(node.value)
1724
+ if not is_local_value(target):
1725
+ raise RuntimeError(
1726
+ "Cannot reference a global variable from a kernel unless `wp.constant()` is being used"
1727
+ )
1292
1728
 
1293
1729
  indices = []
1294
1730
 
@@ -1308,28 +1744,34 @@ class Adjoint:
1308
1744
  var = adj.eval(node.slice)
1309
1745
  indices.append(var)
1310
1746
 
1311
- if is_array(target.type):
1312
- if len(indices) == target.type.ndim:
1747
+ target_type = strip_reference(target.type)
1748
+ if is_array(target_type):
1749
+ if len(indices) == target_type.ndim:
1313
1750
  # handles array loads (where each dimension has an index specified)
1314
- out = adj.add_call(warp.context.builtin_functions["load"], [target, *indices])
1751
+ out = adj.add_builtin_call("address", [target, *indices])
1315
1752
  else:
1316
1753
  # handles array views (fewer indices than dimensions)
1317
- out = adj.add_call(warp.context.builtin_functions["view"], [target, *indices])
1754
+ out = adj.add_builtin_call("view", [target, *indices])
1318
1755
 
1319
1756
  else:
1320
1757
  # handles non-array type indexing, e.g: vec3, mat33, etc
1321
- out = adj.add_call(warp.context.builtin_functions["index"], [target, *indices])
1758
+ out = adj.add_builtin_call("extract", [target, *indices])
1322
1759
 
1323
1760
  return out
1324
1761
 
1325
1762
  def emit_Assign(adj, node):
1763
+ if len(node.targets) != 1:
1764
+ raise WarpCodegenError("Assigning the same value to multiple variables is not supported")
1765
+
1766
+ lhs = node.targets[0]
1767
+
1326
1768
  # handle the case where we are assigning multiple output variables
1327
- if isinstance(node.targets[0], ast.Tuple):
1769
+ if isinstance(lhs, ast.Tuple):
1328
1770
  # record the expected number of outputs on the node
1329
1771
  # we do this so we can decide which function to
1330
1772
  # call based on the number of expected outputs
1331
1773
  if isinstance(node.value, ast.Call):
1332
- node.value.expects = len(node.targets[0].elts)
1774
+ node.value.expects = len(lhs.elts)
1333
1775
 
1334
1776
  # evaluate values
1335
1777
  if isinstance(node.value, ast.Tuple):
@@ -1338,40 +1780,47 @@ class Adjoint:
1338
1780
  out = adj.eval(node.value)
1339
1781
 
1340
1782
  names = []
1341
- for v in node.targets[0].elts:
1783
+ for v in lhs.elts:
1342
1784
  if isinstance(v, ast.Name):
1343
1785
  names.append(v.id)
1344
1786
  else:
1345
- raise RuntimeError(
1787
+ raise WarpCodegenError(
1346
1788
  "Multiple return functions can only assign to simple variables, e.g.: x, y = func()"
1347
1789
  )
1348
1790
 
1349
1791
  if len(names) != len(out):
1350
- raise RuntimeError(
1351
- "Multiple return functions need to receive all their output values, incorrect number of values to unpack (expected {}, got {})".format(
1352
- len(out), len(names)
1353
- )
1792
+ raise WarpCodegenError(
1793
+ f"Multiple return functions need to receive all their output values, incorrect number of values to unpack (expected {len(out)}, got {len(names)})"
1354
1794
  )
1355
1795
 
1356
1796
  for name, rhs in zip(names, out):
1357
1797
  if name in adj.symbols:
1358
1798
  if not types_equal(rhs.type, adj.symbols[name].type):
1359
- raise TypeError(
1360
- "Error, assigning to existing symbol {} ({}) with different type ({})".format(
1361
- name, adj.symbols[name].type, rhs.type
1362
- )
1799
+ raise WarpCodegenTypeError(
1800
+ f"Error, assigning to existing symbol {name} ({adj.symbols[name].type}) with different type ({rhs.type})"
1363
1801
  )
1364
1802
 
1365
1803
  adj.symbols[name] = rhs
1366
1804
 
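These checks guard multiple-return unpacking; for example, the two-output form of tid() in a 2D launch must be received in full, by simple names:

    import warp as wp

    @wp.kernel
    def fill(out: wp.array2d(dtype=float)):
        i, j = wp.tid()       # both outputs must be unpacked to plain variables
        out[i, j] = 1.0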
1367
- return out
1368
-
1369
1805
  # handles the case where we are assigning to an array index (e.g.: arr[i] = 2.0)
1370
- elif isinstance(node.targets[0], ast.Subscript):
1371
- target = adj.eval(node.targets[0].value)
1806
+ elif isinstance(lhs, ast.Subscript):
1807
+ if hasattr(lhs.value, "attr") and lhs.value.attr == "adjoint":
1808
+ # handle adjoint of a variable, i.e. wp.adjoint[var]
1809
+ lhs.slice.is_adjoint = True
1810
+ src_var = adj.eval(lhs.slice)
1811
+ var = Var(f"adj_{src_var.label}", type=src_var.type, constant=None, prefix=False)
1812
+ value = adj.eval(node.value)
1813
+ adj.add_forward(f"{var.emit()} = {value.emit()};")
1814
+ return
1815
+
1816
+ target = adj.eval(lhs.value)
1372
1817
  value = adj.eval(node.value)
1818
+ if not is_local_value(value):
1819
+ raise RuntimeError(
1820
+ "Cannot reference a global variable from a kernel unless `wp.constant()` is being used"
1821
+ )
1373
1822
 
1374
- slice = node.targets[0].slice
1823
+ slice = lhs.slice
1375
1824
  indices = []
1376
1825
 
1377
1826
  if isinstance(slice, ast.Tuple):
@@ -1379,7 +1828,6 @@ class Adjoint:
1379
1828
  for arg in slice.elts:
1380
1829
  var = adj.eval(arg)
1381
1830
  indices.append(var)
1382
-
1383
1831
  elif isinstance(slice, ast.Index) and isinstance(slice.value, ast.Tuple):
1384
1832
  # handles the x[i, j] case (Python 3.7.x)
1385
1833
  for arg in slice.value.elts:
@@ -1390,64 +1838,84 @@ class Adjoint:
1390
1838
  var = adj.eval(slice)
1391
1839
  indices.append(var)
1392
1840
 
1393
- if is_array(target.type):
1394
- adj.add_call(warp.context.builtin_functions["store"], [target, *indices, value])
1841
+ target_type = strip_reference(target.type)
1395
1842
 
1396
- elif type_is_vector(target.type) or type_is_matrix(target.type):
1397
- adj.add_call(warp.context.builtin_functions["indexset"], [target, *indices, value])
1843
+ if is_array(target_type):
1844
+ adj.add_builtin_call("array_store", [target, *indices, value])
1398
1845
 
1399
- if warp.config.verbose:
1846
+ elif type_is_vector(target_type) or type_is_matrix(target_type):
1847
+ if is_reference(target.type):
1848
+ attr = adj.add_builtin_call("indexref", [target, *indices])
1849
+ else:
1850
+ attr = adj.add_builtin_call("index", [target, *indices])
1851
+
1852
+ adj.add_builtin_call("store", [attr, value])
1853
+
1854
+ if warp.config.verbose and not adj.custom_reverse_mode:
1400
1855
  lineno = adj.lineno + adj.fun_lineno
1401
- line = adj.source.splitlines()[adj.lineno]
1856
+ line = adj.source_lines[adj.lineno]
1857
+ node_source = adj.get_node_source(lhs.value)
1402
1858
  print(
1403
- f"Warning: mutating {node.targets[0].value.id} in function {adj.fun_name} at {adj.filename}:{lineno}: this is a non-differentiable operation.\n{line}\n"
1859
+ f"Warning: mutating {node_source} in function {adj.fun_name} at {adj.filename}:{lineno}: this is a non-differentiable operation.\n{line}\n"
1404
1860
  )
1405
1861
 
1406
1862
  else:
1407
- raise RuntimeError("Can only subscript assign array, vector, and matrix types")
1863
+ raise WarpCodegenError("Can only subscript assign array, vector, and matrix types")
1408
1864
 
1409
- return var
1410
-
1411
- elif isinstance(node.targets[0], ast.Name):
1865
+ elif isinstance(lhs, ast.Name):
1412
1866
  # symbol name
1413
- name = node.targets[0].id
1867
+ name = lhs.id
1414
1868
 
1415
1869
  # evaluate rhs
1416
1870
  rhs = adj.eval(node.value)
1417
1871
 
1418
1872
  # check type matches if symbol already defined
1419
1873
  if name in adj.symbols:
1420
- if not types_equal(rhs.type, adj.symbols[name].type):
1421
- raise TypeError(
1422
- "Error, assigning to existing symbol {} ({}) with different type ({})".format(
1423
- name, adj.symbols[name].type, rhs.type
1424
- )
1874
+ if not types_equal(strip_reference(rhs.type), adj.symbols[name].type):
1875
+ raise WarpCodegenTypeError(
1876
+ f"Error, assigning to existing symbol {name} ({adj.symbols[name].type}) with different type ({rhs.type})"
1425
1877
  )
1426
1878
 
1427
1879
  # handle simple assignment case (a = b), where we generate a value copy rather than reference
1428
- if isinstance(node.value, ast.Name):
1429
- out = adj.add_var(rhs.type)
1430
- adj.add_call(warp.context.builtin_functions["copy"], [out, rhs])
1880
+ if isinstance(node.value, ast.Name) or is_reference(rhs.type):
1881
+ out = adj.add_builtin_call("copy", [rhs])
1431
1882
  else:
1432
1883
  out = rhs
1433
1884
 
1434
1885
  # update symbol map (assumes lhs is a Name node)
1435
1886
  adj.symbols[name] = out
1436
- return out
1437
1887
 
1438
- elif isinstance(node.targets[0], ast.Attribute):
1888
+ elif isinstance(lhs, ast.Attribute):
1439
1889
  rhs = adj.eval(node.value)
1440
- attr = adj.emit_Attribute(node.targets[0])
1441
- adj.add_call(warp.context.builtin_functions["copy"], [attr, rhs])
1890
+ aggregate = adj.eval(lhs.value)
1891
+ aggregate_type = strip_reference(aggregate.type)
1442
1892
 
1443
- if warp.config.verbose:
1444
- lineno = adj.lineno + adj.fun_lineno
1445
- line = adj.source.splitlines()[adj.lineno]
1446
- msg = f'Warning: detected mutated struct {attr.label} during function "{adj.fun_name}" at {adj.filename}:{lineno}: this is a non-differentiable operation.\n{line}\n'
1447
- print(msg)
1893
+ # assigning to a vector component
1894
+ if type_is_vector(aggregate_type):
1895
+ index = adj.vector_component_index(lhs.attr, aggregate_type)
1896
+
1897
+ if is_reference(aggregate.type):
1898
+ attr = adj.add_builtin_call("indexref", [aggregate, index])
1899
+ else:
1900
+ attr = adj.add_builtin_call("index", [aggregate, index])
1901
+
1902
+ adj.add_builtin_call("store", [attr, rhs])
1903
+
1904
+ else:
1905
+ attr = adj.emit_Attribute(lhs)
1906
+ if is_reference(attr.type):
1907
+ adj.add_builtin_call("store", [attr, rhs])
1908
+ else:
1909
+ adj.add_builtin_call("assign", [attr, rhs])
1910
+
1911
+ if warp.config.verbose and not adj.custom_reverse_mode:
1912
+ lineno = adj.lineno + adj.fun_lineno
1913
+ line = adj.source_lines[adj.lineno]
1914
+ msg = f'Warning: detected mutated struct {attr.label} during function "{adj.fun_name}" at {adj.filename}:{lineno}: this is a non-differentiable operation.\n{line}\n'
1915
+ print(msg)
1448
1916
 
1449
1917
  else:
1450
- raise RuntimeError("Error, unsupported assignment statement.")
1918
+ raise WarpCodegenError("Error, unsupported assignment statement.")
1451
1919
 
1452
1920
  def emit_Return(adj, node):
1453
1921
  if node.value is None:
@@ -1458,30 +1926,26 @@ class Adjoint:
1458
1926
  var = (adj.eval(node.value),)
1459
1927
 
1460
1928
  if adj.return_var is not None:
1461
- old_ctypes = tuple(v.ctype() for v in adj.return_var)
1462
- new_ctypes = tuple(v.ctype() for v in var)
1929
+ old_ctypes = tuple(v.ctype(value_type=True) for v in adj.return_var)
1930
+ new_ctypes = tuple(v.ctype(value_type=True) for v in var)
1463
1931
  if old_ctypes != new_ctypes:
1464
- raise TypeError(
1932
+ raise WarpCodegenTypeError(
1465
1933
  f"Error, function returned different types, previous: [{', '.join(old_ctypes)}], new [{', '.join(new_ctypes)}]"
1466
1934
  )
1467
- else:
1468
- adj.return_var = var
1469
-
1470
- adj.add_return(var)
1471
-
1472
- def emit_AugAssign(adj, node):
1473
- # convert inplace operations (+=, -=, etc) to ssa form, e.g.: c = a + b
1474
- left = adj.eval(node.target)
1475
- right = adj.eval(node.value)
1476
1935
 
1477
- # lookup
1478
- name = builtin_operators[type(node.op)]
1479
- func = warp.context.builtin_functions[name]
1936
+ if var is not None:
1937
+ adj.return_var = tuple()
1938
+ for ret in var:
1939
+ if is_reference(ret.type):
1940
+ ret = adj.add_builtin_call("copy", [ret])
1941
+ adj.return_var += (ret,)
1480
1942
 
1481
- out = adj.add_call(func, [left, right])
1943
+ adj.add_return(adj.return_var)
1482
1944
 
1483
- # update symbol map
1484
- adj.symbols[node.target.id] = out
1945
+ def emit_AugAssign(adj, node):
1946
+ # replace augmented assignment with assignment statement + binary op
1947
+ new_node = ast.Assign(targets=[node.target], value=ast.BinOp(node.target, node.op, node.value))
1948
+ adj.eval(new_node)
1485
1949
 
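The rewrite above desugars x += y into x = x + y before evaluation; a standalone sketch of the transform on a real AST node:

    import ast

    node = ast.parse("x += y").body[0]              # ast.AugAssign
    assert isinstance(node, ast.AugAssign)
    new_node = ast.Assign(
        targets=[node.target],
        value=ast.BinOp(node.target, node.op, node.value),
    )                                               # evaluates as "x = x + y"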
1486
1950
  def emit_Tuple(adj, node):
1487
1951
  # LHS for expressions, such as i, j, k = 1, 2, 3
@@ -1491,122 +1955,167 @@ class Adjoint:
1491
1955
  def emit_Pass(adj, node):
1492
1956
  pass
1493
1957
 
1958
+ node_visitors = {
1959
+ ast.FunctionDef: emit_FunctionDef,
1960
+ ast.If: emit_If,
1961
+ ast.Compare: emit_Compare,
1962
+ ast.BoolOp: emit_BoolOp,
1963
+ ast.Name: emit_Name,
1964
+ ast.Attribute: emit_Attribute,
1965
+ ast.Str: emit_String, # Deprecated in 3.8; use Constant
1966
+ ast.Num: emit_Num, # Deprecated in 3.8; use Constant
1967
+ ast.NameConstant: emit_NameConstant, # Deprecated in 3.8; use Constant
1968
+ ast.Constant: emit_Constant,
1969
+ ast.BinOp: emit_BinOp,
1970
+ ast.UnaryOp: emit_UnaryOp,
1971
+ ast.While: emit_While,
1972
+ ast.For: emit_For,
1973
+ ast.Break: emit_Break,
1974
+ ast.Continue: emit_Continue,
1975
+ ast.Expr: emit_Expr,
1976
+ ast.Call: emit_Call,
1977
+ ast.Index: emit_Index, # Deprecated in 3.8; use the index value directly instead.
1978
+ ast.Subscript: emit_Subscript,
1979
+ ast.Assign: emit_Assign,
1980
+ ast.Return: emit_Return,
1981
+ ast.AugAssign: emit_AugAssign,
1982
+ ast.Tuple: emit_Tuple,
1983
+ ast.Pass: emit_Pass,
1984
+ ast.Ellipsis: emit_Ellipsis,
1985
+ }
1986
+
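Hoisting the dispatch table to a class attribute builds it once at class-creation time instead of on every eval() call; the pattern in miniature:

    import ast

    class MiniVisitor:
        def emit_Constant(self, node):
            return node.value

        # plain functions stored in the class body; bound at the call site
        node_visitors = {ast.Constant: emit_Constant}

        def eval(self, node):
            return self.node_visitors[type(node)](self, node)

    expr = ast.parse("42", mode="eval").body        # ast.Constant
    assert MiniVisitor().eval(expr) == 42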
1494
1987
  def eval(adj, node):
1495
1988
  if hasattr(node, "lineno"):
1496
1989
  adj.set_lineno(node.lineno - 1)
1497
1990
 
1498
- node_visitors = {
1499
- ast.FunctionDef: Adjoint.emit_FunctionDef,
1500
- ast.If: Adjoint.emit_If,
1501
- ast.Compare: Adjoint.emit_Compare,
1502
- ast.BoolOp: Adjoint.emit_BoolOp,
1503
- ast.Name: Adjoint.emit_Name,
1504
- ast.Attribute: Adjoint.emit_Attribute,
1505
- ast.Str: Adjoint.emit_String, # Deprecated in 3.8; use Constant
1506
- ast.Num: Adjoint.emit_Num, # Deprecated in 3.8; use Constant
1507
- ast.NameConstant: Adjoint.emit_NameConstant, # Deprecated in 3.8; use Constant
1508
- ast.Constant: Adjoint.emit_Constant,
1509
- ast.BinOp: Adjoint.emit_BinOp,
1510
- ast.UnaryOp: Adjoint.emit_UnaryOp,
1511
- ast.While: Adjoint.emit_While,
1512
- ast.For: Adjoint.emit_For,
1513
- ast.Break: Adjoint.emit_Break,
1514
- ast.Continue: Adjoint.emit_Continue,
1515
- ast.Expr: Adjoint.emit_Expr,
1516
- ast.Call: Adjoint.emit_Call,
1517
- ast.Index: Adjoint.emit_Index, # Deprecated in 3.8; Use the index value directly instead.
1518
- ast.Subscript: Adjoint.emit_Subscript,
1519
- ast.Assign: Adjoint.emit_Assign,
1520
- ast.Return: Adjoint.emit_Return,
1521
- ast.AugAssign: Adjoint.emit_AugAssign,
1522
- ast.Tuple: Adjoint.emit_Tuple,
1523
- ast.Pass: Adjoint.emit_Pass,
1524
- }
1525
-
1526
- emit_node = node_visitors.get(type(node))
1527
-
1528
- if emit_node is not None:
1529
- return emit_node(adj, node)
1530
- else:
1531
- raise Exception("Error, ast node of type {} not supported".format(type(node)))
1991
+ emit_node = adj.node_visitors[type(node)]
1992
+
1993
+ return emit_node(adj, node)
1532
1994
 
1533
1995
  # helper to evaluate expressions of the form
1534
1996
  # obj1.obj2.obj3.attr in the function's global scope
1535
- def resolve_path(adj, node):
1536
- modules = []
1997
+ def resolve_path(adj, path):
1998
+ if len(path) == 0:
1999
+ return None
1537
2000
 
1538
- while isinstance(node, ast.Attribute):
1539
- modules.append(node.attr)
1540
- node = node.value
2001
+ # if root is overshadowed by local symbols, bail out
2002
+ if path[0] in adj.symbols:
2003
+ return None
1541
2004
 
1542
- if isinstance(node, ast.Name):
1543
- modules.append(node.id)
2005
+ if path[0] in __builtins__:
2006
+ return __builtins__[path[0]]
1544
2007
 
1545
- # reverse list since ast presents it backward order
1546
- path = [*reversed(modules)]
2008
+ # Look up the closure info and append it to adj.func.__globals__
2009
+ # in case you want to define a kernel inside a function and refer
2010
+ # to variables you've declared inside that function:
2011
+ extract_contents = (
2012
+ lambda contents: contents
2013
+ if isinstance(contents, warp.context.Function) or not callable(contents)
2014
+ else contents
2015
+ )
2016
+ capturedvars = dict(
2017
+ zip(
2018
+ adj.func.__code__.co_freevars,
2019
+ [extract_contents(c.cell_contents) for c in (adj.func.__closure__ or [])],
2020
+ )
2021
+ )
2022
+ vars_dict = {**adj.func.__globals__, **capturedvars}
1547
2023
 
1548
- if len(path) == 0:
1549
- return None, path
2024
+ if path[0] in vars_dict:
2025
+ func = vars_dict[path[0]]
1550
2026
 
1551
- # try and evaluate object path
1552
- try:
1553
- # Look up the closure info and append it to adj.func.__globals__
1554
- # in case you want to define a kernel inside a function and refer
1555
- # to variables you've declared inside that function:
1556
- extract_contents = (
1557
- lambda contents: contents
1558
- if isinstance(contents, warp.context.Function) or not callable(contents)
1559
- else contents
1560
- )
1561
- capturedvars = dict(
1562
- zip(
1563
- adj.func.__code__.co_freevars,
1564
- [extract_contents(c.cell_contents) for c in (adj.func.__closure__ or [])],
1565
- )
1566
- )
2027
+ # Support Warp types in kernels without the module suffix (e.g. v = vec3(0.0,0.2,0.4)):
2028
+ else:
2029
+ func = getattr(warp, path[0], None)
1567
2030
 
1568
- vars_dict = {**adj.func.__globals__, **capturedvars}
1569
- func = eval(".".join(path), vars_dict)
1570
- return func, path
1571
- except:
1572
- pass
2031
+ if func:
2032
+ for i in range(1, len(path)):
2033
+ if hasattr(func, path[i]):
2034
+ func = getattr(func, path[i])
1573
2035
 
1574
- # I added this so people can eg do this kind of thing
1575
- # in a kernel:
2036
+ return func
1576
2037
 
1577
- # v = vec3(0.0,0.2,0.4)
2038
+ # Evaluates a static expression that does not depend on runtime values
2039
+ # if eval_types is True, try resolving the path using evaluated type information as well
2040
+ def resolve_static_expression(adj, root_node, eval_types=True):
2041
+ attributes = []
1578
2042
 
1579
- # vec3 is now an alias and is not in warp.context.builtin_functions.
1580
- # This means it can't be directly looked up in Adjoint.add_call, and
1581
- # needs to be looked up by digging some information out of the
1582
- # python object it actually came from.
2043
+ node = root_node
2044
+ while isinstance(node, ast.Attribute):
2045
+ attributes.append(node.attr)
2046
+ node = node.value
1583
2047
 
1584
- # Before this fix, resolve_path was returning None, as the
1585
- # "vec3" symbol is not available. In this situation I'm assuming
1586
- # it's a member of the warp module and trying to look it up:
1587
- try:
1588
- evalstr = ".".join(["warp"] + path)
1589
- func = eval(evalstr, {"warp": warp})
1590
- return func, path
1591
- except:
1592
- return None, path
2048
+ if eval_types and isinstance(node, ast.Call) and isinstance(node.func, ast.Name):
2049
+ # support for operators returning modules
2050
+ # i.e. operator_name(*operator_args).x.y.z
2051
+ operator_args = node.args
2052
+ operator_name = node.func.id
2053
+
2054
+ if operator_name == "type":
2055
+ if len(operator_args) != 1:
2056
+ raise WarpCodegenError(f"type() operator expects exactly one argument, got {len(operator_args)}")
2057
+
2058
+ # type() operator
2059
+ var = adj.eval(operator_args[0])
2060
+
2061
+ if isinstance(var, Var):
2062
+ var_type = strip_reference(var.type)
2063
+ # Allow accessing type attributes, for instance array.dtype
2064
+ while attributes:
2065
+ attr_name = attributes.pop()
2066
+ var_type, prev_type = adj.resolve_type_attribute(var_type, attr_name), var_type
2067
+
2068
+ if var_type is None:
2069
+ raise WarpCodegenAttributeError(
2070
+ f"{attr_name} is not an attribute of {type_repr(prev_type)}"
2071
+ )
2072
+
2073
+ return var_type, [type_repr(var_type)]
2074
+ else:
2075
+ raise WarpCodegenError(f"Cannot deduce the type of {var}")
2076
+
2077
+ # reverse list since ast presents it backward order
2078
+ path = [*reversed(attributes)]
2079
+ if isinstance(node, ast.Name):
2080
+ path.insert(0, node.id)
2081
+
2082
+ # Try resolving path from captured context
2083
+ captured_obj = adj.resolve_path(path)
2084
+ if captured_obj is not None:
2085
+ return captured_obj, path
2086
+
2087
+ # Still nothing found, maybe this is a predefined type attribute like `dtype`
2088
+ if eval_types:
2089
+ try:
2090
+ val = adj.eval(root_node)
2091
+ if val:
2092
+ return val, [type_repr(val)]
2093
+
2094
+ except Exception:
2095
+ pass
2096
+
2097
+ return None, path
1593
2098
 
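The type() operator handled above lets generic code resolve types at codegen time; a sketch of the kind of expression it enables, assuming warp's generic-function support via typing.Any:

    import warp as wp
    from typing import Any

    @wp.func
    def halve(x: Any):
        # type(x) is resolved statically by the branch above, so the
        # constructed constant matches whatever scalar type x has
        return x / type(x)(2)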
1594
2099
  # annotate generated code with the original source code line
1595
2100
  def set_lineno(adj, lineno):
1596
2101
  if adj.lineno is None or adj.lineno != lineno:
1597
2102
  line = lineno + adj.fun_lineno
1598
- source = adj.raw_source[lineno].strip().ljust(70)
2103
+ source = adj.source_lines[lineno].strip().ljust(80 - len(adj.indentation), " ")
1599
2104
  adj.add_forward(f"// {source} <L {line}>")
1600
2105
  adj.add_reverse(f"// adj: {source} <L {line}>")
1601
2106
  adj.lineno = lineno
1602
2107
 
2108
+ def get_node_source(adj, node):
2109
+ # return the Python code corresponding to the given AST node
2110
+ return ast.get_source_segment(adj.source, node)
2111
+
1603
2112
 
1604
2113
  # ----------------
1605
2114
  # code generation
1606
2115
 
1607
2116
  cpu_module_header = """
1608
2117
  #define WP_NO_CRT
1609
- #include "../native/builtin.h"
2118
+ #include "builtin.h"
1610
2119
 
1611
2120
  // avoid namespacing of float type for casting to float type, this is to avoid wp::float(x), which is not valid in C++
1612
2121
  #define float(x) cast_float(x)
@@ -1615,13 +2124,16 @@ cpu_module_header = """
1615
2124
  #define int(x) cast_int(x)
1616
2125
  #define adj_int(x, adj_x, adj_ret) adj_cast_int(x, adj_x, adj_ret)
1617
2126
 
1618
- using namespace wp;
2127
+ #define builtin_tid1d() wp::tid(wp::s_threadIdx)
2128
+ #define builtin_tid2d(x, y) wp::tid(x, y, wp::s_threadIdx, dim)
2129
+ #define builtin_tid3d(x, y, z) wp::tid(x, y, z, wp::s_threadIdx, dim)
2130
+ #define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, wp::s_threadIdx, dim)
1619
2131
 
1620
2132
  """
1621
2133
 
1622
2134
  cuda_module_header = """
1623
2135
  #define WP_NO_CRT
1624
- #include "../native/builtin.h"
2136
+ #include "builtin.h"
1625
2137
 
1626
2138
  // avoid namespacing of float type for casting to float type, this is to avoid wp::float(x), which is not valid in C++
1627
2139
  #define float(x) cast_float(x)
@@ -1630,8 +2142,10 @@ cuda_module_header = """
1630
2142
  #define int(x) cast_int(x)
1631
2143
  #define adj_int(x, adj_x, adj_ret) adj_cast_int(x, adj_x, adj_ret)
1632
2144
 
1633
-
1634
- using namespace wp;
2145
+ #define builtin_tid1d() wp::tid(_idx)
2146
+ #define builtin_tid2d(x, y) wp::tid(x, y, _idx, dim)
2147
+ #define builtin_tid3d(x, y, z) wp::tid(x, y, z, _idx, dim)
2148
+ #define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, _idx, dim)
1635
2149
 
1636
2150
  """
1637
2151
 
@@ -1645,54 +2159,56 @@ struct {name}
1645
2159
  {{
1646
2160
  }}
1647
2161
 
1648
- {name}& operator += (const {name}&) {{ return *this; }}
2162
+ CUDA_CALLABLE {name}& operator += (const {name}& rhs)
2163
+ {{{prefix_add_body}
2164
+ return *this;}}
1649
2165
 
1650
2166
  }};
1651
2167
 
1652
2168
  static CUDA_CALLABLE void adj_{name}({reverse_args})
1653
2169
  {{
1654
- {reverse_body}
1655
- }}
2170
+ {reverse_body}}}
1656
2171
 
1657
- CUDA_CALLABLE void atomic_add({name}* p, {name} t)
2172
+ CUDA_CALLABLE void adj_atomic_add({name}* p, {name} t)
1658
2173
  {{
1659
- {atomic_add_body}
1660
- }}
2174
+ {atomic_add_body}}}
1661
2175
 
1662
2176
 
1663
2177
  """
1664
2178
 
1665
- cpu_function_template = """
2179
+ cpu_forward_function_template = """
1666
2180
  // {filename}:{lineno}
1667
2181
  static {return_type} {name}(
1668
2182
  {forward_args})
1669
2183
  {{
1670
- {forward_body}
1671
- }}
2184
+ {forward_body}}}
1672
2185
 
2186
+ """
2187
+
2188
+ cpu_reverse_function_template = """
1673
2189
  // {filename}:{lineno}
1674
2190
  static void adj_{name}(
1675
2191
  {reverse_args})
1676
2192
  {{
1677
- {reverse_body}
1678
- }}
2193
+ {reverse_body}}}
1679
2194
 
1680
2195
  """
1681
2196
 
1682
- cuda_function_template = """
2197
+ cuda_forward_function_template = """
1683
2198
  // {filename}:{lineno}
1684
2199
  static CUDA_CALLABLE {return_type} {name}(
1685
2200
  {forward_args})
1686
2201
  {{
1687
- {forward_body}
1688
- }}
2202
+ {forward_body}}}
1689
2203
 
2204
+ """
2205
+
2206
+ cuda_reverse_function_template = """
1690
2207
  // {filename}:{lineno}
1691
2208
  static CUDA_CALLABLE void adj_{name}(
1692
2209
  {reverse_args})
1693
2210
  {{
1694
- {reverse_body}
1695
- }}
2211
+ {reverse_body}}}
1696
2212
 
1697
2213
  """
1698
2214
 
@@ -1701,25 +2217,21 @@ cuda_kernel_template = """
1701
2217
  extern "C" __global__ void {name}_cuda_kernel_forward(
1702
2218
  {forward_args})
1703
2219
  {{
1704
- size_t _idx = grid_index();
1705
- if (_idx >= dim.size)
1706
- return;
1707
-
1708
- set_launch_bounds(dim);
1709
-
1710
- {forward_body}
2220
+ for (size_t _idx = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
2221
+ _idx < dim.size;
2222
+ _idx += static_cast<size_t>(blockDim.x) * static_cast<size_t>(gridDim.x))
2223
+ {{
2224
+ {forward_body} }}
1711
2225
  }}
1712
2226
 
1713
2227
  extern "C" __global__ void {name}_cuda_kernel_backward(
1714
2228
  {reverse_args})
1715
2229
  {{
1716
- size_t _idx = grid_index();
1717
- if (_idx >= dim.size)
1718
- return;
1719
-
1720
- set_launch_bounds(dim);
1721
-
1722
- {reverse_body}
2230
+ for (size_t _idx = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
2231
+ _idx < dim.size;
2232
+ _idx += static_cast<size_t>(blockDim.x) * static_cast<size_t>(gridDim.x))
2233
+ {{
2234
+ {reverse_body} }}
1723
2235
  }}
1724
2236
 
1725
2237
  """
@@ -1729,39 +2241,12 @@ cpu_kernel_template = """
1729
2241
  void {name}_cpu_kernel_forward(
1730
2242
  {forward_args})
1731
2243
  {{
1732
- {forward_body}
1733
- }}
2244
+ {forward_body}}}
1734
2245
 
1735
2246
  void {name}_cpu_kernel_backward(
1736
2247
  {reverse_args})
1737
2248
  {{
1738
- {reverse_body}
1739
- }}
1740
-
1741
- """
1742
-
1743
- cuda_module_template = """
1744
-
1745
- extern "C" {{
1746
-
1747
- // Python entry points
1748
- WP_API void {name}_cuda_forward(
1749
- void* stream,
1750
- {forward_args})
1751
- {{
1752
- {name}_cuda_kernel_forward<<<(dim.size + 256 - 1) / 256, 256, 0, (cudaStream_t)stream>>>(
1753
- {forward_params});
1754
- }}
1755
-
1756
- WP_API void {name}_cuda_backward(
1757
- void* stream,
1758
- {reverse_args})
1759
- {{
1760
- {name}_cuda_kernel_backward<<<(dim.size + 256 - 1) / 256, 256, 0, (cudaStream_t)stream>>>(
1761
- {reverse_params});
1762
- }}
1763
-
1764
- }} // extern C
2249
+ {reverse_body}}}
1765
2250
 
1766
2251
  """
1767
2252
 
@@ -1773,11 +2258,9 @@ extern "C" {{
1773
2258
  WP_API void {name}_cpu_forward(
1774
2259
  {forward_args})
1775
2260
  {{
1776
- set_launch_bounds(dim);
1777
-
1778
2261
  for (size_t i=0; i < dim.size; ++i)
1779
2262
  {{
1780
- s_threadIdx = i;
2263
+ wp::s_threadIdx = i;
1781
2264
 
1782
2265
  {name}_cpu_kernel_forward(
1783
2266
  {forward_params});
@@ -1787,11 +2270,9 @@ WP_API void {name}_cpu_forward(
1787
2270
  WP_API void {name}_cpu_backward(
1788
2271
  {reverse_args})
1789
2272
  {{
1790
- set_launch_bounds(dim);
1791
-
1792
2273
  for (size_t i=0; i < dim.size; ++i)
1793
2274
  {{
1794
- s_threadIdx = i;
2275
+ wp::s_threadIdx = i;
1795
2276
 
1796
2277
  {name}_cpu_kernel_backward(
1797
2278
  {reverse_params});
@@ -1837,7 +2318,7 @@ WP_API void {name}_cpu_backward(
1837
2318
  def constant_str(value):
1838
2319
  value_type = type(value)
1839
2320
 
1840
- if value_type == bool:
2321
+ if value_type == bool or value_type == builtins.bool:
1841
2322
  if value:
1842
2323
  return "true"
1843
2324
  else:
@@ -1854,7 +2335,9 @@ def constant_str(value):
1854
2335
 
1855
2336
  scalar_value = runtime.core.half_bits_to_float
1856
2337
  else:
1857
- scalar_value = lambda x: x
2338
+
2339
+ def scalar_value(x):
2340
+ return x
1858
2341
 
1859
2342
  # list of scalar initializer values
1860
2343
  initlist = []
@@ -1871,6 +2354,9 @@ def constant_str(value):
1871
2354
  # make sure we emit the value of objects, e.g. uint32
1872
2355
  return str(value.value)
1873
2356
 
2357
+ elif value == math.inf:
2358
+ return "INFINITY"
2359
+
1874
2360
  else:
1875
2361
  # otherwise just convert constant to string
1876
2362
  return str(value)
@@ -1879,7 +2365,7 @@ def constant_str(value):
1879
2365
  def indent(args, stops=1):
1880
2366
  sep = ",\n"
1881
2367
  for i in range(stops):
1882
- sep += "\t"
2368
+ sep += "    "
1883
2369
 
1884
2370
  # return sep + args.replace(", ", "," + sep)
1885
2371
  return sep.join(args)
@@ -1887,7 +2373,9 @@ def indent(args, stops=1):
1887
2373
 
1888
2374
  # generates a C function name based on the python function name
1889
2375
  def make_full_qualified_name(func):
1890
- return re.sub("[^0-9a-zA-Z_]+", "", func.__qualname__.replace(".", "__"))
2376
+ if not isinstance(func, str):
2377
+ func = func.__qualname__
2378
+ return re.sub("[^0-9a-zA-Z_]+", "", func.replace(".", "__"))
1891
2379
 
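A quick check of the name mangling above, now that plain string inputs are accepted as well:

    import re

    def make_full_qualified_name(func):
        if not isinstance(func, str):
            func = func.__qualname__
        return re.sub("[^0-9a-zA-Z_]+", "", func.replace(".", "__"))

    # dots become double underscores, other non-identifier characters are dropped
    assert make_full_qualified_name("Cloth.step") == "Cloth__step"
    assert make_full_qualified_name("<locals>.k") == "locals__k"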
1892
2380
 
1893
2381
  def codegen_struct(struct, device="cpu", indent_size=4):
@@ -1895,8 +2383,13 @@ def codegen_struct(struct, device="cpu", indent_size=4):
 
     body = []
     indent_block = " " * indent_size
-    for label, var in struct.vars.items():
-        body.append(var.ctype() + " " + label + ";\n")
+
+    if len(struct.vars) > 0:
+        for label, var in struct.vars.items():
+            body.append(var.ctype() + " " + label + ";\n")
+    else:
+        # for empty structs, emit the dummy attribute to avoid any compiler-specific alignment issues
+        body.append("char _dummy_;\n")
 
     forward_args = []
     reverse_args = []
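The dummy member exists because an empty struct's size and alignment are compiler-specific corner cases; emitting a single char pins the layout everywhere. A small sketch of the branch above (helper name is hypothetical):

    def struct_body(member_ctypes):
        # member_ctypes: dict mapping field name -> C type string
        if len(member_ctypes) > 0:
            return "".join(f"{ctype} {name};\n" for name, ctype in member_ctypes.items())
        # empty struct: pin size/alignment with a dummy member
        return "char _dummy_;\n"

    assert struct_body({"pos": "wp::vec3", "mass": "float"}) == "wp::vec3 pos;\nfloat mass;\n"
    assert struct_body({}) == "char _dummy_;\n"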
@@ -1904,21 +2397,32 @@ def codegen_struct(struct, device="cpu", indent_size=4):
     forward_initializers = []
     reverse_body = []
     atomic_add_body = []
+    prefix_add_body = []
 
     # forward args
     for label, var in struct.vars.items():
-        forward_args.append(f"{var.ctype()} const& {label} = {{}}")
-        reverse_args.append(f"{var.ctype()} const&")
+        var_ctype = var.ctype()
+        forward_args.append(f"{var_ctype} const& {label} = {{}}")
+        reverse_args.append(f"{var_ctype} const&")
 
-        atomic_add_body.append(f"{indent_block}atomic_add(&p->{label}, t.{label});\n")
+        namespace = "wp::" if var_ctype.startswith("wp::") or var_ctype == "bool" else ""
+        atomic_add_body.append(f"{indent_block}{namespace}adj_atomic_add(&p->{label}, t.{label});\n")
 
         prefix = f"{indent_block}," if forward_initializers else ":"
         forward_initializers.append(f"{indent_block}{prefix} {label}{{{label}}}\n")
 
+    # prefix-add operator
+    for label, var in struct.vars.items():
+        if not is_array(var.type):
+            prefix_add_body.append(f"{indent_block}{label} += rhs.{label};\n")
+
     # reverse args
     for label, var in struct.vars.items():
-        reverse_args.append(var.ctype() + " const& adj_" + label)
-        reverse_body.append(f"{indent_block}adj_ret.{label} = adj_{label};\n")
+        reverse_args.append(var.ctype() + " & adj_" + label)
+        if is_array(var.type):
+            reverse_body.append(f"{indent_block}adj_{label} = adj_ret.{label};\n")
+        else:
+            reverse_body.append(f"{indent_block}adj_{label} += adj_ret.{label};\n")
 
     reverse_args.append(name + " & adj_ret")
 
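The reverse arguments are now mutable references and gradients accumulate rather than overwrite: array members alias their own gradient storage and are assigned, while value members use +=. A Python rendering of that rule (hypothetical names, illustration only):

    def struct_adjoint(adj_ret, adj_vars, array_fields):
        for label, grad in adj_ret.items():
            if label in array_fields:
                adj_vars[label] = grad  # adj_x = adj_ret.x;
            else:
                # adj_x += adj_ret.x;
                adj_vars[label] = adj_vars.get(label, 0.0) + grad
        return adj_vars

    out = struct_adjoint({"mass": 1.5, "pos": "grad_buf"}, {"mass": 0.5}, {"pos"})
    assert out == {"mass": 2.0, "pos": "grad_buf"}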
@@ -1929,109 +2433,101 @@ def codegen_struct(struct, device="cpu", indent_size=4):
         forward_initializers="".join(forward_initializers),
         reverse_args=indent(reverse_args),
         reverse_body="".join(reverse_body),
+        prefix_add_body="".join(prefix_add_body),
         atomic_add_body="".join(atomic_add_body),
     )
 
 
-def codegen_func_forward_body(adj, device="cpu", indent=4):
-    body = []
-    indent_block = " " * indent
-
-    for f in adj.blocks[0].body_forward:
-        body += [f + "\n"]
-
-    return "".join([indent_block + l for l in body])
-
-
 def codegen_func_forward(adj, func_type="kernel", device="cpu"):
-    s = ""
+    if device == "cpu":
+        indent = 4
+    elif device == "cuda":
+        if func_type == "kernel":
+            indent = 8
+        else:
+            indent = 4
+    else:
+        raise ValueError(f"Device {device} not supported for codegen")
+
+    indent_block = " " * indent
 
     # primal vars
-    s += "    //---------\n"
-    s += "    // primal vars\n"
+    lines = []
+    lines += ["//---------\n"]
+    lines += ["// primal vars\n"]
 
     for var in adj.variables:
         if var.constant is None:
-            s += "    " + var.ctype() + " var_" + str(var.label) + ";\n"
+            lines += [f"{var.ctype()} {var.emit()};\n"]
         else:
-            s += "    const " + var.ctype() + " var_" + str(var.label) + " = " + constant_str(var.constant) + ";\n"
+            lines += [f"const {var.ctype()} {var.emit()} = {constant_str(var.constant)};\n"]
 
     # forward pass
-    s += "    //---------\n"
-    s += "    // forward\n"
+    lines += ["//---------\n"]
+    lines += ["// forward\n"]
 
-    if device == "cpu":
-        s += codegen_func_forward_body(adj, device=device, indent=4)
+    for f in adj.blocks[0].body_forward:
+        lines += [f + "\n"]
+
+    return "".join([indent_block + l for l in lines])
 
+
+def codegen_func_reverse(adj, func_type="kernel", device="cpu"):
+    if device == "cpu":
+        indent = 4
     elif device == "cuda":
         if func_type == "kernel":
-            s += codegen_func_forward_body(adj, device=device, indent=8)
+            indent = 8
         else:
-            s += codegen_func_forward_body(adj, device=device, indent=4)
-
-    return s
-
+            indent = 4
+    else:
+        raise ValueError(f"Device {device} not supported for codegen")
 
-def codegen_func_reverse_body(adj, device="cpu", indent=4):
-    body = []
     indent_block = " " * indent
 
-    # forward pass
-    body += ["//---------\n"]
-    body += ["// forward\n"]
-
-    for f in adj.blocks[0].body_replay:
-        body += [f + "\n"]
-
-    # reverse pass
-    body += ["//---------\n"]
-    body += ["// reverse\n"]
-
-    for l in reversed(adj.blocks[0].body_reverse):
-        body += [l + "\n"]
-
-    body += ["return;\n"]
-
-    return "".join([indent_block + l for l in body])
-
-
-def codegen_func_reverse(adj, func_type="kernel", device="cpu"):
-    s = ""
+    lines = []
 
     # primal vars
-    s += "    //---------\n"
-    s += "    // primal vars\n"
+    lines += ["//---------\n"]
+    lines += ["// primal vars\n"]
 
     for var in adj.variables:
         if var.constant is None:
-            s += "    " + var.ctype() + " var_" + str(var.label) + ";\n"
+            lines += [f"{var.ctype()} {var.emit()};\n"]
         else:
-            s += "    const " + var.ctype() + " var_" + str(var.label) + " = " + constant_str(var.constant) + ";\n"
+            lines += [f"const {var.ctype()} {var.emit()} = {constant_str(var.constant)};\n"]
 
     # dual vars
-    s += "    //---------\n"
-    s += "    // dual vars\n"
+    lines += ["//---------\n"]
+    lines += ["// dual vars\n"]
 
     for var in adj.variables:
-        if isinstance(var.type, Struct):
-            s += "    " + var.ctype() + " adj_" + str(var.label) + ";\n"
-        else:
-            s += "    " + var.ctype() + " adj_" + str(var.label) + "(0);\n"
+        lines += [f"{var.ctype(value_type=True)} {var.emit_adj()} = {{}};\n"]
 
-    if device == "cpu":
-        s += codegen_func_reverse_body(adj, device=device, indent=4)
-    elif device == "cuda":
-        if func_type == "kernel":
-            s += codegen_func_reverse_body(adj, device=device, indent=8)
-        else:
-            s += codegen_func_reverse_body(adj, device=device, indent=4)
+    # forward pass
+    lines += ["//---------\n"]
+    lines += ["// forward\n"]
+
+    for f in adj.blocks[0].body_replay:
+        lines += [f + "\n"]
+
+    # reverse pass
+    lines += ["//---------\n"]
+    lines += ["// reverse\n"]
+
+    for l in reversed(adj.blocks[0].body_reverse):
+        lines += [l + "\n"]
+
+    # In grid-stride kernels the reverse body is in a for loop
+    if device == "cuda" and func_type == "kernel":
+        lines += ["continue;\n"]
     else:
-        raise ValueError("Device {} not supported for codegen".format(device))
+        lines += ["return;\n"]
 
-    return s
+    return "".join([indent_block + l for l in lines])
 
 
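The continue/return split at the end of the reverse body follows from the comment added above: CUDA kernel bodies are now emitted inside a grid-stride loop, where each thread handles several indices, so finishing one element must not end the thread. A Python stand-in for the loop shape (illustrative only):

    def grid_stride_kernel(tid, grid_size, dim_size, body):
        i = tid
        while i < dim_size:
            body(i)  # generated code ends each iteration with continue, not return
            i += grid_size

    visited = []
    for tid in range(4):  # pretend four threads cover ten elements
        grid_stride_kernel(tid, 4, 10, visited.append)
    assert sorted(visited) == list(range(10))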
-def codegen_func(adj, device="cpu"):
+def codegen_func(adj, c_func_name: str, device="cpu", options={}):
     # forward header
     if adj.return_var is not None and len(adj.return_var) == 1:
         return_type = adj.return_var[0].ctype()
@@ -2044,16 +2540,20 @@ def codegen_func(adj, device="cpu"):
     reverse_args = []
 
     # forward args
-    for arg in adj.args:
-        forward_args.append(arg.ctype() + " var_" + arg.label)
-        reverse_args.append(arg.ctype() + " var_" + arg.label)
+    for i, arg in enumerate(adj.args):
+        s = f"{arg.ctype()} {arg.emit()}"
+        forward_args.append(s)
+        if not adj.custom_reverse_mode or i < adj.custom_reverse_num_input_args:
+            reverse_args.append(s)
     if has_multiple_outputs:
         for i, arg in enumerate(adj.return_var):
             forward_args.append(arg.ctype() + " & ret_" + str(i))
             reverse_args.append(arg.ctype() + " & ret_" + str(i))
 
     # reverse args
-    for arg in adj.args:
+    for i, arg in enumerate(adj.args):
+        if adj.custom_reverse_mode and i >= adj.custom_reverse_num_input_args:
+            break
         # indexed array gradients are regular arrays
         if isinstance(arg.type, indexedarray):
             _arg = Var(arg.label, array(dtype=arg.type.dtype, ndim=arg.type.ndim))
@@ -2065,24 +2565,96 @@ def codegen_func(adj, device="cpu"):
             reverse_args.append(arg.ctype() + " & adj_ret_" + str(i))
     elif return_type != "void":
         reverse_args.append(return_type + " & adj_ret")
-
-    # codegen body
-    forward_body = codegen_func_forward(adj, func_type="function", device=device)
-    reverse_body = codegen_func_reverse(adj, func_type="function", device=device)
+    # custom output reverse args (user-declared)
+    if adj.custom_reverse_mode:
+        for arg in adj.args[adj.custom_reverse_num_input_args :]:
+            reverse_args.append(f"{arg.ctype()} & {arg.emit()}")
 
     if device == "cpu":
-        template = cpu_function_template
+        forward_template = cpu_forward_function_template
+        reverse_template = cpu_reverse_function_template
     elif device == "cuda":
-        template = cuda_function_template
+        forward_template = cuda_forward_function_template
+        reverse_template = cuda_reverse_function_template
     else:
-        raise ValueError("Device {} is not supported".format(device))
+        raise ValueError(f"Device {device} is not supported")
 
-    s = template.format(
-        name=make_full_qualified_name(adj.func),
-        return_type=return_type,
+    # codegen body
+    forward_body = codegen_func_forward(adj, func_type="function", device=device)
+
+    s = ""
+    if not adj.skip_forward_codegen:
+        s += forward_template.format(
+            name=c_func_name,
+            return_type=return_type,
+            forward_args=indent(forward_args),
+            forward_body=forward_body,
+            filename=adj.filename,
+            lineno=adj.fun_lineno,
+        )
+
+    if not adj.skip_reverse_codegen:
+        if adj.custom_reverse_mode:
+            reverse_body = "\t// user-defined adjoint code\n" + forward_body
+        else:
+            if options.get("enable_backward", True):
+                reverse_body = codegen_func_reverse(adj, func_type="function", device=device)
+            else:
+                reverse_body = '\t// reverse mode disabled (module option "enable_backward" is False)\n'
+        s += reverse_template.format(
+            name=c_func_name,
+            return_type=return_type,
+            reverse_args=indent(reverse_args),
+            forward_body=forward_body,
+            reverse_body=reverse_body,
+            filename=adj.filename,
+            lineno=adj.fun_lineno,
+        )
+
+    return s
+
+
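codegen_func now consults a per-module options dict; with "enable_backward" off, the reverse template receives only a placeholder comment instead of a generated adjoint. Assuming Warp's standard module-option mechanism, turning it off for a module would look like:

    import warp as wp

    wp.init()

    # kernels defined in this module get no adjoint code
    wp.set_module_options({"enable_backward": False})

    @wp.kernel
    def scale(x: wp.array(dtype=float), s: float):
        i = wp.tid()
        x[i] = x[i] * s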
+def codegen_snippet(adj, name, snippet, adj_snippet):
+    forward_args = []
+    reverse_args = []
+
+    # forward args
+    for i, arg in enumerate(adj.args):
+        s = f"{arg.ctype()} {arg.emit().replace('var_', '')}"
+        forward_args.append(s)
+        reverse_args.append(s)
+
+    # reverse args
+    for i, arg in enumerate(adj.args):
+        if isinstance(arg.type, indexedarray):
+            _arg = Var(arg.label, array(dtype=arg.type.dtype, ndim=arg.type.ndim))
+            reverse_args.append(_arg.ctype() + " & adj_" + arg.label)
+        else:
+            reverse_args.append(arg.ctype() + " & adj_" + arg.label)
+
+    forward_template = cuda_forward_function_template
+    reverse_template = cuda_reverse_function_template
+
+    s = ""
+    s += forward_template.format(
+        name=name,
+        return_type="void",
         forward_args=indent(forward_args),
+        forward_body=snippet,
+        filename=adj.filename,
+        lineno=adj.fun_lineno,
+    )
+
+    if adj_snippet:
+        reverse_body = adj_snippet
+    else:
+        reverse_body = ""
+
+    s += reverse_template.format(
+        name=name,
+        return_type="void",
         reverse_args=indent(reverse_args),
-        forward_body=forward_body,
+        forward_body=snippet,
         reverse_body=reverse_body,
         filename=adj.filename,
         lineno=adj.fun_lineno,
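codegen_snippet forwards user-authored CUDA source verbatim as the forward and reverse bodies. Assuming this backs the @wp.func_native decorator added in this release, a snippet and its adjoint might be declared like so (a sketch, not verified against the shipped API):

    import warp as wp

    snippet = "y[tid] = a * x[tid] + y[tid];"
    adj_snippet = """adj_a += x[tid] * adj_y[tid];
    adj_x[tid] += a * adj_y[tid];"""

    @wp.func_native(snippet, adj_snippet)
    def saxpy(a: float,
              x: wp.array(dtype=float),
              y: wp.array(dtype=float),
              tid: int):
        ...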
@@ -2098,8 +2670,8 @@ def codegen_kernel(kernel, device, options):
 
     adj = kernel.adj
 
-    forward_args = ["launch_bounds_t dim"]
-    reverse_args = ["launch_bounds_t dim"]
+    forward_args = ["wp::launch_bounds_t dim"]
+    reverse_args = ["wp::launch_bounds_t dim"]
 
     # forward args
     for arg in adj.args:
@@ -2128,7 +2700,7 @@ def codegen_kernel(kernel, device, options):
     elif device == "cuda":
         template = cuda_kernel_template
     else:
-        raise ValueError("Device {} is not supported".format(device))
+        raise ValueError(f"Device {device} is not supported")
 
     s = template.format(
         name=kernel.get_mangled_name(),
@@ -2142,10 +2714,13 @@ def codegen_kernel(kernel, device, options):
 
 
 def codegen_module(kernel, device="cpu"):
+    if device != "cpu":
+        return ""
+
     adj = kernel.adj
 
     # build forward signature
-    forward_args = ["launch_bounds_t dim"]
+    forward_args = ["wp::launch_bounds_t dim"]
     forward_params = ["dim"]
 
     for arg in adj.args:
@@ -2175,14 +2750,7 @@ def codegen_module(kernel, device="cpu"):
         reverse_args.append(f"{arg.ctype()} adj_{arg.label}")
         reverse_params.append(f"adj_{arg.label}")
 
-    if device == "cpu":
-        template = cpu_module_template
-    elif device == "cuda":
-        template = cuda_module_template
-    else:
-        raise ValueError("Device {} is not supported".format(device))
-
-    s = template.format(
+    s = cpu_module_template.format(
         name=kernel.get_mangled_name(),
         forward_args=indent(forward_args),
         reverse_args=indent(reverse_args),
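With cuda_module_template deleted (top of this file's changes), codegen_module is now CPU-only; CUDA kernels presumably no longer need generated C entry points because the runtime launches them directly. A toy reduction of the new control flow (template text abbreviated, names hypothetical):

    CPU_MODULE_TEMPLATE = "WP_API void {name}_cpu_forward({forward_args});\n"

    def codegen_module_sketch(name, forward_args, device="cpu"):
        if device != "cpu":
            return ""  # no host-side entry points emitted for CUDA
        return CPU_MODULE_TEMPLATE.format(name=name, forward_args=", ".join(forward_args))

    assert codegen_module_sketch("k", ["wp::launch_bounds_t dim"], device="cuda") == ""
    assert "k_cpu_forward" in codegen_module_sketch("k", ["wp::launch_bounds_t dim"])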