warp-lang 1.9.1-py3-none-manylinux_2_34_aarch64.whl → 1.10.0-py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (346)
  1. warp/__init__.py +301 -287
  2. warp/__init__.pyi +882 -305
  3. warp/_src/__init__.py +14 -0
  4. warp/_src/autograd.py +1077 -0
  5. warp/_src/build.py +620 -0
  6. warp/_src/build_dll.py +642 -0
  7. warp/{builtins.py → _src/builtins.py} +1435 -379
  8. warp/_src/codegen.py +4361 -0
  9. warp/{config.py → _src/config.py} +178 -169
  10. warp/_src/constants.py +59 -0
  11. warp/_src/context.py +8352 -0
  12. warp/_src/dlpack.py +464 -0
  13. warp/_src/fabric.py +362 -0
  14. warp/_src/fem/__init__.py +14 -0
  15. warp/_src/fem/adaptivity.py +510 -0
  16. warp/_src/fem/cache.py +689 -0
  17. warp/_src/fem/dirichlet.py +190 -0
  18. warp/{fem → _src/fem}/domain.py +42 -30
  19. warp/_src/fem/field/__init__.py +131 -0
  20. warp/_src/fem/field/field.py +703 -0
  21. warp/{fem → _src/fem}/field/nodal_field.py +32 -15
  22. warp/{fem → _src/fem}/field/restriction.py +3 -1
  23. warp/{fem → _src/fem}/field/virtual.py +55 -27
  24. warp/_src/fem/geometry/__init__.py +32 -0
  25. warp/{fem → _src/fem}/geometry/adaptive_nanogrid.py +79 -163
  26. warp/_src/fem/geometry/closest_point.py +99 -0
  27. warp/{fem → _src/fem}/geometry/deformed_geometry.py +16 -22
  28. warp/{fem → _src/fem}/geometry/element.py +34 -10
  29. warp/{fem → _src/fem}/geometry/geometry.py +50 -20
  30. warp/{fem → _src/fem}/geometry/grid_2d.py +14 -23
  31. warp/{fem → _src/fem}/geometry/grid_3d.py +14 -23
  32. warp/{fem → _src/fem}/geometry/hexmesh.py +42 -63
  33. warp/{fem → _src/fem}/geometry/nanogrid.py +256 -247
  34. warp/{fem → _src/fem}/geometry/partition.py +123 -63
  35. warp/{fem → _src/fem}/geometry/quadmesh.py +28 -45
  36. warp/{fem → _src/fem}/geometry/tetmesh.py +42 -63
  37. warp/{fem → _src/fem}/geometry/trimesh.py +28 -45
  38. warp/{fem → _src/fem}/integrate.py +166 -158
  39. warp/_src/fem/linalg.py +385 -0
  40. warp/_src/fem/operator.py +398 -0
  41. warp/_src/fem/polynomial.py +231 -0
  42. warp/{fem → _src/fem}/quadrature/pic_quadrature.py +17 -20
  43. warp/{fem → _src/fem}/quadrature/quadrature.py +97 -47
  44. warp/_src/fem/space/__init__.py +248 -0
  45. warp/{fem → _src/fem}/space/basis_function_space.py +22 -11
  46. warp/_src/fem/space/basis_space.py +681 -0
  47. warp/{fem → _src/fem}/space/dof_mapper.py +5 -3
  48. warp/{fem → _src/fem}/space/function_space.py +16 -13
  49. warp/{fem → _src/fem}/space/grid_2d_function_space.py +6 -7
  50. warp/{fem → _src/fem}/space/grid_3d_function_space.py +6 -4
  51. warp/{fem → _src/fem}/space/hexmesh_function_space.py +6 -10
  52. warp/{fem → _src/fem}/space/nanogrid_function_space.py +5 -9
  53. warp/{fem → _src/fem}/space/partition.py +119 -60
  54. warp/{fem → _src/fem}/space/quadmesh_function_space.py +6 -10
  55. warp/{fem → _src/fem}/space/restriction.py +68 -33
  56. warp/_src/fem/space/shape/__init__.py +152 -0
  57. warp/{fem → _src/fem}/space/shape/cube_shape_function.py +11 -9
  58. warp/{fem → _src/fem}/space/shape/shape_function.py +10 -9
  59. warp/{fem → _src/fem}/space/shape/square_shape_function.py +8 -6
  60. warp/{fem → _src/fem}/space/shape/tet_shape_function.py +5 -3
  61. warp/{fem → _src/fem}/space/shape/triangle_shape_function.py +5 -3
  62. warp/{fem → _src/fem}/space/tetmesh_function_space.py +5 -9
  63. warp/_src/fem/space/topology.py +461 -0
  64. warp/{fem → _src/fem}/space/trimesh_function_space.py +5 -9
  65. warp/_src/fem/types.py +114 -0
  66. warp/_src/fem/utils.py +488 -0
  67. warp/_src/jax.py +188 -0
  68. warp/_src/jax_experimental/__init__.py +14 -0
  69. warp/_src/jax_experimental/custom_call.py +389 -0
  70. warp/_src/jax_experimental/ffi.py +1286 -0
  71. warp/_src/jax_experimental/xla_ffi.py +658 -0
  72. warp/_src/marching_cubes.py +710 -0
  73. warp/_src/math.py +416 -0
  74. warp/_src/optim/__init__.py +14 -0
  75. warp/_src/optim/adam.py +165 -0
  76. warp/_src/optim/linear.py +1608 -0
  77. warp/_src/optim/sgd.py +114 -0
  78. warp/_src/paddle.py +408 -0
  79. warp/_src/render/__init__.py +14 -0
  80. warp/_src/render/imgui_manager.py +291 -0
  81. warp/_src/render/render_opengl.py +3638 -0
  82. warp/_src/render/render_usd.py +939 -0
  83. warp/_src/render/utils.py +162 -0
  84. warp/_src/sparse.py +2718 -0
  85. warp/_src/tape.py +1208 -0
  86. warp/{thirdparty → _src/thirdparty}/unittest_parallel.py +9 -2
  87. warp/_src/torch.py +393 -0
  88. warp/_src/types.py +5888 -0
  89. warp/_src/utils.py +1695 -0
  90. warp/autograd.py +12 -1054
  91. warp/bin/warp-clang.so +0 -0
  92. warp/bin/warp.so +0 -0
  93. warp/build.py +8 -588
  94. warp/build_dll.py +6 -721
  95. warp/codegen.py +6 -4251
  96. warp/constants.py +6 -39
  97. warp/context.py +12 -8062
  98. warp/dlpack.py +6 -444
  99. warp/examples/distributed/example_jacobi_mpi.py +4 -5
  100. warp/examples/fem/example_adaptive_grid.py +1 -1
  101. warp/examples/fem/example_apic_fluid.py +1 -1
  102. warp/examples/fem/example_burgers.py +8 -8
  103. warp/examples/fem/example_diffusion.py +1 -1
  104. warp/examples/fem/example_distortion_energy.py +1 -1
  105. warp/examples/fem/example_mixed_elasticity.py +2 -2
  106. warp/examples/fem/example_navier_stokes.py +1 -1
  107. warp/examples/fem/example_nonconforming_contact.py +7 -7
  108. warp/examples/fem/example_stokes.py +1 -1
  109. warp/examples/fem/example_stokes_transfer.py +1 -1
  110. warp/examples/fem/utils.py +2 -2
  111. warp/examples/interop/example_jax_callable.py +1 -1
  112. warp/examples/interop/example_jax_ffi_callback.py +1 -1
  113. warp/examples/interop/example_jax_kernel.py +1 -1
  114. warp/examples/tile/example_tile_mcgp.py +191 -0
  115. warp/fabric.py +6 -337
  116. warp/fem/__init__.py +159 -97
  117. warp/fem/adaptivity.py +7 -489
  118. warp/fem/cache.py +9 -648
  119. warp/fem/dirichlet.py +6 -184
  120. warp/fem/field/__init__.py +8 -109
  121. warp/fem/field/field.py +7 -652
  122. warp/fem/geometry/__init__.py +7 -18
  123. warp/fem/geometry/closest_point.py +11 -77
  124. warp/fem/linalg.py +18 -366
  125. warp/fem/operator.py +11 -369
  126. warp/fem/polynomial.py +9 -209
  127. warp/fem/space/__init__.py +5 -211
  128. warp/fem/space/basis_space.py +6 -662
  129. warp/fem/space/shape/__init__.py +41 -118
  130. warp/fem/space/topology.py +6 -437
  131. warp/fem/types.py +6 -81
  132. warp/fem/utils.py +11 -444
  133. warp/jax.py +8 -165
  134. warp/jax_experimental/__init__.py +14 -1
  135. warp/jax_experimental/custom_call.py +8 -365
  136. warp/jax_experimental/ffi.py +17 -873
  137. warp/jax_experimental/xla_ffi.py +5 -605
  138. warp/marching_cubes.py +5 -689
  139. warp/math.py +16 -393
  140. warp/native/array.h +385 -37
  141. warp/native/builtin.h +314 -37
  142. warp/native/bvh.cpp +43 -9
  143. warp/native/bvh.cu +62 -27
  144. warp/native/bvh.h +310 -309
  145. warp/native/clang/clang.cpp +102 -97
  146. warp/native/coloring.cpp +0 -1
  147. warp/native/crt.h +208 -0
  148. warp/native/exports.h +156 -0
  149. warp/native/hashgrid.cu +2 -0
  150. warp/native/intersect.h +24 -1
  151. warp/native/intersect_tri.h +44 -35
  152. warp/native/mat.h +1456 -276
  153. warp/native/mesh.cpp +4 -4
  154. warp/native/mesh.cu +4 -2
  155. warp/native/mesh.h +176 -61
  156. warp/native/quat.h +0 -52
  157. warp/native/scan.cu +2 -0
  158. warp/native/sparse.cu +7 -3
  159. warp/native/spatial.h +12 -0
  160. warp/native/tile.h +681 -89
  161. warp/native/tile_radix_sort.h +3 -3
  162. warp/native/tile_reduce.h +394 -46
  163. warp/native/tile_scan.h +4 -4
  164. warp/native/vec.h +469 -0
  165. warp/native/version.h +23 -0
  166. warp/native/volume.cpp +1 -1
  167. warp/native/volume.cu +1 -0
  168. warp/native/volume.h +1 -1
  169. warp/native/volume_builder.cu +2 -0
  170. warp/native/warp.cpp +57 -29
  171. warp/native/warp.cu +521 -250
  172. warp/native/warp.h +11 -8
  173. warp/optim/__init__.py +6 -3
  174. warp/optim/adam.py +6 -145
  175. warp/optim/linear.py +14 -1585
  176. warp/optim/sgd.py +6 -94
  177. warp/paddle.py +6 -388
  178. warp/render/__init__.py +8 -4
  179. warp/render/imgui_manager.py +7 -267
  180. warp/render/render_opengl.py +6 -3618
  181. warp/render/render_usd.py +6 -919
  182. warp/render/utils.py +6 -142
  183. warp/sparse.py +37 -2563
  184. warp/tape.py +6 -1188
  185. warp/tests/__main__.py +1 -1
  186. warp/tests/cuda/test_async.py +4 -4
  187. warp/tests/cuda/test_conditional_captures.py +1 -1
  188. warp/tests/cuda/test_multigpu.py +1 -1
  189. warp/tests/cuda/test_streams.py +58 -1
  190. warp/tests/geometry/test_bvh.py +157 -22
  191. warp/tests/geometry/test_marching_cubes.py +0 -1
  192. warp/tests/geometry/test_mesh.py +5 -3
  193. warp/tests/geometry/test_mesh_query_aabb.py +5 -12
  194. warp/tests/geometry/test_mesh_query_point.py +5 -2
  195. warp/tests/geometry/test_mesh_query_ray.py +15 -3
  196. warp/tests/geometry/test_volume_write.py +5 -5
  197. warp/tests/interop/test_dlpack.py +18 -17
  198. warp/tests/interop/test_jax.py +772 -49
  199. warp/tests/interop/test_paddle.py +1 -1
  200. warp/tests/test_adam.py +0 -1
  201. warp/tests/test_arithmetic.py +9 -9
  202. warp/tests/test_array.py +578 -100
  203. warp/tests/test_array_reduce.py +3 -3
  204. warp/tests/test_atomic.py +12 -8
  205. warp/tests/test_atomic_bitwise.py +209 -0
  206. warp/tests/test_atomic_cas.py +4 -4
  207. warp/tests/test_bool.py +2 -2
  208. warp/tests/test_builtins_resolution.py +5 -571
  209. warp/tests/test_codegen.py +33 -14
  210. warp/tests/test_conditional.py +1 -1
  211. warp/tests/test_context.py +6 -6
  212. warp/tests/test_copy.py +242 -161
  213. warp/tests/test_ctypes.py +3 -3
  214. warp/tests/test_devices.py +24 -2
  215. warp/tests/test_examples.py +16 -84
  216. warp/tests/test_fabricarray.py +35 -35
  217. warp/tests/test_fast_math.py +0 -2
  218. warp/tests/test_fem.py +56 -10
  219. warp/tests/test_fixedarray.py +3 -3
  220. warp/tests/test_func.py +8 -5
  221. warp/tests/test_generics.py +1 -1
  222. warp/tests/test_indexedarray.py +24 -24
  223. warp/tests/test_intersect.py +39 -9
  224. warp/tests/test_large.py +1 -1
  225. warp/tests/test_lerp.py +3 -1
  226. warp/tests/test_linear_solvers.py +1 -1
  227. warp/tests/test_map.py +35 -4
  228. warp/tests/test_mat.py +52 -62
  229. warp/tests/test_mat_constructors.py +4 -5
  230. warp/tests/test_mat_lite.py +1 -1
  231. warp/tests/test_mat_scalar_ops.py +121 -121
  232. warp/tests/test_math.py +34 -0
  233. warp/tests/test_module_aot.py +4 -4
  234. warp/tests/test_modules_lite.py +28 -2
  235. warp/tests/test_print.py +11 -11
  236. warp/tests/test_quat.py +93 -58
  237. warp/tests/test_runlength_encode.py +1 -1
  238. warp/tests/test_scalar_ops.py +38 -10
  239. warp/tests/test_smoothstep.py +1 -1
  240. warp/tests/test_sparse.py +126 -15
  241. warp/tests/test_spatial.py +105 -87
  242. warp/tests/test_special_values.py +6 -6
  243. warp/tests/test_static.py +7 -7
  244. warp/tests/test_struct.py +13 -2
  245. warp/tests/test_triangle_closest_point.py +48 -1
  246. warp/tests/test_types.py +27 -15
  247. warp/tests/test_utils.py +52 -52
  248. warp/tests/test_vec.py +29 -29
  249. warp/tests/test_vec_constructors.py +5 -5
  250. warp/tests/test_vec_scalar_ops.py +97 -97
  251. warp/tests/test_version.py +75 -0
  252. warp/tests/tile/test_tile.py +178 -0
  253. warp/tests/tile/test_tile_atomic_bitwise.py +403 -0
  254. warp/tests/tile/test_tile_cholesky.py +7 -4
  255. warp/tests/tile/test_tile_load.py +26 -2
  256. warp/tests/tile/test_tile_mathdx.py +3 -3
  257. warp/tests/tile/test_tile_matmul.py +1 -1
  258. warp/tests/tile/test_tile_mlp.py +2 -4
  259. warp/tests/tile/test_tile_reduce.py +214 -13
  260. warp/tests/unittest_suites.py +6 -14
  261. warp/tests/unittest_utils.py +10 -9
  262. warp/tests/walkthrough_debug.py +3 -1
  263. warp/torch.py +6 -373
  264. warp/types.py +29 -5764
  265. warp/utils.py +10 -1659
  266. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0.dist-info}/METADATA +46 -99
  267. warp_lang-1.10.0.dist-info/RECORD +468 -0
  268. warp_lang-1.10.0.dist-info/licenses/licenses/Gaia-LICENSE.txt +6 -0
  269. warp_lang-1.10.0.dist-info/licenses/licenses/appdirs-LICENSE.txt +22 -0
  270. warp_lang-1.10.0.dist-info/licenses/licenses/asset_pixel_jpg-LICENSE.txt +3 -0
  271. warp_lang-1.10.0.dist-info/licenses/licenses/cuda-LICENSE.txt +1582 -0
  272. warp_lang-1.10.0.dist-info/licenses/licenses/dlpack-LICENSE.txt +201 -0
  273. warp_lang-1.10.0.dist-info/licenses/licenses/fp16-LICENSE.txt +28 -0
  274. warp_lang-1.10.0.dist-info/licenses/licenses/libmathdx-LICENSE.txt +220 -0
  275. warp_lang-1.10.0.dist-info/licenses/licenses/llvm-LICENSE.txt +279 -0
  276. warp_lang-1.10.0.dist-info/licenses/licenses/moller-LICENSE.txt +16 -0
  277. warp_lang-1.10.0.dist-info/licenses/licenses/nanovdb-LICENSE.txt +2 -0
  278. warp_lang-1.10.0.dist-info/licenses/licenses/nvrtc-LICENSE.txt +1592 -0
  279. warp_lang-1.10.0.dist-info/licenses/licenses/svd-LICENSE.txt +23 -0
  280. warp_lang-1.10.0.dist-info/licenses/licenses/unittest_parallel-LICENSE.txt +21 -0
  281. warp_lang-1.10.0.dist-info/licenses/licenses/usd-LICENSE.txt +213 -0
  282. warp_lang-1.10.0.dist-info/licenses/licenses/windingnumber-LICENSE.txt +21 -0
  283. warp/examples/assets/cartpole.urdf +0 -110
  284. warp/examples/assets/crazyflie.usd +0 -0
  285. warp/examples/assets/nv_ant.xml +0 -92
  286. warp/examples/assets/nv_humanoid.xml +0 -183
  287. warp/examples/assets/quadruped.urdf +0 -268
  288. warp/examples/optim/example_bounce.py +0 -266
  289. warp/examples/optim/example_cloth_throw.py +0 -228
  290. warp/examples/optim/example_drone.py +0 -870
  291. warp/examples/optim/example_inverse_kinematics.py +0 -182
  292. warp/examples/optim/example_inverse_kinematics_torch.py +0 -191
  293. warp/examples/optim/example_softbody_properties.py +0 -400
  294. warp/examples/optim/example_spring_cage.py +0 -245
  295. warp/examples/optim/example_trajectory.py +0 -227
  296. warp/examples/sim/example_cartpole.py +0 -143
  297. warp/examples/sim/example_cloth.py +0 -225
  298. warp/examples/sim/example_cloth_self_contact.py +0 -316
  299. warp/examples/sim/example_granular.py +0 -130
  300. warp/examples/sim/example_granular_collision_sdf.py +0 -202
  301. warp/examples/sim/example_jacobian_ik.py +0 -244
  302. warp/examples/sim/example_particle_chain.py +0 -124
  303. warp/examples/sim/example_quadruped.py +0 -203
  304. warp/examples/sim/example_rigid_chain.py +0 -203
  305. warp/examples/sim/example_rigid_contact.py +0 -195
  306. warp/examples/sim/example_rigid_force.py +0 -133
  307. warp/examples/sim/example_rigid_gyroscopic.py +0 -115
  308. warp/examples/sim/example_rigid_soft_contact.py +0 -140
  309. warp/examples/sim/example_soft_body.py +0 -196
  310. warp/examples/tile/example_tile_walker.py +0 -327
  311. warp/sim/__init__.py +0 -74
  312. warp/sim/articulation.py +0 -793
  313. warp/sim/collide.py +0 -2570
  314. warp/sim/graph_coloring.py +0 -307
  315. warp/sim/import_mjcf.py +0 -791
  316. warp/sim/import_snu.py +0 -227
  317. warp/sim/import_urdf.py +0 -579
  318. warp/sim/import_usd.py +0 -898
  319. warp/sim/inertia.py +0 -357
  320. warp/sim/integrator.py +0 -245
  321. warp/sim/integrator_euler.py +0 -2000
  322. warp/sim/integrator_featherstone.py +0 -2101
  323. warp/sim/integrator_vbd.py +0 -2487
  324. warp/sim/integrator_xpbd.py +0 -3295
  325. warp/sim/model.py +0 -4821
  326. warp/sim/particles.py +0 -121
  327. warp/sim/render.py +0 -431
  328. warp/sim/utils.py +0 -431
  329. warp/tests/sim/disabled_kinematics.py +0 -244
  330. warp/tests/sim/test_cloth.py +0 -863
  331. warp/tests/sim/test_collision.py +0 -743
  332. warp/tests/sim/test_coloring.py +0 -347
  333. warp/tests/sim/test_inertia.py +0 -161
  334. warp/tests/sim/test_model.py +0 -226
  335. warp/tests/sim/test_sim_grad.py +0 -287
  336. warp/tests/sim/test_sim_grad_bounce_linear.py +0 -212
  337. warp/tests/sim/test_sim_kinematics.py +0 -98
  338. warp/thirdparty/__init__.py +0 -0
  339. warp_lang-1.9.1.dist-info/RECORD +0 -456
  340. /warp/{fem → _src/fem}/quadrature/__init__.py +0 -0
  341. /warp/{tests/sim → _src/thirdparty}/__init__.py +0 -0
  342. /warp/{thirdparty → _src/thirdparty}/appdirs.py +0 -0
  343. /warp/{thirdparty → _src/thirdparty}/dlpack.py +0 -0
  344. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0.dist-info}/WHEEL +0 -0
  345. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0.dist-info}/licenses/LICENSE.md +0 -0
  346. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0.dist-info}/top_level.txt +0 -0
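The dominant change in this release is the relocation of the implementation into the private warp/_src/ package: each public module (e.g. warp/codegen.py +6 -4251) is reduced to a thin wrapper over its private counterpart (e.g. warp/_src/codegen.py +4361 -0). A minimal sketch of such a wrapper layout, in Python, is shown below; the actual shim contents of warp-lang 1.10.0 are not included in this diff, so the re-export line is an assumption:

    # Hypothetical contents of a public shim module such as warp/codegen.py
    # after the 1.10.0 restructuring (illustrative assumption, not taken
    # from the wheel): re-export the private implementation under the
    # long-standing public name.
    from warp._src.codegen import *  # noqa: F401,F403

If the shims follow this pattern, existing imports such as "from warp.codegen import Adjoint" keep resolving, while the module bodies now live under warp/_src/.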
warp/_src/codegen.py ADDED
@@ -0,0 +1,4361 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from __future__ import annotations
17
+
18
+ import ast
19
+ import builtins
20
+ import ctypes
21
+ import enum
22
+ import functools
23
+ import hashlib
24
+ import inspect
25
+ import itertools
26
+ import math
27
+ import re
28
+ import sys
29
+ import textwrap
30
+ import types
31
+ from typing import Any, Callable, ClassVar, Mapping, Sequence, get_args, get_origin
32
+
33
+ import warp._src.config
34
+ from warp._src.types import *
35
+
36
+ _wp_module_name_ = "warp.codegen"
37
+
38
+ # used as a globally accessible copy
39
+ # of current compile options (block_dim) etc
40
+ options = {}
41
+
42
+
43
+ class WarpCodegenError(RuntimeError):
44
+ def __init__(self, message):
45
+ super().__init__(message)
46
+
47
+
48
+ class WarpCodegenTypeError(TypeError):
49
+ def __init__(self, message):
50
+ super().__init__(message)
51
+
52
+
53
+ class WarpCodegenAttributeError(AttributeError):
54
+ def __init__(self, message):
55
+ super().__init__(message)
56
+
57
+
58
+ def get_node_name_safe(node):
59
+ """Safely get a string representation of an AST node for error messages.
60
+
61
+ This handles different AST node types (Name, Subscript, etc.) without
62
+ raising AttributeError when accessing attributes that may not exist.
63
+ """
64
+ if hasattr(node, "id"):
65
+ return node.id
66
+ elif hasattr(node, "value") and hasattr(node, "slice"):
67
+ # Subscript node like inputs[tid]
68
+ base_name = get_node_name_safe(node.value)
69
+ return f"{base_name}[...]"
70
+ else:
71
+ return f"<{type(node).__name__}>"
72
+
73
+
74
+ class WarpCodegenKeyError(KeyError):
75
+ def __init__(self, message):
76
+ super().__init__(message)
77
+
78
+
79
+ # map operator to function name
80
+ builtin_operators: dict[type[ast.AST], str] = {}
81
+
82
+ # see https://www.ics.uci.edu/~pattis/ICS-31/lectures/opexp.pdf for a
83
+ # nice overview of python operators
84
+
85
+ builtin_operators[ast.Add] = "add"
86
+ builtin_operators[ast.Sub] = "sub"
87
+ builtin_operators[ast.Mult] = "mul"
88
+ builtin_operators[ast.MatMult] = "mul"
89
+ builtin_operators[ast.Div] = "div"
90
+ builtin_operators[ast.FloorDiv] = "floordiv"
91
+ builtin_operators[ast.Pow] = "pow"
92
+ builtin_operators[ast.Mod] = "mod"
93
+ builtin_operators[ast.UAdd] = "pos"
94
+ builtin_operators[ast.USub] = "neg"
95
+ builtin_operators[ast.Not] = "unot"
96
+
97
+ builtin_operators[ast.Gt] = ">"
98
+ builtin_operators[ast.Lt] = "<"
99
+ builtin_operators[ast.GtE] = ">="
100
+ builtin_operators[ast.LtE] = "<="
101
+ builtin_operators[ast.Eq] = "=="
102
+ builtin_operators[ast.NotEq] = "!="
103
+
104
+ builtin_operators[ast.BitAnd] = "bit_and"
105
+ builtin_operators[ast.BitOr] = "bit_or"
106
+ builtin_operators[ast.BitXor] = "bit_xor"
107
+ builtin_operators[ast.Invert] = "invert"
108
+ builtin_operators[ast.LShift] = "lshift"
109
+ builtin_operators[ast.RShift] = "rshift"
110
+
111
+ comparison_chain_strings = [
112
+ builtin_operators[ast.Gt],
113
+ builtin_operators[ast.Lt],
114
+ builtin_operators[ast.LtE],
115
+ builtin_operators[ast.GtE],
116
+ builtin_operators[ast.Eq],
117
+ builtin_operators[ast.NotEq],
118
+ ]
119
+
120
+
121
+ def values_check_equal(a, b):
122
+ if isinstance(a, Sequence) and isinstance(b, Sequence):
123
+ if len(a) != len(b):
124
+ return False
125
+
126
+ return all(x == y for x, y in zip(a, b))
127
+
128
+ return a == b
129
+
130
+
131
+ def op_str_is_chainable(op: str) -> builtins.bool:
132
+ return op in comparison_chain_strings
133
+
134
+
135
+ def get_closure_cell_contents(obj):
136
+ """Retrieve a closure's cell contents or `None` if it's empty."""
137
+ try:
138
+ return obj.cell_contents
139
+ except ValueError:
140
+ pass
141
+
142
+ return None
143
+
144
+
145
+ def eval_annotations(annotations: Mapping[str, Any], obj: Any) -> Mapping[str, Any]:
146
+ """Un-stringize annotations caused by `from __future__ import annotations` of PEP 563."""
147
+ # Implementation backported from `inspect.get_annotations()` for Python 3.9 and older.
148
+ if not annotations:
149
+ return {}
150
+
151
+ if not any(isinstance(x, str) for x in annotations.values()):
152
+ # No annotation to un-stringize.
153
+ return annotations
154
+
155
+ if isinstance(obj, type):
156
+ # class
157
+ globals = {}
158
+ module_name = getattr(obj, "__module__", None)
159
+ if module_name:
160
+ module = sys.modules.get(module_name, None)
161
+ if module:
162
+ globals = getattr(module, "__dict__", {})
163
+ locals = dict(vars(obj))
164
+ unwrap = obj
165
+ elif isinstance(obj, types.ModuleType):
166
+ # module
167
+ globals = obj.__dict__
168
+ locals = {}
169
+ unwrap = None
170
+ elif callable(obj):
171
+ # function
172
+ globals = getattr(obj, "__globals__", {})
173
+ # Capture the variables from the surrounding scope.
174
+ closure_vars = zip(
175
+ obj.__code__.co_freevars, tuple(get_closure_cell_contents(x) for x in (obj.__closure__ or ()))
176
+ )
177
+ locals = {k: v for k, v in closure_vars if v is not None}
178
+ unwrap = obj
179
+ else:
180
+ raise TypeError(f"{obj!r} is not a module, class, or callable.")
181
+
182
+ if unwrap is not None:
183
+ while True:
184
+ if hasattr(unwrap, "__wrapped__"):
185
+ unwrap = unwrap.__wrapped__
186
+ continue
187
+ if isinstance(unwrap, functools.partial):
188
+ unwrap = unwrap.func
189
+ continue
190
+ break
191
+ if hasattr(unwrap, "__globals__"):
192
+ globals = unwrap.__globals__
193
+
194
+ # "Inject" type parameters into the local namespace
195
+ # (unless they are shadowed by assignments *in* the local namespace),
196
+ # as a way of emulating annotation scopes when calling `eval()`
197
+ type_params = getattr(obj, "__type_params__", ())
198
+ if type_params:
199
+ locals = {param.__name__: param for param in type_params} | locals
200
+
201
+ return {k: v if not isinstance(v, str) else eval(v, globals, locals) for k, v in annotations.items()}
202
+
203
+
204
+ def get_annotations(obj: Any) -> Mapping[str, Any]:
205
+ """Same as `inspect.get_annotations()` but always returning un-stringized annotations."""
206
+ # Python 3.10+: Use the built-in inspect.get_annotations() which handles
207
+ # PEP 649 (deferred annotation evaluation) in Python 3.14+
208
+ if hasattr(inspect, "get_annotations"):
209
+ # eval_str=True ensures stringized annotations from PEP 563 are evaluated
210
+ return inspect.get_annotations(obj, eval_str=True)
211
+ else:
212
+ # Python 3.9 and older: Manual backport of inspect.get_annotations()
213
+ # See https://docs.python.org/3/howto/annotations.html#accessing-the-annotations-dict-of-an-object-in-python-3-9-and-older
214
+ if isinstance(obj, type):
215
+ annotations = obj.__dict__.get("__annotations__", {})
216
+ else:
217
+ annotations = getattr(obj, "__annotations__", {})
218
+
219
+ return eval_annotations(annotations, obj)
220
+
221
+
222
+ def get_full_arg_spec(func: Callable) -> inspect.FullArgSpec:
223
+ """Same as `inspect.getfullargspec()` but always returning un-stringized annotations."""
224
+ spec = inspect.getfullargspec(func)
225
+
226
+ # Python 3.10+: Use inspect.get_annotations()
227
+ if hasattr(inspect, "get_annotations"):
228
+ # Capture closure variables to handle cases like `foo.Data` where `foo` is a closure variable
229
+ closure_vars = dict(
230
+ zip(func.__code__.co_freevars, (get_closure_cell_contents(x) for x in (func.__closure__ or ())))
231
+ )
232
+ # Filter out None values from empty cells
233
+ closure_vars = {k: v for k, v in closure_vars.items() if v is not None}
234
+ return spec._replace(annotations=inspect.get_annotations(func, eval_str=True, locals=closure_vars))
235
+ else:
236
+ # Python 3.9 and older: Manually un-stringize annotations
237
+ # See https://docs.python.org/3/howto/annotations.html#manually-un-stringizing-stringized-annotations
238
+ return spec._replace(annotations=eval_annotations(spec.annotations, func))
239
+
240
+
241
+ def struct_instance_repr_recursive(inst: StructInstance, depth: int, use_repr: bool) -> str:
242
+ indent = "\t"
243
+
244
+ # handle empty structs
245
+ if len(inst._cls.vars) == 0:
246
+ return f"{inst._cls.key}()"
247
+
248
+ lines = []
249
+ lines.append(f"{inst._cls.key}(")
250
+
251
+ for field_name, _ in inst._cls.ctype._fields_:
252
+ field_value = getattr(inst, field_name, None)
253
+
254
+ if isinstance(field_value, StructInstance):
255
+ field_value = struct_instance_repr_recursive(field_value, depth + 1, use_repr)
256
+
257
+ if use_repr:
258
+ lines.append(f"{indent * (depth + 1)}{field_name}={field_value!r},")
259
+ else:
260
+ lines.append(f"{indent * (depth + 1)}{field_name}={field_value!s},")
261
+
262
+ lines.append(f"{indent * depth})")
263
+ return "\n".join(lines)
264
+
265
+
266
+ class StructInstance:
267
+ def __init__(self, ctype):
268
+ # maintain a c-types object for the top-level instance the struct
269
+ super().__setattr__("_ctype", ctype)
270
+
271
+ # create Python attributes for each of the struct's variables
272
+ for k, cst in type(self)._constructors:
273
+ self.__dict__[k] = cst(ctype)
274
+
275
+ def __setattr__(self, name, value):
276
+ try:
277
+ self._setters[name](self, value)
278
+ except KeyError as err:
279
+ raise RuntimeError(f"Trying to set Warp struct attribute that does not exist {name}") from err
280
+
281
+ def __ctype__(self):
282
+ return self._ctype
283
+
284
+ def __repr__(self):
285
+ return struct_instance_repr_recursive(self, 0, use_repr=True)
286
+
287
+ def __str__(self):
288
+ return struct_instance_repr_recursive(self, 0, use_repr=False)
289
+
290
+ def assign(self, value):
291
+ """Assigns the values of another struct instance to this one."""
292
+ if not isinstance(value, StructInstance):
293
+ raise RuntimeError(
294
+ f"Trying to assign a non-structure value to a struct attribute with type: {self._cls.key}"
295
+ )
296
+
297
+ if self._cls.key is not value._cls.key:
298
+ raise RuntimeError(
299
+ f"Trying to assign a structure of type {value._cls.key} to an attribute of {self._cls.key}"
300
+ )
301
+
302
+ # update all nested ctype vars by deep copy
303
+ for n in self._cls.vars:
304
+ setattr(self, n, getattr(value, n))
305
+
306
+ def to(self, device):
307
+ """Copies this struct with all array members moved onto the given device.
308
+
309
+ Arrays already living on the desired device are referenced as-is, while
310
+ arrays being moved are copied.
311
+ """
312
+ out = self._cls()
313
+ stack = [(self, out, k, v) for k, v in self._cls.vars.items()]
314
+ while stack:
315
+ src, dst, name, var = stack.pop()
316
+ value = getattr(src, name)
317
+ if isinstance(var.type, array):
318
+ # array_t
319
+ setattr(dst, name, value.to(device))
320
+ elif isinstance(var.type, Struct):
321
+ # nested struct
322
+ new_struct = var.type()
323
+ setattr(dst, name, new_struct)
324
+ # The call to `setattr()` just above makes a copy of `new_struct`
325
+ # so we need to reference that new instance of the struct.
326
+ new_struct = getattr(dst, name)
327
+ stack.extend((value, new_struct, k, v) for k, v in var.type.vars.items())
328
+ else:
329
+ setattr(dst, name, value)
330
+
331
+ return out
332
+
333
+ # type description used in numpy structured arrays
334
+ def numpy_dtype(self):
335
+ return self._cls.numpy_dtype()
336
+
337
+ # value usable in numpy structured arrays of .numpy_dtype(), e.g. (42, 13.37, [1.0, 2.0, 3.0])
338
+ def numpy_value(self):
339
+ npvalue = []
340
+ for name, var in self._cls.vars.items():
341
+ # get the attribute value
342
+ value = getattr(self._ctype, name)
343
+
344
+ if isinstance(var.type, array):
345
+ # array_t
346
+ npvalue.append(value.numpy_value())
347
+ elif isinstance(var.type, Struct):
348
+ # nested struct
349
+ npvalue.append(value.numpy_value())
350
+ elif issubclass(var.type, ctypes.Array):
351
+ if len(var.type._shape_) == 1:
352
+ # vector
353
+ npvalue.append(list(value))
354
+ else:
355
+ # matrix
356
+ npvalue.append([list(row) for row in value])
357
+ else:
358
+ # scalar
359
+ if var.type == warp.float16:
360
+ npvalue.append(half_bits_to_float(value))
361
+ else:
362
+ npvalue.append(value)
363
+
364
+ return tuple(npvalue)
365
+
366
+
367
+ def _make_struct_field_constructor(field: str, var_type: type):
368
+ if isinstance(var_type, Struct):
369
+ return lambda ctype: var_type.instance_type(ctype=getattr(ctype, field))
370
+ elif isinstance(var_type, warp._src.types.array):
371
+ return lambda ctype: None
372
+ elif issubclass(var_type, ctypes.Array):
373
+ # for vector/matrices, the Python attribute aliases the ctype one
374
+ return lambda ctype: getattr(ctype, field)
375
+ else:
376
+ return lambda ctype: var_type()
377
+
378
+
379
+ def _make_struct_field_setter(cls, field: str, var_type: type):
380
+ def set_array_value(inst, value):
381
+ if value is None:
382
+ # create array with null pointer
383
+ setattr(inst._ctype, field, array_t())
384
+ else:
385
+ # wp.array
386
+ assert isinstance(value, array)
387
+ assert types_equal(value.dtype, var_type.dtype), (
388
+ f"assign to struct member variable {field} failed, expected type {type_repr(var_type.dtype)}, got type {type_repr(value.dtype)}"
389
+ )
390
+ setattr(inst._ctype, field, value.__ctype__())
391
+
392
+ # workaround to prevent gradient buffers being garbage collected
393
+ # since users can do struct.array.requires_grad = False the gradient array
394
+ # would be collected while the struct ctype still holds a reference to it
395
+ if value.requires_grad:
396
+ cls.__setattr__(inst, "_" + field + "_grad", value.grad)
397
+
398
+ cls.__setattr__(inst, field, value)
399
+
400
+ def set_struct_value(inst, value):
401
+ getattr(inst, field).assign(value)
402
+
403
+ def set_vector_value(inst, value):
404
+ # vector/matrix type, e.g. vec3
405
+ if value is None:
406
+ setattr(inst._ctype, field, var_type())
407
+ elif type(value) is var_type:
408
+ setattr(inst._ctype, field, value)
409
+ else:
410
+ # conversion from list/tuple, ndarray, etc.
411
+ setattr(inst._ctype, field, var_type(value))
412
+
413
+ # no need to update the Python attribute,
414
+ # it's already aliasing the ctype one
415
+
416
+ def set_primitive_value(inst, value):
417
+ # primitive type
418
+ if value is None:
419
+ # zero initialize
420
+ setattr(inst._ctype, field, var_type._type_())
421
+ else:
422
+ if hasattr(value, "_type_"):
423
+ # assigning warp type value (e.g.: wp.float32)
424
+ value = value.value
425
+ # float16 needs conversion to uint16 bits
426
+ if var_type == warp.float16:
427
+ setattr(inst._ctype, field, float_to_half_bits(value))
428
+ else:
429
+ setattr(inst._ctype, field, value)
430
+
431
+ cls.__setattr__(inst, field, value)
432
+
433
+ if isinstance(var_type, array):
434
+ return set_array_value
435
+ elif isinstance(var_type, Struct):
436
+ return set_struct_value
437
+ elif issubclass(var_type, ctypes.Array):
438
+ return set_vector_value
439
+ else:
440
+ return set_primitive_value
441
+
442
+
443
+ class Struct:
444
+ hash: bytes
445
+
446
+ def __init__(self, key: str, cls: type, module: warp._src.context.Module):
447
+ self.key = key
448
+ self.cls = cls
449
+ self.module = module
450
+ self.vars: dict[str, Var] = {}
451
+
452
+ if isinstance(self.cls, Sequence):
453
+ raise RuntimeError("Warp structs must be defined as base classes")
454
+
455
+ annotations = get_annotations(self.cls)
456
+ for label, type_ in annotations.items():
457
+ self.vars[label] = Var(label, type_)
458
+
459
+ fields = []
460
+ for label, var in self.vars.items():
461
+ if isinstance(var.type, array):
462
+ fields.append((label, array_t))
463
+ elif isinstance(var.type, Struct):
464
+ fields.append((label, var.type.ctype))
465
+ elif issubclass(var.type, ctypes.Array):
466
+ fields.append((label, var.type))
467
+ else:
468
+ # HACK: fp16 requires conversion functions from warp.so
469
+ if var.type is warp.float16:
470
+ warp.init()
471
+ fields.append((label, var.type._type_))
472
+
473
+ class StructType(ctypes.Structure):
474
+ # if struct is empty, add a dummy field to avoid launch errors on CPU device ("ffi_prep_cif failed")
475
+ _fields_ = fields or [("_dummy_", ctypes.c_byte)]
476
+
477
+ self.ctype = StructType
478
+
479
+ # Compute the hash. We can cache the hash because it's static, even with nested structs.
480
+ # All field types are specified in the annotations, so they're resolved at declaration time.
481
+ ch = hashlib.sha256()
482
+
483
+ ch.update(bytes(self.key, "utf-8"))
484
+
485
+ for name, type_hint in annotations.items():
486
+ s = f"{name}:{warp._src.types.get_type_code(type_hint)}"
487
+ ch.update(bytes(s, "utf-8"))
488
+
489
+ # recurse on nested structs
490
+ if isinstance(type_hint, Struct):
491
+ ch.update(type_hint.hash)
492
+
493
+ self.hash = ch.digest()
494
+
495
+ # generate unique identifier for structs in native code
496
+ hash_suffix = f"{self.hash.hex()[:8]}"
497
+ self.native_name = f"{self.key}_{hash_suffix}"
498
+
499
+ # create default constructor (zero-initialize)
500
+ self.default_constructor = warp._src.context.Function(
501
+ func=None,
502
+ key=self.native_name,
503
+ namespace="",
504
+ value_func=lambda *_: self,
505
+ input_types={},
506
+ initializer_list_func=lambda *_: False,
507
+ native_func=self.native_name,
508
+ )
509
+
510
+ # build a constructor that takes each param as a value
511
+ input_types = {label: var.type for label, var in self.vars.items()}
512
+
513
+ self.value_constructor = warp._src.context.Function(
514
+ func=None,
515
+ key=self.native_name,
516
+ namespace="",
517
+ value_func=lambda *_: self,
518
+ input_types=input_types,
519
+ initializer_list_func=lambda *_: False,
520
+ native_func=self.native_name,
521
+ )
522
+
523
+ self.default_constructor.add_overload(self.value_constructor)
524
+
525
+ if isinstance(module, warp._src.context.Module):
526
+ module.register_struct(self)
527
+
528
+ # Define class for instances of this struct
529
+ # To enable autocomplete on s, we inherit from self.cls.
530
+ # For example,
531
+
532
+ # @wp.struct
533
+ # class A:
534
+ # # annotations
535
+ # ...
536
+
537
+ # The type annotations are inherited in A(), allowing autocomplete in kernels
538
+ class NewStructInstance(self.cls, StructInstance):
539
+ cls: ClassVar[type] = self.cls
540
+ native_name: ClassVar[str] = self.native_name
541
+
542
+ _cls: ClassVar[type] = self
543
+ _constructors: ClassVar[list[tuple[str, Callable]]] = [
544
+ (field, _make_struct_field_constructor(field, var.type)) for field, var in self.vars.items()
545
+ ]
546
+ _setters: ClassVar[dict[str, Callable]] = {
547
+ field: _make_struct_field_setter(self.cls, field, var.type) for field, var in self.vars.items()
548
+ }
549
+
550
+ def __init__(inst, ctype=None):
551
+ StructInstance.__init__(inst, ctype or self.ctype())
552
+
553
+ self.instance_type = NewStructInstance
554
+
555
+ def __call__(self):
556
+ """
557
+ This function returns s = StructInstance(self)
558
+ s uses self.cls as template.
559
+ """
560
+ return self.instance_type()
561
+
562
+ def initializer(self):
563
+ return self.default_constructor
564
+
565
+ # return structured NumPy dtype, including field names, formats, and offsets
566
+ def numpy_dtype(self):
567
+ names = []
568
+ formats = []
569
+ offsets = []
570
+ for name, var in self.vars.items():
571
+ names.append(name)
572
+ offsets.append(getattr(self.ctype, name).offset)
573
+ if isinstance(var.type, array):
574
+ # array_t
575
+ formats.append(array_t.numpy_dtype())
576
+ elif isinstance(var.type, Struct):
577
+ # nested struct
578
+ formats.append(var.type.numpy_dtype())
579
+ elif issubclass(var.type, ctypes.Array):
580
+ scalar_typestr = type_typestr(var.type._wp_scalar_type_)
581
+ if len(var.type._shape_) == 1:
582
+ # vector
583
+ formats.append(f"{var.type._length_}{scalar_typestr}")
584
+ else:
585
+ # matrix
586
+ formats.append(f"{var.type._shape_}{scalar_typestr}")
587
+ else:
588
+ # scalar
589
+ formats.append(type_typestr(var.type))
590
+
591
+ return {"names": names, "formats": formats, "offsets": offsets, "itemsize": ctypes.sizeof(self.ctype)}
592
+
593
+ # constructs a Warp struct instance from a pointer to the ctype
594
+ def from_ptr(self, ptr):
595
+ if not ptr:
596
+ raise RuntimeError("NULL pointer exception")
597
+
598
+ # create a new struct instance
599
+ instance = self()
600
+
601
+ for name, var in self.vars.items():
602
+ offset = getattr(self.ctype, name).offset
603
+ if isinstance(var.type, array):
604
+ # We could reconstruct wp.array from array_t, but it's problematic.
605
+ # There's no guarantee that the original wp.array is still allocated and
606
+ # no easy way to make a backref.
607
+ # Instead, we just create a stub annotation, which is not a fully usable array object.
608
+ setattr(instance, name, array(dtype=var.type.dtype, ndim=var.type.ndim))
609
+ elif isinstance(var.type, Struct):
610
+ # nested struct
611
+ value = var.type.from_ptr(ptr + offset)
612
+ setattr(instance, name, value)
613
+ elif issubclass(var.type, ctypes.Array):
614
+ # vector/matrix
615
+ value = var.type.from_ptr(ptr + offset)
616
+ setattr(instance, name, value)
617
+ else:
618
+ # scalar
619
+ cvalue = ctypes.cast(ptr + offset, ctypes.POINTER(var.type._type_)).contents
620
+ if var.type == warp.float16:
621
+ setattr(instance, name, half_bits_to_float(cvalue))
622
+ else:
623
+ setattr(instance, name, cvalue.value)
624
+
625
+ return instance
626
+
627
+
628
+ class Reference:
629
+ def __init__(self, value_type):
630
+ self.value_type = value_type
631
+
632
+
633
+ def is_reference(type: Any) -> builtins.bool:
634
+ return isinstance(type, Reference)
635
+
636
+
637
+ def strip_reference(arg: Any) -> Any:
638
+ if is_reference(arg):
639
+ return arg.value_type
640
+ else:
641
+ return arg
642
+
643
+
644
+ def compute_type_str(base_name, template_params):
645
+ if not template_params:
646
+ return base_name
647
+
648
+ def param2str(p):
649
+ if isinstance(p, builtins.bool):
650
+ return "true" if p else "false"
651
+ if isinstance(p, int):
652
+ return str(p)
653
+ elif hasattr(p, "_wp_generic_type_str_"):
654
+ return compute_type_str(f"wp::{p._wp_generic_type_str_}", p._wp_type_params_)
655
+ elif hasattr(p, "_type_"):
656
+ if p.__name__ == "bool":
657
+ return "bool"
658
+ else:
659
+ return f"wp::{p.__name__}"
660
+ elif is_tile(p):
661
+ return p.ctype()
662
+ elif isinstance(p, Struct):
663
+ return p.native_name
664
+
665
+ return p.__name__
666
+
667
+ return f"{base_name}<{', '.join(map(param2str, template_params))}>"
668
+
669
+
670
+ class Var:
671
+ def __init__(
672
+ self,
673
+ label: str,
674
+ type: type,
675
+ requires_grad: builtins.bool = False,
676
+ constant: builtins.bool | None = None,
677
+ prefix: builtins.bool = True,
678
+ relative_lineno: int | None = None,
679
+ ):
680
+ # convert built-in types to wp types
681
+ if type == float:
682
+ type = float32
683
+ elif type == int:
684
+ type = int32
685
+ elif type == builtins.bool:
686
+ type = bool
687
+
688
+ self.label = label
689
+ self.type = type
690
+ self.requires_grad = requires_grad
691
+ self.constant = constant
692
+ self.prefix = prefix
693
+
694
+ # records whether this Var has been read from in a kernel function (array only)
695
+ self.is_read = False
696
+ # records whether this Var has been written to in a kernel function (array only)
697
+ self.is_write = False
698
+
699
+ # used to associate a view array Var with its parent array Var
700
+ self.parent = None
701
+
702
+ # Used to associate the variable with the Python statement that resulted in it being created.
703
+ self.relative_lineno = relative_lineno
704
+
705
+ def __str__(self):
706
+ return self.label
707
+
708
+ @staticmethod
709
+ def dtype_to_ctype(t: type) -> str:
710
+ if hasattr(t, "_wp_generic_type_str_"):
711
+ return compute_type_str(f"wp::{t._wp_generic_type_str_}", t._wp_type_params_)
712
+ elif isinstance(t, Struct):
713
+ return t.native_name
714
+ elif hasattr(t, "_wp_native_name_"):
715
+ return f"wp::{t._wp_native_name_}"
716
+ elif t.__name__ in ("bool", "int", "float"):
717
+ return t.__name__
718
+
719
+ return f"wp::{t.__name__}"
720
+
721
+ @staticmethod
722
+ def type_to_ctype(t: type, value_type: builtins.bool = False) -> str:
723
+ if isinstance(t, fixedarray):
724
+ template_args = (str(t.size), Var.dtype_to_ctype(t.dtype))
725
+ dtypestr = ", ".join(template_args)
726
+ classstr = f"wp::{type(t).__name__}"
727
+ return f"{classstr}_t<{dtypestr}>"
728
+ elif is_array(t):
729
+ dtypestr = Var.dtype_to_ctype(t.dtype)
730
+ classstr = f"wp::{type(t).__name__}"
731
+ return f"{classstr}_t<{dtypestr}>"
732
+ elif get_origin(t) is tuple:
733
+ dtypestr = ", ".join(Var.dtype_to_ctype(x) for x in get_args(t))
734
+ return f"wp::tuple_t<{dtypestr}>"
735
+ elif is_tuple(t):
736
+ dtypestr = ", ".join(Var.dtype_to_ctype(x) for x in t.types)
737
+ classstr = f"wp::{type(t).__name__}"
738
+ return f"{classstr}<{dtypestr}>"
739
+ elif is_tile(t):
740
+ return t.ctype()
741
+ elif isinstance(t, type) and issubclass(t, StructInstance):
742
+ # ensure the actual Struct name is used instead of "NewStructInstance"
743
+ return t.native_name
744
+ elif is_reference(t):
745
+ if not value_type:
746
+ return Var.type_to_ctype(t.value_type) + "*"
747
+
748
+ return Var.type_to_ctype(t.value_type)
749
+
750
+ return Var.dtype_to_ctype(t)
751
+
752
+ def ctype(self, value_type: builtins.bool = False) -> str:
753
+ return Var.type_to_ctype(self.type, value_type)
754
+
755
+ def emit(self, prefix: str = "var"):
756
+ if self.prefix:
757
+ return f"{prefix}_{self.label}"
758
+ else:
759
+ return self.label
760
+
761
+ def emit_adj(self):
762
+ return self.emit("adj")
763
+
764
+ def mark_read(self):
765
+ """Marks this Var as having been read from in a kernel (array only)."""
766
+ if not is_array(self.type):
767
+ return
768
+
769
+ self.is_read = True
770
+
771
+ # recursively update all parent states
772
+ parent = self.parent
773
+ while parent is not None:
774
+ parent.is_read = True
775
+ parent = parent.parent
776
+
777
+ def mark_write(self, **kwargs):
778
+ """Marks this Var has having been written to in a kernel (array only)."""
779
+ if not is_array(self.type):
780
+ return
781
+
782
+ # detect if we are writing to an array after reading from it within the same kernel
783
+ if self.is_read and warp._src.config.verify_autograd_array_access:
784
+ if "kernel_name" and "filename" and "lineno" in kwargs:
785
+ print(
786
+ f"Warning: Array passed to argument {self.label} in kernel {kwargs['kernel_name']} at {kwargs['filename']}:{kwargs['lineno']} is being written to after it has been read from within the same kernel. This may corrupt gradient computation in the backward pass."
787
+ )
788
+ else:
789
+ print(
790
+ f"Warning: Array {self} is being written to after it has been read from within the same kernel. This may corrupt gradient computation in the backward pass."
791
+ )
792
+ self.is_write = True
793
+
794
+ # recursively update all parent states
795
+ parent = self.parent
796
+ while parent is not None:
797
+ parent.is_write = True
798
+ parent = parent.parent
799
+
800
+
801
+ class Block:
802
+ # Represents a basic block of instructions, e.g.: list
803
+ # of straight line instructions inside a for-loop or conditional
804
+
805
+ def __init__(self):
806
+ # list of statements inside this block
807
+ self.body_forward = []
808
+ self.body_replay = []
809
+ self.body_reverse = []
810
+
811
+ # list of vars declared in this block
812
+ self.vars = []
813
+
814
+
815
+ def apply_defaults(
816
+ bound_args: inspect.BoundArguments,
817
+ values: Mapping[str, Any],
818
+ ):
819
+ # Similar to Python's `inspect.BoundArguments.apply_defaults()`
820
+ # but with the possibility to pass an augmented set of default values.
821
+ arguments = bound_args.arguments
822
+ new_arguments = []
823
+ for name in bound_args._signature.parameters.keys():
824
+ if name in arguments:
825
+ new_arguments.append((name, arguments[name]))
826
+ elif name in values:
827
+ new_arguments.append((name, values[name]))
828
+
829
+ bound_args.arguments = dict(new_arguments)
830
+
831
+
832
+ def func_match_args(func, arg_types, kwarg_types):
833
+ try:
834
+ # Try to bind the given arguments to the function's signature.
835
+ # This is not checking whether the argument types are matching,
836
+ # rather it's just assigning each argument to the corresponding
837
+ # function parameter.
838
+ bound_arg_types = func.signature.bind(*arg_types, **kwarg_types)
839
+ except TypeError:
840
+ return False
841
+
842
+ # Populate the bound arguments with any default values.
843
+ default_arg_types = {
844
+ k: None if v is None else get_arg_type(v)
845
+ for k, v in func.defaults.items()
846
+ if k not in bound_arg_types.arguments
847
+ }
848
+ apply_defaults(bound_arg_types, default_arg_types)
849
+ bound_arg_types = tuple(bound_arg_types.arguments.values())
850
+
851
+ # Check the given argument types against the ones defined on the function.
852
+ for bound_arg_type, func_arg_type in zip(bound_arg_types, func.input_types.values()):
853
+ # Let the `value_func` callback infer the type.
854
+ if bound_arg_type is None:
855
+ continue
856
+
857
+ # if arg type registered as Any, treat as
858
+ # template allowing any type to match
859
+ if func_arg_type == Any:
860
+ continue
861
+
862
+ # handle function refs as a special case
863
+ if func_arg_type == Callable and isinstance(bound_arg_type, warp._src.context.Function):
864
+ continue
865
+
866
+ # check arg type matches input variable type
867
+ if not types_equal(func_arg_type, strip_reference(bound_arg_type), match_generic=True):
868
+ return False
869
+
870
+ return True
871
+
872
+
873
+ def get_arg_type(arg: Var | Any) -> type:
874
+ if isinstance(arg, str):
875
+ return str
876
+
877
+ if isinstance(arg, Sequence):
878
+ return tuple(get_arg_type(x) for x in arg)
879
+
880
+ if is_array(arg):
881
+ return arg
882
+
883
+ if get_origin(arg) is tuple:
884
+ return tuple(get_arg_type(x) for x in get_args(arg))
885
+
886
+ if is_tuple(arg):
887
+ return arg
888
+
889
+ if isinstance(arg, (type, warp._src.context.Function)):
890
+ return arg
891
+
892
+ if isinstance(arg, Var):
893
+ if get_origin(arg.type) is tuple:
894
+ return get_args(arg.type)
895
+
896
+ return arg.type
897
+
898
+ return type(arg)
899
+
900
+
901
+ def get_arg_value(arg: Any) -> Any:
902
+ if isinstance(arg, Sequence):
903
+ return tuple(get_arg_value(x) for x in arg)
904
+
905
+ if isinstance(arg, (type, warp._src.context.Function)):
906
+ return arg
907
+
908
+ if isinstance(arg, Var):
909
+ if is_tuple(arg.type):
910
+ return tuple(get_arg_value(x) for x in arg.type.values)
911
+
912
+ if arg.constant is not None:
913
+ return arg.constant
914
+
915
+ return arg
916
+
917
+
918
+ class Adjoint:
919
+ # Source code transformer, this class takes a Python function and
920
+ # generates forward and backward SSA forms of the function instructions
921
+
922
+ def __init__(
923
+ adj,
924
+ func: Callable[..., Any],
925
+ overload_annotations=None,
926
+ is_user_function=False,
927
+ skip_forward_codegen=False,
928
+ skip_reverse_codegen=False,
929
+ custom_reverse_mode=False,
930
+ custom_reverse_num_input_args=-1,
931
+ transformers: list[ast.NodeTransformer] | None = None,
932
+ source: str | None = None,
933
+ ):
934
+ adj.func = func
935
+
936
+ adj.is_user_function = is_user_function
937
+
938
+ # whether the generation of the forward code is skipped for this function
939
+ adj.skip_forward_codegen = skip_forward_codegen
940
+ # whether the generation of the adjoint code is skipped for this function
941
+ adj.skip_reverse_codegen = skip_reverse_codegen
942
+ # Whether this function is used by a kernel that has has the backward pass enabled.
943
+ adj.used_by_backward_kernel = False
944
+
945
+ # extract name of source file
946
+ adj.filename = inspect.getsourcefile(func) or "unknown source file"
947
+ # get source file line number where function starts
948
+ adj.fun_lineno = 0
949
+ adj.source = source
950
+ if adj.source is None:
951
+ adj.source, adj.fun_lineno = adj.extract_function_source(func)
952
+
953
+ assert adj.source is not None, f"Failed to extract source code for function {func.__name__}"
954
+
955
+ # Indicates where the function definition starts (excludes decorators)
956
+ adj.fun_def_lineno = None
957
+
958
+ # get function source code
959
+ # ensures that indented class methods can be parsed as kernels
960
+ adj.source = textwrap.dedent(adj.source)
961
+
962
+ adj.source_lines = adj.source.splitlines()
963
+
964
+ if transformers is None:
965
+ transformers = []
966
+
967
+ # build AST and apply node transformers
968
+ adj.tree = ast.parse(adj.source)
969
+ adj.transformers = transformers
970
+ for transformer in transformers:
971
+ adj.tree = transformer.visit(adj.tree)
972
+
973
+ adj.fun_name = adj.tree.body[0].name
974
+
975
+ # for keeping track of line number in function code
976
+ adj.lineno = None
977
+
978
+ # whether the forward code shall be used for the reverse pass and a custom
979
+ # function signature is applied to the reverse version of the function
980
+ adj.custom_reverse_mode = custom_reverse_mode
981
+ # the number of function arguments that pertain to the forward function
982
+ # input arguments (i.e. the number of arguments that are not adjoint arguments)
983
+ adj.custom_reverse_num_input_args = custom_reverse_num_input_args
984
+
985
+ # parse argument types
986
+ argspec = get_full_arg_spec(func)
987
+
988
+ # ensure all arguments are annotated
989
+ if overload_annotations is None:
990
+ # use source-level argument annotations
991
+ if len(argspec.annotations) < len(argspec.args):
992
+ raise WarpCodegenError(f"Incomplete argument annotations on function {adj.fun_name}")
993
+ adj.arg_types = {k: v for k, v in argspec.annotations.items() if not (k == "return" and v is None)}
994
+ else:
995
+ # use overload argument annotations
996
+ for arg_name in argspec.args:
997
+ if arg_name not in overload_annotations:
998
+ raise WarpCodegenError(f"Incomplete overload annotations for function {adj.fun_name}")
999
+ adj.arg_types = overload_annotations.copy()
1000
+
1001
+ adj.args = []
1002
+ adj.symbols = {}
1003
+
1004
+ for name, type in adj.arg_types.items():
1005
+ # skip return hint
1006
+ if name == "return":
1007
+ continue
1008
+
1009
+ # add variable for argument
1010
+ arg = Var(name, type, requires_grad=False)
1011
+ adj.args.append(arg)
1012
+
1013
+ # pre-populate symbol dictionary with function argument names
1014
+ # this is to avoid registering false references to overshadowed modules
1015
+ adj.symbols[name] = arg
1016
+
1017
+ # Indicates whether there are unresolved static expressions in the function.
1018
+ # These stem from wp.static() expressions that could not be evaluated at declaration time.
1019
+ # This will signal to the module builder that this module needs to be rebuilt even if the module hash is unchanged.
1020
+ adj.has_unresolved_static_expressions = False
1021
+
1022
+ # try to replace static expressions by their constant result if the
1023
+ # expression can be evaluated at declaration time
1024
+ adj.static_expressions: dict[str, Any] = {}
1025
+ if "static" in adj.source:
1026
+ adj.replace_static_expressions()
1027
+
1028
+ # There are cases where a same module might be rebuilt multiple times,
1029
+ # for example when kernels are nested inside of functions, or when
1030
+ # a kernel's launch raises an exception. Ideally we'd always want to
1031
+ # avoid rebuilding kernels but some corner cases seem to depend on it,
1032
+ # so we only avoid rebuilding kernels that errored out to give a chance
1033
+ # for unit testing errors being spit out from kernels.
1034
+ adj.skip_build = False
1035
+
1036
+ # allocate extra space for a function call that requires its
1037
+ # own shared memory space, we treat shared memory as a stack
1038
+ # where each function pushes and pops space off, the extra
1039
+ # quantity is the 'roofline' amount required for the entire kernel
1040
+ def alloc_shared_extra(adj, num_bytes):
1041
+ adj.max_required_extra_shared_memory = max(adj.max_required_extra_shared_memory, num_bytes)
1042
+
1043
+ # returns the total number of bytes for a function
1044
+ # based on it's own requirements + worst case
1045
+ # requirements of any dependent functions
1046
+ def get_total_required_shared(adj):
1047
+ total_shared = 0
1048
+
1049
+ for var in adj.variables:
1050
+ if is_tile(var.type) and var.type.storage == "shared" and var.type.owner:
1051
+ total_shared += var.type.size_in_bytes()
1052
+
1053
+ return total_shared + adj.max_required_extra_shared_memory
1054
+
1055
+ @staticmethod
1056
+ def extract_function_source(func: Callable) -> tuple[str, int]:
1057
+ try:
1058
+ _, fun_lineno = inspect.getsourcelines(func)
1059
+ source = inspect.getsource(func)
1060
+ except OSError as e:
1061
+ raise RuntimeError(
1062
+ "Directly evaluating Warp code defined as a string using `exec()` is not supported, "
1063
+ "please save it to a file and use `importlib` if needed."
1064
+ ) from e
1065
+ return source, fun_lineno
1066
+
1067
+ # generate function ssa form and adjoint
1068
+ def build(adj, builder, default_builder_options=None):
1069
+ # arg Var read/write flags are held during module rebuilds, so we reset here even when skipping a build
1070
+ for arg in adj.args:
1071
+ arg.is_read = False
1072
+ arg.is_write = False
1073
+
1074
+ if adj.skip_build:
1075
+ return
1076
+
1077
+ adj.builder = builder
1078
+
1079
+ if default_builder_options is None:
1080
+ default_builder_options = {}
1081
+
1082
+ if adj.builder:
1083
+ adj.builder_options = adj.builder.options
1084
+ else:
1085
+ adj.builder_options = default_builder_options
1086
+
1087
+ global options
1088
+ options = adj.builder_options
1089
+
1090
+ adj.symbols = {} # map from symbols to adjoint variables
1091
+ adj.variables = [] # list of local variables (in order)
1092
+
1093
+ adj.return_var = None # return type for function or kernel
1094
+ adj.loop_symbols = [] # symbols at the start of each loop
1095
+ adj.loop_const_iter_symbols = (
1096
+ set()
1097
+ ) # constant iteration variables for static loops (mutating them does not raise an error)
1098
+
1099
+ # blocks
1100
+ adj.blocks = [Block()]
1101
+ adj.loop_blocks = []
1102
+
1103
+ # holds current indent level
1104
+ adj.indentation = ""
1105
+
1106
+ # used to generate new label indices
1107
+ adj.label_count = 0
1108
+
1109
+ # tracks how much additional shared memory is required by any dependent function calls
1110
+ adj.max_required_extra_shared_memory = 0
1111
+
1112
+ # update symbol map for each argument
1113
+ for a in adj.args:
1114
+ adj.symbols[a.label] = a
1115
+
1116
+ # recursively evaluate function body
1117
+ try:
1118
+ adj.eval(adj.tree.body[0])
1119
+ except Exception as original_exc:
1120
+ try:
1121
+ lineno = adj.lineno + adj.fun_lineno
1122
+ line = adj.source_lines[adj.lineno]
1123
+ msg = f'Error while parsing function "{adj.fun_name}" at {adj.filename}:{lineno}:\n{line}\n'
1124
+
1125
+ # Combine the new message with the original exception's arguments
1126
+ new_args = (";".join([msg] + [str(a) for a in original_exc.args]),)
1127
+
1128
+ # Enhance the original exception with parser context before re-raising.
1129
+ # 'from None' is used to suppress Python's chained exceptions for a cleaner error output.
1130
+ raise type(original_exc)(*new_args).with_traceback(original_exc.__traceback__) from None
1131
+ finally:
1132
+ adj.skip_build = True
1133
+ adj.builder = None
1134
+
1135
+ if builder is not None:
1136
+ for a in adj.args:
1137
+ if isinstance(a.type, Struct):
1138
+ builder.build_struct_recursive(a.type)
1139
+ elif isinstance(a.type, warp._src.types.array) and isinstance(a.type.dtype, Struct):
1140
+ builder.build_struct_recursive(a.type.dtype)
1141
+
1142
+ # release builder reference for GC
1143
+ adj.builder = None
1144
+
1145
+ # code generation methods
1146
+ def format_template(adj, template, input_vars, output_var):
1147
+ # output var is always the 0th index
1148
+ args = [output_var, *input_vars]
1149
+ s = template.format(*args)
1150
+
1151
+ return s
1152
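+ # Example (hypothetical template, assuming str.format semantics): with
+ # template "{0} = wp::foo({1}, {2})", output_var "var_0" and input_vars
+ # ["var_1", "var_2"], format_template() produces
+ # "var_0 = wp::foo(var_1, var_2)".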
+
1153
+ # generates a list of formatted args
1154
+ def format_args(adj, prefix, args):
1155
+ arg_strs = []
1156
+
1157
+ for a in args:
1158
+ if isinstance(a, warp._src.context.Function):
1159
+ # functions don't have a var_ prefix so strip it off here
1160
+ if prefix == "var":
1161
+ arg_strs.append(f"{a.namespace}{a.native_func}")
1162
+ else:
1163
+ arg_strs.append(f"{a.namespace}{prefix}_{a.native_func}")
1164
+ elif is_reference(a.type):
1165
+ arg_strs.append(f"{prefix}_{a}")
1166
+ elif isinstance(a, Var):
1167
+ arg_strs.append(a.emit(prefix))
1168
+ else:
1169
+ raise WarpCodegenTypeError(f"Arguments must be variables or functions, got {type(a)}")
1170
+
1171
+ return arg_strs
1172
+
1173
+ # generates argument string for a forward function call
1174
+ def format_forward_call_args(adj, args, use_initializer_list):
1175
+ arg_str = ", ".join(adj.format_args("var", args))
1176
+ if use_initializer_list:
1177
+ return f"{{{arg_str}}}"
1178
+ return arg_str
1179
+
1180
+ # generates argument string for a reverse function call
1181
+ def format_reverse_call_args(
1182
+ adj,
1183
+ args_var,
1184
+ args,
1185
+ args_out,
1186
+ use_initializer_list,
1187
+ has_output_args=True,
1188
+ require_original_output_arg=False,
1189
+ ):
1190
+ formatted_var = adj.format_args("var", args_var)
1191
+ formatted_out = []
1192
+ if has_output_args and (require_original_output_arg or len(args_out) > 1):
1193
+ formatted_out = adj.format_args("var", args_out)
1194
+ formatted_var_adj = adj.format_args(
1195
+ "&adj" if use_initializer_list else "adj",
1196
+ args,
1197
+ )
1198
+ formatted_out_adj = adj.format_args("adj", args_out)
1199
+
1200
+ if len(formatted_var_adj) == 0 and len(formatted_out_adj) == 0:
1201
+ # there are no adjoint arguments, so we don't need to call the reverse function
1202
+ return None
1203
+
1204
+ if use_initializer_list:
1205
+ var_str = f"{{{', '.join(formatted_var)}}}"
1206
+ out_str = f"{{{', '.join(formatted_out)}}}"
1207
+ adj_str = f"{{{', '.join(formatted_var_adj)}}}"
1208
+ out_adj_str = ", ".join(formatted_out_adj)
1209
+ if len(args_out) > 1:
1210
+ arg_str = ", ".join([var_str, out_str, adj_str, out_adj_str])
1211
+ else:
1212
+ arg_str = ", ".join([var_str, adj_str, out_adj_str])
1213
+ else:
1214
+ arg_str = ", ".join(formatted_var + formatted_out + formatted_var_adj + formatted_out_adj)
1215
+ return arg_str
1216
+
1217
+ def indent(adj):
1218
+ adj.indentation = adj.indentation + " "
1219
+
1220
+ def dedent(adj):
1221
+ adj.indentation = adj.indentation[:-4]
1222
+
1223
+ def begin_block(adj, name="block"):
1224
+ b = Block()
1225
+
1226
+ # give block a unique id
1227
+ b.label = name + "_" + str(adj.label_count)
1228
+ adj.label_count += 1
1229
+
1230
+ adj.blocks.append(b)
1231
+ return b
1232
+
1233
+ def end_block(adj):
1234
+ return adj.blocks.pop()
1235
+
1236
+ def add_var(adj, type=None, constant=None):
1237
+ index = len(adj.variables)
1238
+ name = str(index)
1239
+
1240
+ # allocate new variable
1241
+ v = Var(name, type=type, constant=constant, relative_lineno=adj.lineno)
1242
+
1243
+ adj.variables.append(v)
1244
+
1245
+ adj.blocks[-1].vars.append(v)
1246
+
1247
+ return v
1248
+
1249
+ def register_var(adj, var):
1250
+ # We sometimes initialize `Var` instances that might be thrown away
1251
+ # afterwards, so this method allows deferring their registration among
1252
+ # the list of primal vars until later on, instead of registering them
1253
+ # immediately if we were to use `adj.add_var()` or `adj.add_constant()`.
1254
+
1255
+ if isinstance(var, (Reference, warp._src.context.Function)):
1256
+ return var
1257
+
1258
+ if isinstance(var, int):
1259
+ return adj.add_constant(var)
1260
+
1261
+ if var.label is None:
1262
+ return adj.add_var(var.type, var.constant)
1263
+
1264
+ return var
1265
+
1266
+ def get_line_directive(adj, statement: str, relative_lineno: int | None = None) -> str | None:
1267
+ """Get a line directive for the given statement.
1268
+
1269
+ Args:
1270
+ statement: The statement to get the line directive for.
1271
+ relative_lineno: The line number of the statement relative to the function.
1272
+
1273
+ Returns:
1274
+ A line directive for the given statement, or None if no line directive is needed.
1275
+ """
1276
+
1277
+ if adj.filename == "unknown source file" or adj.fun_lineno == 0:
1278
+ # Early return if function is not associated with a source file or is otherwise invalid
1279
+ # TODO: Get line directives working with wp.map() functions
1280
+ return None
1281
+
1282
+ # lineinfo is enabled by default in debug mode regardless of the builder option; we don't want to unnecessarily
1283
+ # emit line directives in generated code if it's not being compiled with line information
1284
+ build_mode = val if (val := adj.builder_options.get("mode")) is not None else warp._src.config.mode
1285
+
1286
+ lineinfo_enabled = adj.builder_options.get("lineinfo", False) or build_mode == "debug"
1287
+
1288
+ if relative_lineno is not None and lineinfo_enabled and warp._src.config.line_directives:
1289
+ is_comment = statement.strip().startswith("//")
1290
+ if not is_comment:
1291
+ line = relative_lineno + adj.fun_lineno
1292
+ # Convert backslashes to forward slashes for CUDA compatibility
1293
+ normalized_path = adj.filename.replace("\\", "/")
1294
+ return f'#line {line} "{normalized_path}"'
1295
+ return None
1296
+
1297
+ def add_forward(adj, statement: str, replay: str | None = None, skip_replay: builtins.bool = False) -> None:
1298
+ """Append a statement to the forward pass."""
1299
+
1300
+ if line_directive := adj.get_line_directive(statement, adj.lineno):
1301
+ adj.blocks[-1].body_forward.append(line_directive)
1302
+
1303
+ adj.blocks[-1].body_forward.append(adj.indentation + statement)
1304
+
1305
+ if not skip_replay:
1306
+ if line_directive:
1307
+ adj.blocks[-1].body_replay.append(line_directive)
1308
+
1309
+ if replay:
1310
+ # if custom replay specified then output it
1311
+ adj.blocks[-1].body_replay.append(adj.indentation + replay)
1312
+ else:
1313
+ # by default just replay the original statement
1314
+ adj.blocks[-1].body_replay.append(adj.indentation + statement)
1315
+
1316
+ # append a statement to the reverse pass
1317
+ def add_reverse(adj, statement: str) -> None:
1318
+ """Append a statement to the reverse pass."""
1319
+
1320
+ adj.blocks[-1].body_reverse.append(adj.indentation + statement)
1321
+
1322
+ if line_directive := adj.get_line_directive(statement, adj.lineno):
1323
+ adj.blocks[-1].body_reverse.append(line_directive)
1324
+
1325
+ def add_constant(adj, n):
1326
+ output = adj.add_var(type=type(n), constant=n)
1327
+ return output
1328
+
1329
+ def load(adj, var):
1330
+ if is_reference(var.type):
1331
+ var = adj.add_builtin_call("load", [var])
1332
+ return var
1333
+
1334
+ def add_comp(adj, op_strings, left, comps):
1335
+ output = adj.add_var(builtins.bool)
1336
+
1337
+ left = adj.load(left)
1338
+ s = output.emit() + " = " + ("(" * len(comps)) + left.emit() + " "
1339
+
1340
+ prev_comp_var = None
1341
+
1342
+ for op, comp in zip(op_strings, comps):
1343
+ comp_chainable = op_str_is_chainable(op)
1344
+ if comp_chainable and prev_comp_var:
1345
+ # We restrict chaining to operands of the same type
1346
+ if prev_comp_var.type is comp.type:
1347
+ prev_comp_var = adj.load(prev_comp_var)
1348
+ comp_var = adj.load(comp)
1349
+ s += "&& (" + prev_comp_var.emit() + " " + op + " " + comp_var.emit() + ")) "
1350
+ else:
1351
+ raise WarpCodegenTypeError(
1352
+ f"Cannot chain comparisons of unequal types: {prev_comp_var.type} {op} {comp.type}."
1353
+ )
1354
+ else:
1355
+ comp_var = adj.load(comp)
1356
+ s += op + " " + comp_var.emit() + ") "
1357
+
1358
+ prev_comp_var = comp_var
1359
+
1360
+ s = s.rstrip() + ";"
1361
+
1362
+ adj.add_forward(s)
1363
+
1364
+ return output
1365
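+ # Sketch of the generated code (variable names illustrative): for the
+ # Python expression `a < b < c`, add_comp() emits something like
+ #   var_out = ((var_a < var_b) && (var_b < var_c));
+ # i.e. chained comparisons are expanded into pairwise tests joined by &&.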
+
1366
+ def add_bool_op(adj, op_string, exprs):
1367
+ exprs = [adj.load(expr) for expr in exprs]
1368
+ output = adj.add_var(builtins.bool)
1369
+ command = output.emit() + " = " + (" " + op_string + " ").join([expr.emit() for expr in exprs]) + ";"
1370
+ adj.add_forward(command)
1371
+
1372
+ return output
1373
+
1374
+ def resolve_func(adj, func, arg_types, kwarg_types, min_outputs):
1375
+ if not func.is_builtin():
1376
+ # user-defined function
1377
+ overload = func.get_overload(arg_types, kwarg_types)
1378
+ if overload is not None:
1379
+ return overload
1380
+ else:
1381
+ # if func is overloaded then perform overload resolution here
1382
+ # we validate argument types before they go to generated native code
1383
+ for f in func.overloads:
1384
+ # skip type checking for variadic functions
1385
+ if not f.variadic:
1386
+ # check that argument counts are compatible (there may be some default args)
1387
+ if len(f.input_types) < len(arg_types) + len(kwarg_types):
1388
+ continue
1389
+
1390
+ if not func_match_args(f, arg_types, kwarg_types):
1391
+ continue
1392
+
1393
+ # check output dimensions match expectations
1394
+ if min_outputs:
1395
+ value_type = f.value_func(None, None)
1396
+ if not isinstance(value_type, Sequence) or len(value_type) != min_outputs:
1397
+ continue
1398
+
1399
+ # found a match, use it
1400
+ return f
1401
+
1402
+ # unresolved function, report error
1403
+ arg_type_reprs = []
1404
+
1405
+ for x in itertools.chain(arg_types, kwarg_types.values()):
1406
+ if isinstance(x, warp._src.context.Function):
1407
+ arg_type_reprs.append("function")
1408
+ else:
1409
+ # shorten Warp primitive type names
1410
+ if isinstance(x, Sequence):
1411
+ if len(x) != 1:
1412
+ raise WarpCodegenError("Argument must not be the result from a multi-valued function")
1413
+ arg_type = x[0]
1414
+ else:
1415
+ arg_type = x
1416
+
1417
+ arg_type_reprs.append(type_repr(arg_type))
1418
+
1419
+ raise WarpCodegenError(
1420
+ f"Couldn't find function overload for '{func.key}' that matched inputs with types: [{', '.join(arg_type_reprs)}]"
1421
+ )
1422
+
1423
+ def add_call(adj, func, args, kwargs, type_args, min_outputs=None):
1424
+ # Extract the types and values passed as arguments to the function call.
1425
+ arg_types = tuple(strip_reference(get_arg_type(x)) for x in args)
1426
+ kwarg_types = {k: strip_reference(get_arg_type(v)) for k, v in kwargs.items()}
1427
+
1428
+ # Resolve the exact function signature among any existing overload.
1429
+ func = adj.resolve_func(func, arg_types, kwarg_types, min_outputs)
1430
+
1431
+ # Bind the positional and keyword arguments to the function's signature
1432
+ # in order to process them as Python does it.
1433
+ bound_args: inspect.BoundArguments = func.signature.bind(*args, **kwargs)
1434
+
1435
+ # Type args are the "compile time" argument values we get from codegen.
1436
+ # For example, when calling `wp.vec3f(...)` from within a kernel,
1437
+ # this translates in fact to calling the `vector()` built-in augmented
1438
+ # with the type args `length=3, dtype=float`.
1439
+ # Eventually, these need to be passed to the underlying C++ function,
1440
+ # so we update the arguments with the type args here.
1441
+ if type_args:
1442
+ for arg in type_args:
1443
+ if arg in bound_args.arguments:
1444
+ # In case of conflict, ideally we'd throw an error since
1445
+ # what comes from codegen should be the source of truth
1446
+ # and users also passing the same value as an argument
1447
+ # is redundant (e.g.: `wp.mat22(shape=(2, 2))`).
1448
+ # However, for backward compatibility, we allow that form
1449
+ # as long as the values are equal.
1450
+ if values_check_equal(get_arg_value(bound_args.arguments[arg]), type_args[arg]):
1451
+ continue
1452
+
1453
+ raise RuntimeError(
1454
+ f"Remove the extraneous `{arg}` parameter "
1455
+ f"when calling the templated version of "
1456
+ f"`wp.{func.native_func}()`"
1457
+ )
1458
+
1459
+ type_vars = {k: Var(None, type=type(v), constant=v) for k, v in type_args.items()}
1460
+ apply_defaults(bound_args, type_vars)
1461
+
1462
+ if func.defaults:
1463
+ default_vars = {
1464
+ k: Var(None, type=type(v), constant=v)
1465
+ for k, v in func.defaults.items()
1466
+ if k not in bound_args.arguments and v is not None
1467
+ }
1468
+ apply_defaults(bound_args, default_vars)
1469
+
1470
+ bound_args = bound_args.arguments
1471
+
1472
+ # if it is a user-function then build it recursively
1473
+ if not func.is_builtin():
1474
+ # If the function called is a user function,
1475
+ # we need to ensure its adjoint is also being generated.
1476
+ if adj.used_by_backward_kernel:
1477
+ func.adj.used_by_backward_kernel = True
1478
+
1479
+ if adj.builder is None:
1480
+ func.build(None)
1481
+
1482
+ elif func not in adj.builder.functions:
1483
+ adj.builder.build_function(func)
1484
+ # add custom grad, replay functions to the list of functions
1485
+ # to be built later (invalid code could be generated if we built them now)
1486
+ # so that they are not missed when only the forward function is imported
1487
+ # from another module
1488
+ if func.custom_grad_func:
1489
+ adj.builder.deferred_functions.append(func.custom_grad_func)
1490
+ if func.custom_replay_func:
1491
+ adj.builder.deferred_functions.append(func.custom_replay_func)
1492
+
1493
+ # Resolve the return value based on the types and values of the given arguments.
1494
+ bound_arg_types = {k: get_arg_type(v) for k, v in bound_args.items()}
1495
+ bound_arg_values = {k: get_arg_value(v) for k, v in bound_args.items()}
1496
+
1497
+ return_type = func.value_func(
1498
+ {k: strip_reference(v) for k, v in bound_arg_types.items()},
1499
+ bound_arg_values,
1500
+ )
1501
+
1502
+ # Handle the special case where a Var instance is returned from the `value_func`
1503
+ # callback, in which case we replace the call with a reference to that variable.
1504
+ if isinstance(return_type, Var):
1505
+ return adj.register_var(return_type)
1506
+ elif isinstance(return_type, Sequence) and all(isinstance(x, Var) for x in return_type):
1507
+ return tuple(adj.register_var(x) for x in return_type)
1508
+
1509
+ if get_origin(return_type) is tuple:
1510
+ types = get_args(return_type)
1511
+ return_type = warp._src.types.tuple_t(types=types, values=(None,) * len(types))
1512
+
1513
+ # immediately allocate output variables so we can pass them into the dispatch method
1514
+ if return_type is None:
1515
+ # void function
1516
+ output = None
1517
+ output_list = []
1518
+ elif not isinstance(return_type, Sequence) or len(return_type) == 1:
1519
+ # single return value function
1520
+ if isinstance(return_type, Sequence):
1521
+ return_type = return_type[0]
1522
+ output = adj.add_var(return_type)
1523
+ output_list = [output]
1524
+ else:
1525
+ # multiple return value function
1526
+ output = [adj.add_var(v) for v in return_type]
1527
+ output_list = output
1528
+
1529
+ # If we have a built-in that requires special handling to dispatch
1530
+ # the arguments to the underlying C++ function, then we can resolve
1531
+ # these using the `dispatch_func`. Since this is only called from
1532
+ # within codegen, we pass it directly `codegen.Var` objects,
1533
+ # which allows for some more advanced resolution to be performed,
1534
+ # for example by checking whether an argument corresponds to
1535
+ # a literal value or references a variable.
1536
+ extra_shared_memory = 0
1537
+ if func.lto_dispatch_func is not None:
1538
+ func_args, template_args, ltoirs, extra_shared_memory = func.lto_dispatch_func(
1539
+ func.input_types, return_type, output_list, bound_args, options=adj.builder_options, builder=adj.builder
1540
+ )
1541
+ elif func.dispatch_func is not None:
1542
+ func_args, template_args = func.dispatch_func(func.input_types, return_type, bound_args)
1543
+ else:
1544
+ func_args = tuple(bound_args.values())
1545
+ template_args = ()
1546
+
1547
+ func_args = tuple(adj.register_var(x) for x in func_args)
1548
+ func_name = compute_type_str(func.native_func, template_args)
1549
+ use_initializer_list = func.initializer_list_func(bound_args, return_type)
1550
+
1551
+ fwd_args = []
1552
+ for func_arg in func_args:
1553
+ if not isinstance(func_arg, (Reference, warp._src.context.Function)):
1554
+ func_arg_var = adj.load(func_arg)
1555
+ else:
1556
+ func_arg_var = func_arg
1557
+
1558
+ # if the argument is a function (and not a builtin), then build it recursively
1559
+ if isinstance(func_arg_var, warp._src.context.Function) and not func_arg_var.is_builtin():
1560
+ if adj.used_by_backward_kernel:
1561
+ func_arg_var.adj.used_by_backward_kernel = True
1562
+
1563
+ adj.builder.build_function(func_arg_var)
1564
+
1565
+ fwd_args.append(strip_reference(func_arg_var))
1566
+
1567
+ if return_type is None:
1568
+ # handles expression (zero output) functions, e.g.: void do_something();
1569
+ forward_call = (
1570
+ f"{func.namespace}{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});"
1571
+ )
1572
+ replay_call = forward_call
1573
+ if func.custom_replay_func is not None or func.replay_snippet is not None:
1574
+ replay_call = f"{func.namespace}replay_{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});"
1575
+
1576
+ elif not isinstance(return_type, Sequence) or len(return_type) == 1:
1577
+ # handle simple function (one output)
1578
+ forward_call = f"var_{output} = {func.namespace}{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});"
1579
+ replay_call = forward_call
1580
+ if func.custom_replay_func is not None:
1581
+ replay_call = f"var_{output} = {func.namespace}replay_{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});"
1582
+
1583
+ else:
1584
+ # handle multiple value functions
1585
+ forward_call = (
1586
+ f"{func.namespace}{func_name}({adj.format_forward_call_args(fwd_args + output, use_initializer_list)});"
1587
+ )
1588
+ replay_call = forward_call
1589
+
1590
+ if func.skip_replay:
1591
+ adj.add_forward(forward_call, replay="// " + replay_call)
1592
+ else:
1593
+ adj.add_forward(forward_call, replay=replay_call)
1594
+
1595
+ if func.is_differentiable and len(func_args):
1596
+ adj_args = tuple(strip_reference(x) for x in func_args)
1597
+ reverse_has_output_args = (
1598
+ func.require_original_output_arg or len(output_list) > 1
1599
+ ) and func.custom_grad_func is None
1600
+ arg_str = adj.format_reverse_call_args(
1601
+ fwd_args,
1602
+ adj_args,
1603
+ output_list,
1604
+ use_initializer_list,
1605
+ has_output_args=reverse_has_output_args,
1606
+ require_original_output_arg=func.require_original_output_arg,
1607
+ )
1608
+ if arg_str is not None:
1609
+ reverse_call = f"{func.namespace}adj_{func.native_func}({arg_str});"
1610
+ adj.add_reverse(reverse_call)
1611
+
1612
+ # update our smem roofline requirements based on any
1613
+ # shared memory required by the dependent function call
1614
+ if not func.is_builtin():
1615
+ adj.alloc_shared_extra(func.adj.get_total_required_shared() + extra_shared_memory)
1616
+ else:
1617
+ adj.alloc_shared_extra(extra_shared_memory)
1618
+
1619
+ return output
1620
+
1621
+ def add_builtin_call(adj, func_name, args, min_outputs=None):
1622
+ func = warp._src.context.builtin_functions[func_name]
1623
+ return adj.add_call(func, args, {}, {}, min_outputs=min_outputs)
1624
+
1625
+ def add_return(adj, var):
1626
+ if var is None or len(var) == 0:
1627
+ # NOTE: If this kernel gets compiled for a CUDA device, then we need
1628
+ # to convert the return; into a continue; in codegen_func_forward()
1629
+ adj.add_forward("return;", f"goto label{adj.label_count};")
1630
+ elif len(var) == 1:
1631
+ adj.add_forward(f"return {var[0].emit()};", f"goto label{adj.label_count};")
1632
+ adj.add_reverse("adj_" + str(var[0]) + " += adj_ret;")
1633
+ else:
1634
+ for i, v in enumerate(var):
1635
+ adj.add_forward(f"ret_{i} = {v.emit()};")
1636
+ adj.add_reverse(f"adj_{v} += adj_ret_{i};")
1637
+ adj.add_forward("return;", f"goto label{adj.label_count};")
1638
+
1639
+ adj.add_reverse(f"label{adj.label_count}:;")
1640
+
1641
+ adj.label_count += 1
1642
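+ # Sketch of the generated pattern for `return x` (names illustrative):
+ #   forward: return var_x;          replay: goto label0;
+ #   reverse (after reversal): label0:;  adj_x += adj_ret;
+ # so the replay path skips the return while the reverse pass resumes
+ # gradient accumulation from the matching label.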
+
1643
+ # define an if statement
1644
+ def begin_if(adj, cond):
1645
+ cond = adj.load(cond)
1646
+ adj.add_forward(f"if ({cond.emit()}) {{")
1647
+ adj.add_reverse("}")
1648
+
1649
+ adj.indent()
1650
+
1651
+ def end_if(adj, cond):
1652
+ adj.dedent()
1653
+
1654
+ adj.add_forward("}")
1655
+ cond = adj.load(cond)
1656
+ adj.add_reverse(f"if ({cond.emit()}) {{")
1657
+
1658
+ def begin_else(adj, cond):
1659
+ cond = adj.load(cond)
1660
+ adj.add_forward(f"if (!{cond.emit()}) {{")
1661
+ adj.add_reverse("}")
1662
+
1663
+ adj.indent()
1664
+
1665
+ def end_else(adj, cond):
1666
+ adj.dedent()
1667
+
1668
+ adj.add_forward("}")
1669
+ cond = adj.load(cond)
1670
+ adj.add_reverse(f"if (!{cond.emit()}) {{")
1671
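+ # Note (descriptive): begin_if()/end_if() and begin_else()/end_else() push
+ # the brace tokens in swapped order on the reverse list; because
+ # body_reverse is emitted back-to-front, the reverse pass still
+ # reconstructs a well-formed `if (cond) { ... }` block that replays the
+ # branch while accumulating adjoints.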
+
1672
+ # define a for-loop
1673
+ def begin_for(adj, iter):
1674
+ cond_block = adj.begin_block("for")
1675
+ adj.loop_blocks.append(cond_block)
1676
+ adj.add_forward(f"start_{cond_block.label}:;")
1677
+ adj.indent()
1678
+
1679
+ # evaluate cond
1680
+ adj.add_forward(f"if (iter_cmp({iter.emit()}) == 0) goto end_{cond_block.label};")
1681
+
1682
+ # evaluate iter
1683
+ val = adj.add_builtin_call("iter_next", [iter])
1684
+
1685
+ adj.begin_block()
1686
+
1687
+ return val
1688
+
1689
+ def end_for(adj, iter):
1690
+ body_block = adj.end_block()
1691
+ cond_block = adj.end_block()
1692
+ adj.loop_blocks.pop()
1693
+
1694
+ ####################
1695
+ # forward pass
1696
+
1697
+ for i in cond_block.body_forward:
1698
+ adj.blocks[-1].body_forward.append(i)
1699
+
1700
+ for i in body_block.body_forward:
1701
+ adj.blocks[-1].body_forward.append(i)
1702
+
1703
+ adj.add_forward(f"goto start_{cond_block.label};", skip_replay=True)
1704
+
1705
+ adj.dedent()
1706
+ adj.add_forward(f"end_{cond_block.label}:;", skip_replay=True)
1707
+
1708
+ ####################
1709
+ # reverse pass
1710
+
1711
+ reverse = []
1712
+
1713
+ # reverse iterator
1714
+ reverse.append(adj.indentation + f"{iter.emit()} = wp::iter_reverse({iter.emit()});")
1715
+
1716
+ for i in cond_block.body_forward:
1717
+ reverse.append(i)
1718
+
1719
+ # zero adjoints
1720
+ for i in body_block.vars:
1721
+ if is_tile(i.type):
1722
+ if i.type.owner:
1723
+ reverse.append(adj.indentation + f"\t{i.emit_adj()}.grad_zero();")
1724
+ else:
1725
+ reverse.append(adj.indentation + f"\t{i.emit_adj()} = {{}};")
1726
+
1727
+ # replay
1728
+ for i in body_block.body_replay:
1729
+ reverse.append(i)
1730
+
1731
+ # reverse
1732
+ for i in reversed(body_block.body_reverse):
1733
+ reverse.append(i)
1734
+
1735
+ reverse.append(adj.indentation + f"\tgoto start_{cond_block.label};")
1736
+ reverse.append(adj.indentation + f"end_{cond_block.label}:;")
1737
+
1738
+ adj.blocks[-1].body_reverse.extend(reversed(reverse))
1739
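+ # Sketch of the forward code produced for a dynamic loop (label index
+ # illustrative):
+ #   start_for_0:;
+ #       if (iter_cmp(var_iter) == 0) goto end_for_0;
+ #       var_i = wp::iter_next(var_iter);
+ #       ... body ...
+ #       goto start_for_0;
+ #   end_for_0:;
+ # The reverse pass runs the same loop over wp::iter_reverse(var_iter),
+ # replaying the body before accumulating its adjoints.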
+
1740
+ # define a while loop
1741
+ def begin_while(adj, cond):
1742
+ # evaluate condition in its own block
1743
+ # so we can control replay
1744
+ cond_block = adj.begin_block("while")
1745
+ adj.loop_blocks.append(cond_block)
1746
+ cond_block.body_forward.append(f"start_{cond_block.label}:;")
1747
+
1748
+ c = adj.eval(cond)
1749
+ c = adj.load(c)
1750
+
1751
+ cond_block.body_forward.append(f"if (({c.emit()}) == false) goto end_{cond_block.label};")
1752
+
1753
+ # begin block around loop
1754
+ adj.begin_block()
1755
+ adj.indent()
1756
+
1757
+ def end_while(adj):
1758
+ adj.dedent()
1759
+ body_block = adj.end_block()
1760
+ cond_block = adj.end_block()
1761
+ adj.loop_blocks.pop()
1762
+
1763
+ ####################
1764
+ # forward pass
1765
+
1766
+ for i in cond_block.body_forward:
1767
+ adj.blocks[-1].body_forward.append(i)
1768
+
1769
+ for i in body_block.body_forward:
1770
+ adj.blocks[-1].body_forward.append(i)
1771
+
1772
+ adj.blocks[-1].body_forward.append(f"goto start_{cond_block.label};")
1773
+ adj.blocks[-1].body_forward.append(f"end_{cond_block.label}:;")
1774
+
1775
+ ####################
1776
+ # reverse pass
1777
+ reverse = []
1778
+
1779
+ # cond
1780
+ for i in cond_block.body_forward:
1781
+ reverse.append(i)
1782
+
1783
+ # zero adjoints of local vars
1784
+ for i in body_block.vars:
1785
+ reverse.append(f"{i.emit_adj()} = {{}};")
1786
+
1787
+ # replay
1788
+ for i in body_block.body_replay:
1789
+ reverse.append(i)
1790
+
1791
+ # reverse
1792
+ for i in reversed(body_block.body_reverse):
1793
+ reverse.append(i)
1794
+
1795
+ reverse.append(f"goto start_{cond_block.label};")
1796
+ reverse.append(f"end_{cond_block.label}:;")
1797
+
1798
+ # output
1799
+ adj.blocks[-1].body_reverse.extend(reversed(reverse))
1800
+
1801
+ def emit_FunctionDef(adj, node):
1802
+ adj.fun_def_lineno = node.lineno
1803
+
1804
+ for f in node.body:
1805
+ # Skip variable creation for standalone constants, including docstrings
1806
+ if isinstance(f, ast.Expr) and isinstance(f.value, ast.Constant):
1807
+ continue
1808
+ adj.eval(f)
1809
+
1810
+ if adj.return_var is not None and len(adj.return_var) == 1:
1811
+ if not isinstance(node.body[-1], ast.Return):
1812
+ adj.add_forward("return {};", skip_replay=True)
1813
+
1814
+ # native function case: return type is specified, eg -> int or -> wp.float32
1815
+ is_func_native = False
1816
+ if node.decorator_list is not None and len(node.decorator_list) == 1:
1817
+ obj = node.decorator_list[0]
1818
+ if isinstance(obj, ast.Call):
1819
+ if isinstance(obj.func, ast.Attribute):
1820
+ if obj.func.attr == "func_native":
1821
+ is_func_native = True
1822
+ if is_func_native and node.returns is not None:
1823
+ if isinstance(node.returns, ast.Name): # python built-in type
1824
+ var = Var(label="return_type", type=eval(node.returns.id))
1825
+ elif isinstance(node.returns, ast.Attribute): # warp type
1826
+ var = Var(label="return_type", type=eval(node.returns.attr))
1827
+ else:
1828
+ raise WarpCodegenTypeError("Native function return type not recognized")
1829
+ adj.return_var = (var,)
1830
+
1831
+ def emit_If(adj, node):
1832
+ if len(node.body) == 0:
1833
+ return None
1834
+
1835
+ # eval condition
1836
+ cond = adj.eval(node.test)
1837
+
1838
+ if cond.constant is not None:
1839
+ # resolve constant condition
1840
+ if cond.constant:
1841
+ for stmt in node.body:
1842
+ adj.eval(stmt)
1843
+ else:
1844
+ for stmt in node.orelse:
1845
+ adj.eval(stmt)
1846
+ return None
1847
+
1848
+ # save symbol map
1849
+ symbols_prev = adj.symbols.copy()
1850
+
1851
+ # eval body
1852
+ adj.begin_if(cond)
1853
+
1854
+ for stmt in node.body:
1855
+ adj.eval(stmt)
1856
+
1857
+ adj.end_if(cond)
1858
+
1859
+ # detect existing symbols with conflicting definitions (variables assigned inside the branch)
1860
+ # and resolve with a phi (select) function
1861
+ for items in symbols_prev.items():
1862
+ sym = items[0]
1863
+ var1 = items[1]
1864
+ var2 = adj.symbols[sym]
1865
+
1866
+ if var1 != var2:
1867
+ # insert a phi function that selects var1, var2 based on cond
1868
+ out = adj.add_builtin_call("where", [cond, var2, var1])
1869
+ adj.symbols[sym] = out
1870
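+ # Illustrative example: given `x = a` before the branch and `x = b`
+ # assigned inside `if cond:`, the symbol map now disagrees, so the
+ # merged value becomes x = where(cond, b, a), a select acting as the
+ # phi node of classic SSA construction.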
+
1871
+ symbols_prev = adj.symbols.copy()
1872
+
1873
+ # evaluate 'else' statement as if (!cond)
1874
+ if len(node.orelse) > 0:
1875
+ adj.begin_else(cond)
1876
+
1877
+ for stmt in node.orelse:
1878
+ adj.eval(stmt)
1879
+
1880
+ adj.end_else(cond)
1881
+
1882
+ # detect existing symbols with conflicting definitions (variables assigned inside the else)
1883
+ # and resolve with a phi (select) function
1884
+ for items in symbols_prev.items():
1885
+ sym = items[0]
1886
+ var1 = items[1]
1887
+ var2 = adj.symbols[sym]
1888
+
1889
+ if var1 != var2:
1890
+ # insert a phi function that selects var1, var2 based on cond
1891
+ # note the reversed order of vars since we want to use !cond as our select
1892
+ out = adj.add_builtin_call("where", [cond, var1, var2])
1893
+ adj.symbols[sym] = out
1894
+
1895
+ def emit_IfExp(adj, node):
1896
+ cond = adj.eval(node.test)
1897
+
1898
+ if cond.constant is not None:
1899
+ return adj.eval(node.body) if cond.constant else adj.eval(node.orelse)
1900
+
1901
+ adj.begin_if(cond)
1902
+ body = adj.eval(node.body)
1903
+ adj.end_if(cond)
1904
+
1905
+ adj.begin_else(cond)
1906
+ orelse = adj.eval(node.orelse)
1907
+ adj.end_else(cond)
1908
+
1909
+ return adj.add_builtin_call("where", [cond, body, orelse])
1910
+
1911
+ def emit_Compare(adj, node):
1912
+ # node.left, node.ops (list of ops), node.comparators (things to compare to)
1913
+ # e.g. (left ops[0] node.comparators[0]) ops[1] node.comparators[1]
1914
+
1915
+ left = adj.eval(node.left)
1916
+ comps = [adj.eval(comp) for comp in node.comparators]
1917
+ op_strings = [builtin_operators[type(op)] for op in node.ops]
1918
+
1919
+ return adj.add_comp(op_strings, left, comps)
1920
+
1921
+ def emit_BoolOp(adj, node):
1922
+ # op, expr list values
1923
+
1924
+ op = node.op
1925
+ if isinstance(op, ast.And):
1926
+ func = "&&"
1927
+ elif isinstance(op, ast.Or):
1928
+ func = "||"
1929
+ else:
1930
+ raise WarpCodegenKeyError(f"Op {op} is not supported")
1931
+
1932
+ return adj.add_bool_op(func, [adj.eval(expr) for expr in node.values])
1933
+
1934
+ def emit_Name(adj, node):
1935
+ # lookup symbol, if it has already been assigned to a variable then return the existing mapping
1936
+ if node.id in adj.symbols:
1937
+ return adj.symbols[node.id]
1938
+
1939
+ obj = adj.resolve_external_reference(node.id)
1940
+
1941
+ if obj is None:
1942
+ raise WarpCodegenKeyError("Referencing undefined symbol: " + str(node.id))
1943
+
1944
+ if warp._src.types.is_value(obj):
1945
+ # evaluate constant
1946
+ out = adj.add_constant(obj)
1947
+ adj.symbols[node.id] = out
1948
+ return out
1949
+
1950
+ # the named object is either a function, class name, or module
1951
+ # pass it back to the caller for processing
1952
+ if isinstance(obj, warp._src.context.Function):
1953
+ return obj
1954
+ if isinstance(obj, type):
1955
+ return obj
1956
+ if isinstance(obj, Struct):
1957
+ adj.builder.build_struct_recursive(obj)
1958
+ return obj
1959
+ if isinstance(obj, types.ModuleType):
1960
+ return obj
1961
+
1962
+ raise TypeError(f"Invalid external reference type: {type(obj)}")
1963
+
1964
+ @staticmethod
1965
+ def resolve_type_attribute(var_type: type, attr: str):
1966
+ if isinstance(var_type, type) and type_is_value(var_type):
1967
+ if attr == "dtype":
1968
+ return type_scalar_type(var_type)
1969
+ elif attr == "length":
1970
+ return type_size(var_type)
1971
+
1972
+ return getattr(var_type, attr, None)
1973
+
1974
+ def vector_component_index(adj, component, vector_type):
1975
+ if len(component) != 1:
1976
+ raise WarpCodegenAttributeError(f"Vector swizzle must be single character, got .{component}")
1977
+
1978
+ dim = vector_type._shape_[0]
1979
+ swizzles = "xyzw"[0:dim]
1980
+ if component not in swizzles:
1981
+ raise WarpCodegenAttributeError(
1982
+ f"Vector swizzle for {vector_type} must be one of {swizzles}, got {component}"
1983
+ )
1984
+
1985
+ index = swizzles.index(component)
1986
+ index = adj.add_constant(index)
1987
+ return index
1988
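+ # e.g. (illustrative): `.y` on a wp.vec3 value resolves to the constant
+ # component index 1, which is then passed to the extract() builtin.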
+
1989
+ def transform_component(adj, component):
1990
+ if len(component) != 1:
1991
+ raise WarpCodegenAttributeError(f"Transform attribute must be single character, got .{component}")
1992
+
1993
+ if component not in ("p", "q"):
1994
+ raise WarpCodegenAttributeError(f"Attribute for transformation must be either 'p' or 'q', got {component}")
1995
+
1996
+ return component
1997
+
1998
+ @staticmethod
1999
+ def is_differentiable_value_type(var_type):
2000
+ # checks that the argument type is a value type (i.e., not an array)
2001
+ # possibly holding differentiable values (for which gradients must be accumulated)
2002
+ return type_scalar_type(var_type) in float_types or isinstance(var_type, Struct)
2003
+
2004
+ def emit_Attribute(adj, node):
2005
+ if hasattr(node, "is_adjoint"):
2006
+ node.value.is_adjoint = True
2007
+
2008
+ aggregate = adj.eval(node.value)
2009
+
2010
+ try:
2011
+ if isinstance(aggregate, Var) and aggregate.constant is not None:
2012
+ # this case may occur when the attribute is a constant, e.g.: `IntEnum.A.value`
2013
+ return aggregate
2014
+
2015
+ if isinstance(aggregate, types.ModuleType) or isinstance(aggregate, type):
2016
+ out = getattr(aggregate, node.attr)
2017
+
2018
+ if warp._src.types.is_value(out):
2019
+ return adj.add_constant(out)
2020
+ if isinstance(out, (enum.IntEnum, enum.IntFlag)):
2021
+ return adj.add_constant(int(out))
2022
+
2023
+ return out
2024
+
2025
+ if hasattr(node, "is_adjoint"):
2026
+ # create a Var that points to the struct attribute, i.e.: directly generates `struct.attr` when used
2027
+ attr_name = aggregate.label + "." + node.attr
2028
+ attr_type = aggregate.type.vars[node.attr].type
2029
+
2030
+ return Var(attr_name, attr_type)
2031
+
2032
+ aggregate_type = strip_reference(aggregate.type)
2033
+
2034
+ # reading a vector or quaternion component
2035
+ if type_is_vector(aggregate_type) or type_is_quaternion(aggregate_type):
2036
+ index = adj.vector_component_index(node.attr, aggregate_type)
2037
+
2038
+ return adj.add_builtin_call("extract", [aggregate, index])
2039
+
2040
+ elif type_is_transformation(aggregate_type):
2041
+ component = adj.transform_component(node.attr)
2042
+
2043
+ if component == "p":
2044
+ return adj.add_builtin_call("transform_get_translation", [aggregate])
2045
+ else:
2046
+ return adj.add_builtin_call("transform_get_rotation", [aggregate])
2047
+
2048
+ else:
2049
+ attr_var = aggregate_type.vars[node.attr]
2050
+
2051
+ # represent pointer types as uint64
2052
+ if isinstance(attr_var.type, pointer_t):
2053
+ cast = f"({Var.dtype_to_ctype(uint64)}*)"
2054
+ adj_cast = f"({Var.dtype_to_ctype(attr_var.type.dtype)}*)"
2055
+ attr_type = Reference(uint64)
2056
+ else:
2057
+ cast = ""
2058
+ adj_cast = ""
2059
+ attr_type = Reference(attr_var.type)
2060
+
2061
+ attr = adj.add_var(attr_type)
2062
+
2063
+ if is_reference(aggregate.type):
2064
+ adj.add_forward(f"{attr.emit()} = {cast}&({aggregate.emit()}->{attr_var.label});")
2065
+ else:
2066
+ adj.add_forward(f"{attr.emit()} = {cast}&({aggregate.emit()}.{attr_var.label});")
2067
+
2068
+ if adj.is_differentiable_value_type(strip_reference(attr_type)):
2069
+ adj.add_reverse(f"{aggregate.emit_adj()}.{attr_var.label} += {adj_cast}{attr.emit_adj()};")
2070
+ else:
2071
+ adj.add_reverse(f"{aggregate.emit_adj()}.{attr_var.label} = {adj_cast}{attr.emit_adj()};")
2072
+
2073
+ return attr
2074
+
2075
+ except (KeyError, AttributeError) as e:
2076
+ # Try resolving as type attribute
2077
+ aggregate_type = strip_reference(aggregate.type) if isinstance(aggregate, Var) else aggregate
2078
+
2079
+ type_attribute = adj.resolve_type_attribute(aggregate_type, node.attr)
2080
+ if type_attribute is not None:
2081
+ return type_attribute
2082
+
2083
+ if isinstance(aggregate, Var):
2084
+ node_name = get_node_name_safe(node.value)
2085
+ raise WarpCodegenAttributeError(
2086
+ f"Error, `{node.attr}` is not an attribute of '{node_name}' ({type_repr(aggregate.type)})"
2087
+ ) from e
2088
+ raise WarpCodegenAttributeError(f"Error, `{node.attr}` is not an attribute of '{aggregate}'") from e
2089
+
2090
+ def emit_Assert(adj, node):
2091
+ # eval condition
2092
+ cond = adj.eval(node.test)
2093
+ cond = adj.load(cond)
2094
+
2095
+ source_segment = ast.get_source_segment(adj.source, node)
2096
+ # If a message was provided with the assert, quotation marks can interfere with the generated code
2097
+ escaped_segment = source_segment.replace('"', '\\"')
2098
+
2099
+ adj.add_forward(f'assert(("{escaped_segment}",{cond.emit()}));')
2100
+
2101
+ def emit_Constant(adj, node):
2102
+ if node.value is None:
2103
+ raise WarpCodegenTypeError("None type unsupported")
2104
+ else:
2105
+ return adj.add_constant(node.value)
2106
+
2107
+ def emit_BinOp(adj, node):
2108
+ # evaluate binary operator arguments
2109
+
2110
+ if warp._src.config.verify_autograd_array_access:
2111
+ # array overwrite tracking: in-place operators are a special case
2112
+ # x[tid] = x[tid] + 1 is a read followed by a write, but we only want to record the write
2113
+ # so we save the current arg read flags and restore them after lhs eval
2114
+ is_read_states = []
2115
+ for arg in adj.args:
2116
+ is_read_states.append(arg.is_read)
2117
+
2118
+ # evaluate lhs binary operator argument
2119
+ left = adj.eval(node.left)
2120
+
2121
+ if warp._src.config.verify_autograd_array_access:
2122
+ # restore arg read flags
2123
+ for i, arg in enumerate(adj.args):
2124
+ arg.is_read = is_read_states[i]
2125
+
2126
+ # evaluate rhs binary operator argument
2127
+ right = adj.eval(node.right)
2128
+
2129
+ name = builtin_operators[type(node.op)]
2130
+
2131
+ try:
2132
+ # Check if there is any user-defined overload for this operator
2133
+ user_func = adj.resolve_external_reference(name)
2134
+ if isinstance(user_func, warp._src.context.Function):
2135
+ return adj.add_call(user_func, (left, right), {}, {})
2136
+ except WarpCodegenError:
2137
+ pass
2138
+
2139
+ return adj.add_builtin_call(name, [left, right])
2140
+
2141
+ def emit_UnaryOp(adj, node):
2142
+ # evaluate unary op arguments
2143
+ arg = adj.eval(node.operand)
2144
+
2145
+ # evaluate expression to a compile-time constant if arg is a constant
2146
+ if arg.constant is not None and math.isfinite(arg.constant):
2147
+ if isinstance(node.op, ast.USub):
2148
+ return adj.add_constant(-arg.constant)
2149
+
2150
+ name = builtin_operators[type(node.op)]
2151
+
2152
+ return adj.add_builtin_call(name, [arg])
2153
+
2154
+ def materialize_redefinitions(adj, symbols):
2155
+ # detect symbols with conflicting definitions (assigned inside the for loop)
2156
+ for items in symbols.items():
2157
+ sym = items[0]
2158
+ if adj.is_constant_iter_symbol(sym):
2159
+ # ignore constant overwriting in for-loops if it is a loop iterator
2160
+ # (it is no problem to unroll static loops multiple times in sequence)
2161
+ continue
2162
+
2163
+ var1 = items[1]
2164
+ var2 = adj.symbols[sym]
2165
+
2166
+ if var1 != var2:
2167
+ if warp._src.config.verbose and not adj.custom_reverse_mode:
2168
+ lineno = adj.lineno + adj.fun_lineno
2169
+ line = adj.source_lines[adj.lineno]
2170
+ msg = f'Warning: detected mutated variable {sym} during a dynamic for-loop in function "{adj.fun_name}" at {adj.filename}:{lineno}: this may not be a differentiable operation.\n{line}\n'
2171
+ print(msg)
2172
+
2173
+ if var1.constant is not None:
2174
+ raise WarpCodegenError(
2175
+ f"Error mutating a constant {sym} inside a dynamic loop, use the following syntax: pi = float(3.141) to declare a dynamic variable"
2176
+ )
2177
+
2178
+ # overwrite the old variable value (violates SSA)
2179
+ adj.add_builtin_call("assign", [var1, var2])
2180
+
2181
+ # reset the symbol to point to the original variable
2182
+ adj.symbols[sym] = var1
2183
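+ # Sketch (illustrative): if `x` enters a dynamic loop as var_1 and the
+ # body rebinds it to a new SSA variable var_2, materialize_redefinitions()
+ # emits assign(var_1, var_2) so the next iteration observes the update,
+ # then points the symbol back at var_1, deliberately violating SSA for
+ # the loop-carried value.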
+
2184
+ def emit_While(adj, node):
2185
+ adj.begin_while(node.test)
2186
+
2187
+ adj.loop_symbols.append(adj.symbols.copy())
2188
+
2189
+ # eval body
2190
+ for s in node.body:
2191
+ adj.eval(s)
2192
+
2193
+ adj.materialize_redefinitions(adj.loop_symbols[-1])
2194
+ adj.loop_symbols.pop()
2195
+
2196
+ adj.end_while()
2197
+
2198
+ def eval_num(adj, a):
2199
+ if isinstance(a, ast.Constant):
2200
+ return True, a.value
2201
+ if isinstance(a, ast.UnaryOp) and isinstance(a.op, ast.USub) and isinstance(a.operand, ast.Constant):
2202
+ # Negative constant
2203
+ return True, -a.operand.value
2204
+
2205
+ # try and resolve the expression to an object
2206
+ # e.g.: wp.constant in the globals scope
2207
+ obj, _ = adj.resolve_static_expression(a)
2208
+
2209
+ if obj is None:
2210
+ obj = adj.eval(a)
2211
+
2212
+ if isinstance(obj, Var) and obj.constant is not None:
2213
+ obj = obj.constant
2214
+
2215
+ return warp._src.types.is_int(obj), obj
2216
+
2217
+ # detects whether a loop contains a break (or continue) statement
2218
+ def contains_break(adj, body):
2219
+ for s in body:
2220
+ if isinstance(s, ast.Break):
2221
+ return True
2222
+ elif isinstance(s, ast.Continue):
2223
+ return True
2224
+ elif isinstance(s, ast.If):
2225
+ if adj.contains_break(s.body):
2226
+ return True
2227
+ if adj.contains_break(s.orelse):
2228
+ return True
2229
+ else:
2230
+ # note that nested for or while loops containing a break statement
2231
+ # do not affect the current loop
2232
+ pass
2233
+
2234
+ return False
2235
+
2236
+ # returns a constant range() if unrollable, otherwise None
2237
+ def get_unroll_range(adj, loop):
2238
+ if (
2239
+ not isinstance(loop.iter, ast.Call)
2240
+ or not isinstance(loop.iter.func, ast.Name)
2241
+ or loop.iter.func.id != "range"
2242
+ or len(loop.iter.args) == 0
2243
+ or len(loop.iter.args) > 3
2244
+ ):
2245
+ return None
2246
+
2247
+ # if all range() arguments are numeric constants we will unroll
2248
+ # note that this only handles trivial constants; it will not unroll
2249
+ # constant compile-time expressions e.g.: range(0, 3*2)
2250
+
2251
+ # Evaluate the arguments and check that they are numeric constants
2252
+ # It is important to do this in one pass, so that if evaluating these arguments has side effects
2253
+ # the code does not get generated more than once
2254
+ range_args = [adj.eval_num(arg) for arg in loop.iter.args]
2255
+ arg_is_numeric, arg_values = zip(*range_args)
2256
+
2257
+ if all(arg_is_numeric):
2258
+ # All arguments are numeric constants
2259
+
2260
+ # range(end)
2261
+ if len(loop.iter.args) == 1:
2262
+ start = 0
2263
+ end = arg_values[0]
2264
+ step = 1
2265
+
2266
+ # range(start, end)
2267
+ elif len(loop.iter.args) == 2:
2268
+ start = arg_values[0]
2269
+ end = arg_values[1]
2270
+ step = 1
2271
+
2272
+ # range(start, end, step)
2273
+ elif len(loop.iter.args) == 3:
2274
+ start = arg_values[0]
2275
+ end = arg_values[1]
2276
+ step = arg_values[2]
2277
+
2278
+ # test if we're above max unroll count
2279
+ max_iters = abs(end - start) // abs(step)
2280
+
2281
+ if "max_unroll" in adj.builder_options:
2282
+ max_unroll = adj.builder_options["max_unroll"]
2283
+ else:
2284
+ max_unroll = warp._src.config.max_unroll
2285
+
2286
+ ok_to_unroll = True
2287
+
2288
+ if max_iters > max_unroll:
2289
+ if warp._src.config.verbose:
2290
+ print(
2291
+ f"Warning: fixed-size loop count of {max_iters} is larger than the module 'max_unroll' limit of {max_unroll}, will generate dynamic loop."
2292
+ )
2293
+ ok_to_unroll = False
2294
+
2295
+ elif adj.contains_break(loop.body):
2296
+ if warp._src.config.verbose:
2297
+ print("Warning: 'break' or 'continue' found in loop body, will generate dynamic loop.")
2298
+ ok_to_unroll = False
2299
+
2300
+ if ok_to_unroll:
2301
+ return range(start, end, step)
2302
+
2303
+ # Unroll is not possible; the range needs to be evaluated dynamically
2304
+ range_call = adj.add_builtin_call(
2305
+ "range",
2306
+ [adj.add_constant(val) if is_numeric else val for is_numeric, val in range_args],
2307
+ )
2308
+ return range_call
2309
+
2310
+ def record_constant_iter_symbol(adj, sym):
2311
+ adj.loop_const_iter_symbols.add(sym)
2312
+
2313
+ def is_constant_iter_symbol(adj, sym):
2314
+ return sym in adj.loop_const_iter_symbols
2315
+
2316
+ def emit_For(adj, node):
2317
+ # try and unroll simple range() statements that use constant args
2318
+ unroll_range = adj.get_unroll_range(node)
2319
+
2320
+ if isinstance(unroll_range, range):
2321
+ const_iter_sym = node.target.id
2322
+ # prevent constant conflicts in `materialize_redefinitions()`
2323
+ adj.record_constant_iter_symbol(const_iter_sym)
2324
+
2325
+ # unroll static for-loop
2326
+ for i in unroll_range:
2327
+ const_iter = adj.add_constant(i)
2328
+ adj.symbols[const_iter_sym] = const_iter
2329
+
2330
+ # eval body
2331
+ for s in node.body:
2332
+ adj.eval(s)
2333
+
2334
+ # otherwise generate a dynamic loop
2335
+ else:
2336
+ # evaluate the Iterable -- only if not previously evaluated when trying to unroll
2337
+ if unroll_range is not None:
2338
+ # Range has already been evaluated when trying to unroll, do not re-evaluate
2339
+ iter = unroll_range
2340
+ else:
2341
+ iter = adj.eval(node.iter)
2342
+
2343
+ adj.symbols[node.target.id] = adj.begin_for(iter)
2344
+
2345
+ # for loops should be side-effect free; here we store a copy
2346
+ adj.loop_symbols.append(adj.symbols.copy())
2347
+
2348
+ # eval body
2349
+ for s in node.body:
2350
+ adj.eval(s)
2351
+
2352
+ adj.materialize_redefinitions(adj.loop_symbols[-1])
2353
+ adj.loop_symbols.pop()
2354
+
2355
+ adj.end_for(iter)
2356
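+ # Illustrative example: `for i in range(3):` with a body under the
+ # max_unroll limit is emitted three times with i bound to the constants
+ # 0, 1 and 2, while `for i in range(n):` (n not a compile-time constant)
+ # falls back to the dynamic begin_for()/end_for() path above.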
+
2357
+ def emit_Break(adj, node):
2358
+ adj.materialize_redefinitions(adj.loop_symbols[-1])
2359
+
2360
+ adj.add_forward(f"goto end_{adj.loop_blocks[-1].label};")
2361
+
2362
+ def emit_Continue(adj, node):
2363
+ adj.materialize_redefinitions(adj.loop_symbols[-1])
2364
+
2365
+ adj.add_forward(f"goto start_{adj.loop_blocks[-1].label};")
2366
+
2367
+ def emit_Expr(adj, node):
2368
+ return adj.eval(node.value)
2369
+
2370
+ def check_tid_in_func_error(adj, node):
2371
+ if adj.is_user_function:
2372
+ if hasattr(node.func, "attr") and node.func.attr == "tid":
2373
+ lineno = adj.lineno + adj.fun_lineno
2374
+ line = adj.source_lines[adj.lineno]
2375
+ raise WarpCodegenError(
2376
+ "tid() may only be called from a Warp kernel, not a Warp function. "
2377
+ "Instead, obtain the indices from a @wp.kernel and pass them as "
2378
+ f"arguments to the function {adj.fun_name}, {adj.filename}:{lineno}:\n{line}\n"
2379
+ )
2380
+
2381
+ def resolve_arg(adj, arg):
2382
+ # Always try to start with evaluating the argument since it can help
2383
+ # detecting some issues such as global variables being accessed.
2384
+ try:
2385
+ var = adj.eval(arg)
2386
+ except (WarpCodegenError, WarpCodegenKeyError) as e:
2387
+ error = e
2388
+ else:
2389
+ error = None
2390
+
2391
+ # Check if we can resolve the argument as a static expression.
2392
+ # If not, return the variable resulting from evaluating the argument.
2393
+ expr, _ = adj.resolve_static_expression(arg)
2394
+ if expr is None:
2395
+ if error is not None:
2396
+ raise error
2397
+
2398
+ return var
2399
+
2400
+ if isinstance(expr, (type, Struct, Var, warp._src.context.Function)):
2401
+ return expr
2402
+
2403
+ if isinstance(expr, (enum.IntEnum, enum.IntFlag)):
2404
+ return adj.add_constant(int(expr))
2405
+
2406
+ return adj.add_constant(expr)
2407
+
2408
+ def emit_Call(adj, node):
2409
+ adj.check_tid_in_func_error(node)
2410
+
2411
+ # try and lookup function in globals by
2412
+ # resolving path (e.g.: module.submodule.attr)
2413
+ if hasattr(node.func, "warp_func"):
2414
+ func = node.func.warp_func
2415
+ path = []
2416
+ else:
2417
+ func, path = adj.resolve_static_expression(node.func)
2418
+ if func is None:
2419
+ func = adj.eval(node.func)
2420
+
2421
+ if adj.is_static_expression(func):
2422
+ # try to evaluate wp.static() expressions
2423
+ obj, code = adj.evaluate_static_expression(node)
2424
+ if obj is not None:
2425
+ adj.static_expressions[code] = obj
2426
+ if isinstance(obj, warp._src.context.Function):
2427
+ # special handling for wp.static() evaluating to a function
2428
+ return obj
2429
+ else:
2430
+ out = adj.add_constant(obj)
2431
+ return out
2432
+
2433
+ type_args = {}
2434
+
2435
+ if len(path) > 0 and not isinstance(func, warp._src.context.Function):
2436
+ attr = path[-1]
2437
+ caller = func
2438
+ func = None
2439
+
2440
+ # try and lookup function name in builtins (e.g.: using `dot` directly without wp prefix)
2441
+ if attr in warp._src.context.builtin_functions:
2442
+ func = warp._src.context.builtin_functions[attr]
2443
+
2444
+ # vector class type e.g.: wp.vec3f constructor
2445
+ if func is None and hasattr(caller, "_wp_generic_type_str_"):
2446
+ func = warp._src.context.builtin_functions.get(caller._wp_constructor_)
2447
+
2448
+ # scalar class type e.g.: wp.int8 constructor
2449
+ if func is None and hasattr(caller, "__name__") and caller.__name__ in warp._src.context.builtin_functions:
2450
+ func = warp._src.context.builtin_functions.get(caller.__name__)
2451
+
2452
+ # struct constructor
2453
+ if func is None and isinstance(caller, Struct):
2454
+ if adj.builder is not None:
2455
+ adj.builder.build_struct_recursive(caller)
2456
+ if node.args or node.keywords:
2457
+ func = caller.value_constructor
2458
+ else:
2459
+ func = caller.default_constructor
2460
+
2461
+ # lambda function
2462
+ if func is None and getattr(caller, "__name__", None) == "<lambda>":
2463
+ raise NotImplementedError("Lambda expressions are not yet supported")
2464
+
2465
+ if hasattr(caller, "_wp_type_args_"):
2466
+ type_args = caller._wp_type_args_
2467
+
2468
+ if func is None:
2469
+ raise WarpCodegenError(
2470
+ f"Could not find function {'.'.join(path)} as a built-in or user-defined function. Note that user functions must be annotated with a @wp.func decorator to be called from a kernel."
2471
+ )
2472
+
2473
+ # get expected return count, e.g.: for multi-assignment
2474
+ min_outputs = None
2475
+ if hasattr(node, "expects"):
2476
+ min_outputs = node.expects
2477
+
2478
+ # Evaluate all positional and keywords arguments.
2479
+ args = tuple(adj.resolve_arg(x) for x in node.args)
2480
+ kwargs = {x.arg: adj.resolve_arg(x.value) for x in node.keywords}
2481
+
2482
+ out = adj.add_call(func, args, kwargs, type_args, min_outputs=min_outputs)
2483
+
2484
+ if warp._src.config.verify_autograd_array_access:
2485
+ # Extract the types and values passed as arguments to the function call.
2486
+ arg_types = tuple(strip_reference(get_arg_type(x)) for x in args)
2487
+ kwarg_types = {k: strip_reference(get_arg_type(v)) for k, v in kwargs.items()}
2488
+
2489
+ # Resolve the exact function signature among any existing overload.
2490
+ resolved_func = adj.resolve_func(func, arg_types, kwarg_types, min_outputs)
2491
+
2492
+ # update arg read/write states according to what happens to that arg in the called function
2493
+ if hasattr(resolved_func, "adj"):
2494
+ for i, arg in enumerate(args):
2495
+ if resolved_func.adj.args[i].is_write:
2496
+ kernel_name = adj.fun_name
2497
+ filename = adj.filename
2498
+ lineno = adj.lineno + adj.fun_lineno
2499
+ arg.mark_write(kernel_name=kernel_name, filename=filename, lineno=lineno)
2500
+ if resolved_func.adj.args[i].is_read:
2501
+ arg.mark_read()
2502
+
2503
+ return out
2504
+
2505
+ def emit_Index(adj, node):
2506
+ # the ast.Index node appears in Python 3.8 and earlier
2507
+ # when performing array subscripts, e.g.: x = arr[i],
2508
+ # but from Python 3.9 onwards it does not appear
2509
+
2510
+ if hasattr(node, "is_adjoint"):
2511
+ node.value.is_adjoint = True
2512
+
2513
+ return adj.eval(node.value)
2514
+
2515
+ def eval_indices(adj, target_type, indices):
2516
+ nodes = indices
2517
+ if hasattr(target_type, "_wp_generic_type_hint_"):
2518
+ indices = []
2519
+ for dim, node in enumerate(nodes):
2520
+ if isinstance(node, ast.Slice):
2521
+ # In the context of slicing a vec/mat type, indices are expected
2522
+ # to be compile-time constants, hence we can infer the actual slice
2523
+ # bounds also at compile-time.
2524
+ length = target_type._shape_[dim]
2525
+ step = 1 if node.step is None else adj.eval(node.step).constant
2526
+
2527
+ if node.lower is None:
2528
+ start = length - 1 if step < 0 else 0
2529
+ else:
2530
+ start = adj.eval(node.lower).constant
2531
+ start = min(max(start, -length), length)
2532
+ start = start + length if start < 0 else start
2533
+
2534
+ if node.upper is None:
2535
+ stop = -1 if step < 0 else length
2536
+ else:
2537
+ stop = adj.eval(node.upper).constant
2538
+ stop = min(max(stop, -length), length)
2539
+ stop = stop + length if stop < 0 else stop
2540
+
2541
+ slice = adj.add_builtin_call("slice", (start, stop, step))
2542
+ indices.append(slice)
2543
+ else:
2544
+ indices.append(adj.eval(node))
2545
+
2546
+ return tuple(indices)
2547
+ else:
2548
+ return tuple(adj.eval(x) for x in nodes)
2549
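+ # Worked example (hypothetical): for a vec4 `v[::-1]`, length = 4 and
+ # step = -1, so start defaults to length - 1 = 3 and stop defaults to -1,
+ # producing slice(3, -1, -1), i.e. the component order w, z, y, x.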
+
2550
+ def emit_indexing(adj, target, indices):
2551
+ target_type = strip_reference(target.type)
2552
+ indices = adj.eval_indices(target_type, indices)
2553
+
2554
+ if is_array(target_type):
2555
+ if len(indices) == target_type.ndim and all(
2556
+ warp._src.types.type_is_int(strip_reference(x.type)) for x in indices
2557
+ ):
2558
+ # handles array loads (where each dimension has an index specified)
2559
+ out = adj.add_builtin_call("address", [target, *indices])
2560
+
2561
+ if warp._src.config.verify_autograd_array_access:
2562
+ target.mark_read()
2563
+
2564
+ else:
2565
+ if isinstance(target_type, warp._src.types.array):
2566
+ # In order to reduce the number of overloads needed in the C
2567
+ # implementation to support combinations of int/slice indices,
2568
+ # we convert all integer indices into slices, and set their
2569
+ # step to 0 if they are representing an integer index.
2570
+ new_indices = []
2571
+ for idx in indices:
2572
+ if not warp._src.types.is_slice(strip_reference(idx.type)):
2573
+ new_idx = adj.add_builtin_call("slice", (idx, idx, 0))
2574
+ new_indices.append(new_idx)
2575
+ else:
2576
+ new_indices.append(idx)
2577
+
2578
+ indices = new_indices
2579
+
2580
+ # handles array views (fewer indices than dimensions)
2581
+ out = adj.add_builtin_call("view", [target, *indices])
2582
+
2583
+ if warp._src.config.verify_autograd_array_access:
2584
+ # store reference to target Var to propagate downstream read/write state back to root arg Var
2585
+ out.parent = target
2586
+
2587
+ # view arg inherits target Var's read/write states
2588
+ out.is_read = target.is_read
2589
+ out.is_write = target.is_write
2590
+
2591
+ elif is_tile(target_type):
2592
+ if len(indices) >= len(target_type.shape): # equality for scalars, inequality for composite types
2593
+ # handles extracting a single element from a tile
2594
+ out = adj.add_builtin_call("tile_extract", [target, *indices])
2595
+ elif len(indices) < len(target_type.shape):
2596
+ # handles tile views
2597
+ out = adj.add_builtin_call("tile_view", [target, indices])
2598
+ else:
2599
+ raise RuntimeError(
2600
+ f"Incorrect number of indices specified for a tile view/extract, got {len(indices)} indices for a {len(target_type.shape)} dimensional tile."
2601
+ )
2602
+
2603
+ else:
2604
+ # handles non-array type indexing, e.g: vec3, mat33, etc
2605
+ out = adj.add_builtin_call("extract", [target, *indices])
2606
+
2607
+ return out
2608
+
2609
+ # from a list of lists of indices, strip the first `count` indices
2610
+ @staticmethod
2611
+ def strip_indices(indices, count):
2612
+ dim = count
2613
+ while count > 0:
2614
+ ij = indices[0]
2615
+ indices = indices[1:]
2616
+ count -= len(ij)
2617
+
2618
+ # report straddling like in `arr2d[0][1,2]` as a syntax error
2619
+ if count < 0:
2620
+ raise WarpCodegenError(
2621
+ f"Incorrect number of indices specified for array indexing, got {dim - count} indices for a {dim} dimensional array."
2622
+ )
2623
+
2624
+ return indices
2625
+
2626
+ def recurse_subscript(adj, node, indices):
2627
+ if isinstance(node, ast.Name):
2628
+ target = adj.eval(node)
2629
+ return target, indices
2630
+
2631
+ if isinstance(node, ast.Subscript):
2632
+ if hasattr(node.value, "attr") and node.value.attr == "adjoint":
2633
+ return adj.eval(node), indices
2634
+
2635
+ if isinstance(node.slice, ast.Tuple):
2636
+ ij = node.slice.elts
2637
+ elif isinstance(node.slice, ast.Index) and isinstance(node.slice.value, ast.Tuple):
2638
+ # The node `ast.Index` is deprecated in Python 3.9.
2639
+ ij = node.slice.value.elts
2640
+ elif isinstance(node.slice, ast.ExtSlice):
2641
+ # The node `ast.ExtSlice` is deprecated in Python 3.9.
2642
+ ij = node.slice.dims
2643
+ else:
2644
+ ij = [node.slice]
2645
+
2646
+ indices = [ij, *indices] # prepend
2647
+
2648
+ target, indices = adj.recurse_subscript(node.value, indices)
2649
+
2650
+ target_type = strip_reference(target.type)
2651
+ if is_array(target_type):
2652
+ flat_indices = [i for ij in indices for i in ij]
2653
+ if len(flat_indices) > target_type.ndim:
2654
+ target = adj.emit_indexing(target, flat_indices[: target_type.ndim])
2655
+ indices = adj.strip_indices(indices, target_type.ndim)
2656
+
2657
+ return target, indices
2658
+
2659
+ target = adj.eval(node)
2660
+ return target, indices
2661
+
2662
+ # returns the object being indexed, and the list of indices
2663
+ def eval_subscript(adj, node):
2664
+ target, indices = adj.recurse_subscript(node, [])
2665
+ flat_indices = [i for ij in indices for i in ij]
2666
+ return target, flat_indices
2667
+
2668
+ def emit_Subscript(adj, node):
2669
+ if hasattr(node.value, "attr") and node.value.attr == "adjoint":
2670
+ # handle adjoint of a variable, i.e. wp.adjoint[var]
2671
+ node.slice.is_adjoint = True
2672
+ var = adj.eval(node.slice)
2673
+ var_name = var.label
2674
+ var = Var(f"adj_{var_name}", type=var.type, constant=None, prefix=False)
2675
+ return var
2676
+
2677
+ target, indices = adj.eval_subscript(node)
2678
+
2679
+ return adj.emit_indexing(target, indices)
2680
+
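+ # Illustrative sketch of the wp.adjoint[...] path above, as it would appear
+ # in a custom gradient function (hypothetical user code):
+ #
+ #   @wp.func_grad(my_func)
+ #   def my_func_grad(x: float, adj_ret: float):
+ #       wp.adjoint[x] += adj_ret * 2.0   # resolves to the generated adj_x variable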
2681
+ def emit_Slice(adj, node):
2682
+ start = SLICE_BEGIN if node.lower is None else adj.eval(node.lower)
2683
+ stop = SLICE_END if node.upper is None else adj.eval(node.upper)
2684
+ step = 1 if node.step is None else adj.eval(node.step)
2685
+ return adj.add_builtin_call("slice", (start, stop, step))
2686
+
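+ # Illustrative sketch: `v[2:]` yields slice(2, SLICE_END, 1) and `v[::-1]`
+ # yields slice(SLICE_BEGIN, SLICE_END, -1); the sentinels mark omitted
+ # bounds, leaving it to later stages to substitute the sliced dimension's
+ # actual extent.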
2687
+ def emit_Assign(adj, node):
2688
+ if len(node.targets) != 1:
2689
+ raise WarpCodegenError("Assigning the same value to multiple variables is not supported")
2690
+
2691
+ # Check if the rhs corresponds to an unsupported construct.
2692
+ # Tuples are supported in the context of assigning multiple variables
2693
+ # at once, but not for simple assignments like `x = (1, 2, 3)`.
2694
+ # Therefore, we need to catch this specific case here instead of
2695
+ # more generally in `adj.eval()`.
2696
+ if isinstance(node.value, ast.List):
2697
+ raise WarpCodegenError(
2698
+ "List constructs are not supported in kernels. Use vectors like `wp.vec3()` for small collections instead."
2699
+ )
2700
+
2701
+ lhs = node.targets[0]
2702
+
2703
+ if isinstance(lhs, ast.Tuple) and isinstance(node.value, ast.Call):
2704
+ # record the expected number of outputs on the node
2705
+ # we do this so we can decide which function to
2706
+ # call based on the number of expected outputs
2707
+ node.value.expects = len(lhs.elts)
2708
+
2709
+ # evaluate rhs
2710
+ if isinstance(lhs, ast.Tuple) and isinstance(node.value, ast.Tuple):
2711
+ rhs = [adj.eval(v) for v in node.value.elts]
2712
+ else:
2713
+ rhs = adj.eval(node.value)
2714
+
2715
+ # handle the case where we are assigning multiple output variables
2716
+ if isinstance(lhs, ast.Tuple):
2717
+ subtype = getattr(rhs, "type", None)
2718
+
2719
+ if isinstance(subtype, warp._src.types.tuple_t):
2720
+ if len(rhs.type.types) != len(lhs.elts):
2721
+ raise WarpCodegenError(
2722
+ f"Invalid number of values to unpack (expected {len(lhs.elts)}, got {len(rhs.type.types)})."
2723
+ )
2724
+ rhs = tuple(adj.add_builtin_call("extract", (rhs, adj.add_constant(i))) for i in range(len(lhs.elts)))
2725
+
2726
+ names = []
2727
+ for v in lhs.elts:
2728
+ if isinstance(v, ast.Name):
2729
+ names.append(v.id)
2730
+ else:
2731
+ raise WarpCodegenError(
2732
+ "Multiple return functions can only assign to simple variables, e.g.: x, y = func()"
2733
+ )
2734
+
2735
+ if len(names) != len(rhs):
2736
+ raise WarpCodegenError(
2737
+ f"Multiple return functions need to receive all their output values, incorrect number of values to unpack (expected {len(rhs)}, got {len(names)})"
2738
+ )
2739
+
2740
+ out = rhs
2741
+ for name, rhs in zip(names, out):
2742
+ if name in adj.symbols:
2743
+ if not types_equal(rhs.type, adj.symbols[name].type):
2744
+ raise WarpCodegenTypeError(
2745
+ f"Error, assigning to existing symbol {name} ({adj.symbols[name].type}) with different type ({rhs.type})"
2746
+ )
2747
+
2748
+ adj.symbols[name] = rhs
2749
+
2750
+ # handles the case where we are assigning to an array index (e.g.: arr[i] = 2.0)
2751
+ elif isinstance(lhs, ast.Subscript):
2752
+ if hasattr(lhs.value, "attr") and lhs.value.attr == "adjoint":
2753
+ # handle adjoint of a variable, i.e. wp.adjoint[var]
2754
+ lhs.slice.is_adjoint = True
2755
+ src_var = adj.eval(lhs.slice)
2756
+ var = Var(f"adj_{src_var.label}", type=src_var.type, constant=None, prefix=False)
2757
+ adj.add_forward(f"{var.emit()} = {rhs.emit()};")
2758
+ return
2759
+
2760
+ target, indices = adj.eval_subscript(lhs)
2761
+
2762
+ target_type = strip_reference(target.type)
2763
+ indices = adj.eval_indices(target_type, indices)
2764
+
2765
+ if is_array(target_type):
2766
+ adj.add_builtin_call("array_store", [target, *indices, rhs])
2767
+
2768
+ if warp._src.config.verify_autograd_array_access:
2769
+ kernel_name = adj.fun_name
2770
+ filename = adj.filename
2771
+ lineno = adj.lineno + adj.fun_lineno
2772
+
2773
+ target.mark_write(kernel_name=kernel_name, filename=filename, lineno=lineno)
2774
+
2775
+ elif is_tile(target_type):
2776
+ adj.add_builtin_call("assign", [target, *indices, rhs])
2777
+
2778
+ elif (
2779
+ type_is_vector(target_type)
2780
+ or type_is_quaternion(target_type)
2781
+ or type_is_matrix(target_type)
2782
+ or type_is_transformation(target_type)
2783
+ ):
2784
+ # recursively unwind AST, stopping at penultimate node
2785
+ root = lhs
2786
+ while hasattr(root.value, "value"):
2787
+ root = root.value
2788
+ # lhs is updating a variable adjoint (i.e. wp.adjoint[var])
2789
+ if hasattr(root, "attr") and root.attr == "adjoint":
2790
+ attr = adj.add_builtin_call("index", [target, *indices])
2791
+ adj.add_builtin_call("store", [attr, rhs])
2792
+ return
2793
+
2794
+ # TODO: array vec component case
2795
+ if is_reference(target.type):
2796
+ attr = adj.add_builtin_call("indexref", [target, *indices])
2797
+ adj.add_builtin_call("store", [attr, rhs])
2798
+
2799
+ if warp._src.config.verbose and not adj.custom_reverse_mode:
2800
+ lineno = adj.lineno + adj.fun_lineno
2801
+ line = adj.source_lines[adj.lineno]
2802
+ node_source = adj.get_node_source(lhs.value)
2803
+ print(
2804
+ f"Warning: mutating {node_source} in function {adj.fun_name} at {adj.filename}:{lineno}: this is a non-differentiable operation.\n{line}\n"
2805
+ )
2806
+ else:
2807
+ if warp._src.config.enable_vector_component_overwrites:
2808
+ out = adj.add_builtin_call("assign_copy", [target, *indices, rhs])
2809
+
2810
+ # re-point target symbol to out var
2811
+ for id in adj.symbols:
2812
+ if adj.symbols[id] == target:
2813
+ adj.symbols[id] = out
2814
+ break
2815
+ else:
2816
+ adj.add_builtin_call("assign_inplace", [target, *indices, rhs])
2817
+
2818
+ else:
2819
+ raise WarpCodegenError(
2820
+ f"Can only subscript assign array, vector, quaternion, transformation, and matrix types, got {target_type}"
2821
+ )
2822
+
2823
+ elif isinstance(lhs, ast.Name):
2824
+ # symbol name
2825
+ name = lhs.id
2826
+
2827
+ # check type matches if symbol already defined
2828
+ if name in adj.symbols:
2829
+ if not types_equal(strip_reference(rhs.type), adj.symbols[name].type):
2830
+ raise WarpCodegenTypeError(
2831
+ f"Error, assigning to existing symbol {name} ({adj.symbols[name].type}) with different type ({rhs.type})"
2832
+ )
2833
+
2834
+ if isinstance(node.value, ast.Tuple):
2835
+ out = rhs
2836
+ elif isinstance(rhs, Sequence):
2837
+ out = adj.add_builtin_call("tuple", rhs)
2838
+ elif isinstance(node.value, ast.Name) or is_reference(rhs.type):
2839
+ out = adj.add_builtin_call("copy", [rhs])
2840
+ else:
2841
+ out = rhs
2842
+
2843
+ # update symbol map (assumes lhs is a Name node)
2844
+ adj.symbols[name] = out
2845
+
2846
+ elif isinstance(lhs, ast.Attribute):
2847
+ aggregate = adj.eval(lhs.value)
2848
+ aggregate_type = strip_reference(aggregate.type)
2849
+
2850
+ # assigning to a vector or quaternion component
2851
+ if type_is_vector(aggregate_type) or type_is_quaternion(aggregate_type):
2852
+ index = adj.vector_component_index(lhs.attr, aggregate_type)
2853
+
2854
+ if is_reference(aggregate.type):
2855
+ attr = adj.add_builtin_call("indexref", [aggregate, index])
2856
+ adj.add_builtin_call("store", [attr, rhs])
2857
+ else:
2858
+ if warp._src.config.enable_vector_component_overwrites:
2859
+ out = adj.add_builtin_call("assign_copy", [aggregate, index, rhs])
2860
+
2861
+ # re-point target symbol to out var
2862
+ for id in adj.symbols:
2863
+ if adj.symbols[id] == aggregate:
2864
+ adj.symbols[id] = out
2865
+ break
2866
+ else:
2867
+ adj.add_builtin_call("assign_inplace", [aggregate, index, rhs])
2868
+
2869
+ elif type_is_transformation(aggregate_type):
2870
+ component = adj.transform_component(lhs.attr)
2871
+
2872
+ # TODO: x[i,j].p = rhs case
2873
+ if is_reference(aggregate.type):
2874
+ raise WarpCodegenError(f"Error, assigning transform attribute {component} to an array element")
2875
+
2876
+ if component == "p":
2877
+ return adj.add_builtin_call("transform_set_translation", [aggregate, rhs])
2878
+ else:
2879
+ return adj.add_builtin_call("transform_set_rotation", [aggregate, rhs])
2880
+
2881
+ else:
2882
+ attr = adj.emit_Attribute(lhs)
2883
+ if is_reference(attr.type):
2884
+ adj.add_builtin_call("store", [attr, rhs])
2885
+ else:
2886
+ adj.add_builtin_call("assign", [attr, rhs])
2887
+
2888
+ if warp._src.config.verbose and not adj.custom_reverse_mode:
2889
+ lineno = adj.lineno + adj.fun_lineno
2890
+ line = adj.source_lines[adj.lineno]
2891
+ msg = f'Warning: detected mutated struct {attr.label} during function "{adj.fun_name}" at {adj.filename}:{lineno}: this is a non-differentiable operation.\n{line}\n'
2892
+ print(msg)
2893
+
2894
+ else:
2895
+ raise WarpCodegenError("Error, unsupported assignment statement.")
2896
+
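+ # Illustrative sketch of the tuple-assignment path above (hypothetical user
+ # code, assuming `import warp as wp`):
+ #
+ #   @wp.func
+ #   def minmax(a: float, b: float):
+ #       return wp.min(a, b), wp.max(a, b)
+ #
+ #   @wp.kernel
+ #   def k(vals: wp.array(dtype=float)):
+ #       lo, hi = minmax(vals[0], vals[1])   # emit_Assign unpacks both outputs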
2897
+ def emit_Return(adj, node):
2898
+ if node.value is None:
2899
+ var = None
2900
+ elif isinstance(node.value, ast.Tuple):
2901
+ var = tuple(adj.eval(arg) for arg in node.value.elts)
2902
+ else:
2903
+ var = adj.eval(node.value)
2904
+ if not isinstance(var, list) and not isinstance(var, tuple):
2905
+ var = (var,)
2906
+
2907
+ if adj.return_var is not None:
2908
+ old_ctypes = tuple(v.ctype(value_type=True) for v in adj.return_var)
2909
+ new_ctypes = tuple(v.ctype(value_type=True) for v in var)
2910
+ if old_ctypes != new_ctypes:
2911
+ raise WarpCodegenTypeError(
2912
+ f"Error, function returned different types, previous: [{', '.join(old_ctypes)}], new [{', '.join(new_ctypes)}]"
2913
+ )
2914
+
2915
+ if var is not None:
2916
+ adj.return_var = ()
2917
+ for ret in var:
2918
+ if is_reference(ret.type):
2919
+ ret_var = adj.add_builtin_call("copy", [ret])
2920
+ else:
2921
+ ret_var = ret
2922
+ adj.return_var += (ret_var,)
2923
+
2924
+ adj.add_return(adj.return_var)
2925
+
2926
+ def emit_AugAssign(adj, node):
2927
+ lhs = node.target
2928
+
2929
+ # replace augmented assignment with assignment statement + binary op (default behaviour)
2930
+ def make_new_assign_statement():
2931
+ new_node = ast.Assign(targets=[lhs], value=ast.BinOp(lhs, node.op, node.value))
2932
+ adj.eval(new_node)
2933
+
2934
+ rhs = adj.eval(node.value)
2935
+
2936
+ if isinstance(lhs, ast.Subscript):
2937
+ # wp.adjoint[var] appears in custom grad functions, and does not require
2938
+ # special consideration in the AugAssign case
2939
+ if hasattr(lhs.value, "attr") and lhs.value.attr == "adjoint":
2940
+ make_new_assign_statement()
2941
+ return
2942
+
2943
+ target, indices = adj.eval_subscript(lhs)
2944
+
2945
+ target_type = strip_reference(target.type)
2946
+ indices = adj.eval_indices(target_type, indices)
2947
+
2948
+ if is_array(target_type):
2949
+ # target types int8, uint8, int16, and uint16 are not suitable for atomic array accumulation
2950
+ if target_type.dtype in warp._src.types.non_atomic_types:
2951
+ make_new_assign_statement()
2952
+ return
2953
+
2954
+ # the same holds true for vecs/mats/quats that are composed of these types
2955
+ if (
2956
+ type_is_vector(target_type.dtype)
2957
+ or type_is_quaternion(target_type.dtype)
2958
+ or type_is_matrix(target_type.dtype)
2959
+ or type_is_transformation(target_type.dtype)
2960
+ ):
2961
+ dtype = getattr(target_type.dtype, "_wp_scalar_type_", None)
2962
+ if dtype in warp._src.types.non_atomic_types:
2963
+ make_new_assign_statement()
2964
+ return
2965
+
2966
+ kernel_name = adj.fun_name
2967
+ filename = adj.filename
2968
+ lineno = adj.lineno + adj.fun_lineno
2969
+
2970
+ if isinstance(node.op, ast.Add):
2971
+ adj.add_builtin_call("atomic_add", [target, *indices, rhs])
2972
+
2973
+ if warp._src.config.verify_autograd_array_access:
2974
+ target.mark_write(kernel_name=kernel_name, filename=filename, lineno=lineno)
2975
+
2976
+ elif isinstance(node.op, ast.Sub):
2977
+ adj.add_builtin_call("atomic_sub", [target, *indices, rhs])
2978
+
2979
+ if warp._src.config.verify_autograd_array_access:
2980
+ target.mark_write(kernel_name=kernel_name, filename=filename, lineno=lineno)
2981
+
2982
+ elif isinstance(node.op, ast.BitAnd):
2983
+ adj.add_builtin_call("atomic_and", [target, *indices, rhs])
2984
+
2985
+ if warp._src.config.verify_autograd_array_access:
2986
+ target.mark_write(kernel_name=kernel_name, filename=filename, lineno=lineno)
2987
+
2988
+ elif isinstance(node.op, ast.BitOr):
2989
+ adj.add_builtin_call("atomic_or", [target, *indices, rhs])
2990
+
2991
+ if warp._src.config.verify_autograd_array_access:
2992
+ target.mark_write(kernel_name=kernel_name, filename=filename, lineno=lineno)
2993
+
2994
+ elif isinstance(node.op, ast.BitXor):
2995
+ adj.add_builtin_call("atomic_xor", [target, *indices, rhs])
2996
+
2997
+ if warp._src.config.verify_autograd_array_access:
2998
+ target.mark_write(kernel_name=kernel_name, filename=filename, lineno=lineno)
2999
+ else:
3000
+ if warp._src.config.verbose:
3001
+ print(f"Warning: in-place op {node.op} is not differentiable")
3002
+ make_new_assign_statement()
3003
+ return
3004
+
3005
+ elif (
3006
+ type_is_vector(target_type)
3007
+ or type_is_quaternion(target_type)
3008
+ or type_is_matrix(target_type)
3009
+ or type_is_transformation(target_type)
3010
+ ):
3011
+ if isinstance(node.op, ast.Add):
3012
+ adj.add_builtin_call("add_inplace", [target, *indices, rhs])
3013
+ elif isinstance(node.op, ast.Sub):
3014
+ adj.add_builtin_call("sub_inplace", [target, *indices, rhs])
3015
+ elif isinstance(node.op, ast.BitAnd):
3016
+ adj.add_builtin_call("bit_and_inplace", [target, *indices, rhs])
3017
+ elif isinstance(node.op, ast.BitOr):
3018
+ adj.add_builtin_call("bit_or_inplace", [target, *indices, rhs])
3019
+ elif isinstance(node.op, ast.BitXor):
3020
+ adj.add_builtin_call("bit_xor_inplace", [target, *indices, rhs])
3021
+ else:
3022
+ if warp._src.config.verbose:
3023
+ print(f"Warning: in-place op {node.op} is not differentiable")
3024
+ make_new_assign_statement()
3025
+ return
3026
+
3027
+ elif is_tile(target.type):
3028
+ if isinstance(node.op, ast.Add):
3029
+ adj.add_builtin_call("tile_add_inplace", [target, *indices, rhs])
3030
+ elif isinstance(node.op, ast.Sub):
3031
+ adj.add_builtin_call("tile_sub_inplace", [target, *indices, rhs])
3032
+ elif isinstance(node.op, ast.BitAnd):
3033
+ adj.add_builtin_call("tile_bit_and_inplace", [target, *indices, rhs])
3034
+ elif isinstance(node.op, ast.BitOr):
3035
+ adj.add_builtin_call("tile_bit_or_inplace", [target, *indices, rhs])
3036
+ elif isinstance(node.op, ast.BitXor):
3037
+ adj.add_builtin_call("tile_bit_xor_inplace", [target, *indices, rhs])
3038
+ else:
3039
+ if warp._src.config.verbose:
3040
+ print(f"Warning: in-place op {node.op} is not differentiable")
3041
+ make_new_assign_statement()
3042
+ return
3043
+
3044
+ else:
3045
+ raise WarpCodegenError("Can only subscript in-place assign array, vector, quaternion, and matrix types")
3046
+
3047
+ elif isinstance(lhs, ast.Name):
3048
+ target = adj.eval(node.target)
3049
+
3050
+ if is_tile(target.type) and is_tile(rhs.type):
3051
+ if isinstance(node.op, ast.Add):
3052
+ adj.add_builtin_call("add_inplace", [target, rhs])
3053
+ elif isinstance(node.op, ast.Sub):
3054
+ adj.add_builtin_call("sub_inplace", [target, rhs])
3055
+ elif isinstance(node.op, ast.BitAnd):
3056
+ adj.add_builtin_call("bit_and_inplace", [target, rhs])
3057
+ elif isinstance(node.op, ast.BitOr):
3058
+ adj.add_builtin_call("bit_or_inplace", [target, rhs])
3059
+ elif isinstance(node.op, ast.BitXor):
3060
+ adj.add_builtin_call("bit_xor_inplace", [target, rhs])
3061
+ else:
3062
+ make_new_assign_statement()
3063
+ return
3064
+ else:
3065
+ make_new_assign_statement()
3066
+ return
3067
+
3068
+ # TODO
3069
+ elif isinstance(lhs, ast.Attribute):
3070
+ make_new_assign_statement()
3071
+ return
3072
+
3073
+ else:
3074
+ make_new_assign_statement()
3075
+ return
3076
+
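+ # Illustrative sketch of the atomic path above (hypothetical user code):
+ #
+ #   @wp.kernel
+ #   def histogram(bins: wp.array(dtype=wp.int32), values: wp.array(dtype=wp.int32)):
+ #       i = wp.tid()
+ #       bins[values[i]] += 1   # lowered to atomic_add, safe under contention
+ #
+ # int8/uint8/int16/uint16 arrays instead fall back to a plain read-modify-write
+ # through make_new_assign_statement(), as handled above.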
3077
+ def emit_Tuple(adj, node):
3078
+ elements = tuple(adj.eval(x) for x in node.elts)
3079
+ return adj.add_builtin_call("tuple", elements)
3080
+
3081
+ def emit_Pass(adj, node):
3082
+ pass
3083
+
3084
+ node_visitors: ClassVar[dict[type[ast.AST], Callable]] = {
3085
+ ast.FunctionDef: emit_FunctionDef,
3086
+ ast.If: emit_If,
3087
+ ast.IfExp: emit_IfExp,
3088
+ ast.Compare: emit_Compare,
3089
+ ast.BoolOp: emit_BoolOp,
3090
+ ast.Name: emit_Name,
3091
+ ast.Attribute: emit_Attribute,
3092
+ ast.Constant: emit_Constant,
3093
+ ast.BinOp: emit_BinOp,
3094
+ ast.UnaryOp: emit_UnaryOp,
3095
+ ast.While: emit_While,
3096
+ ast.For: emit_For,
3097
+ ast.Break: emit_Break,
3098
+ ast.Continue: emit_Continue,
3099
+ ast.Expr: emit_Expr,
3100
+ ast.Call: emit_Call,
3101
+ ast.Index: emit_Index, # Deprecated in 3.9
3102
+ ast.Subscript: emit_Subscript,
3103
+ ast.Slice: emit_Slice,
3104
+ ast.Assign: emit_Assign,
3105
+ ast.Return: emit_Return,
3106
+ ast.AugAssign: emit_AugAssign,
3107
+ ast.Tuple: emit_Tuple,
3108
+ ast.Pass: emit_Pass,
3109
+ ast.Assert: emit_Assert,
3110
+ }
3111
+
3112
+ def eval(adj, node):
3113
+ if hasattr(node, "lineno"):
3114
+ adj.set_lineno(node.lineno - 1)
3115
+
3116
+ try:
3117
+ emit_node = adj.node_visitors[type(node)]
3118
+ except KeyError as e:
3119
+ type_name = type(node).__name__
3120
+ namespace = "ast." if isinstance(node, ast.AST) else ""
3121
+ raise WarpCodegenError(f"Construct `{namespace}{type_name}` not supported in kernels.") from e
3122
+
3123
+ return emit_node(adj, node)
3124
+
3125
+ # helper to evaluate expressions of the form
3126
+ # obj1.obj2.obj3.attr in the function's global scope
3127
+ def resolve_path(adj, path):
3128
+ if len(path) == 0:
3129
+ return None
3130
+
3131
+ # if root is overshadowed by local symbols, bail out
3132
+ if path[0] in adj.symbols:
3133
+ return None
3134
+
3135
+ # look up in closure/global variables
3136
+ expr = adj.resolve_external_reference(path[0])
3137
+
3138
+ # Support Warp types in kernels without the `wp.` module prefix (e.g. v = vec3(0.0, 0.2, 0.4)):
3139
+ if expr is None:
3140
+ expr = getattr(warp, path[0], None)
3141
+
3142
+ # look up in builtins
3143
+ if expr is None:
3144
+ expr = __builtins__.get(path[0])
3145
+
3146
+ if expr is not None:
3147
+ for i in range(1, len(path)):
3148
+ if hasattr(expr, path[i]):
3149
+ expr = getattr(expr, path[i])
3150
+
3151
+ return expr
3152
+
3153
+ # retrieves a dictionary of all closure and global variables and their values
3154
+ # to be used in the evaluation context of wp.static() expressions
3155
+ def get_static_evaluation_context(adj):
3156
+ closure_vars = dict(
3157
+ zip(
3158
+ adj.func.__code__.co_freevars,
3159
+ [c.cell_contents for c in (adj.func.__closure__ or [])],
3160
+ )
3161
+ )
3162
+
3163
+ vars_dict = {}
3164
+ vars_dict.update(adj.func.__globals__)
3165
+ # variables captured in closure have precedence over global vars
3166
+ vars_dict.update(closure_vars)
3167
+
3168
+ return vars_dict
3169
+
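+ # Illustrative sketch of the precedence above (hypothetical user code):
+ #
+ #   K = 2.0                          # global
+ #   def make_kernel():
+ #       K = 3.0                      # closure variable shadows the global
+ #       @wp.kernel
+ #       def k(a: wp.array(dtype=float)):
+ #           a[wp.tid()] = wp.static(K)   # evaluates to 3.0
+ #       return k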
3170
+ def is_static_expression(adj, func):
3171
+ return (
3172
+ isinstance(func, types.FunctionType)
3173
+ and func.__module__ == "warp._src.builtins"
3174
+ and func.__qualname__ == "static"
3175
+ )
3176
+
3177
+ # verify that the return value of a wp.static() expression is supported inside a Warp kernel
3178
+ def verify_static_return_value(adj, value):
3179
+ if value is None:
3180
+ raise ValueError("None is returned")
3181
+ if warp._src.types.is_value(value):
3182
+ return True
3183
+ if warp._src.types.is_array(value):
3184
+ # more useful explanation for the common case of creating a Warp array
3185
+ raise ValueError("a Warp array cannot be created inside Warp kernels")
3186
+ if isinstance(value, str):
3187
+ # we want to support cases such as `print(wp.static("test"))`
3188
+ return True
3189
+ if isinstance(value, warp._src.context.Function):
3190
+ return True
3191
+
3192
+ def verify_struct(s: StructInstance, attr_path: list[str]):
3193
+ for key in s._cls.vars.keys():
3194
+ v = getattr(s, key)
3195
+ if issubclass(type(v), StructInstance):
3196
+ verify_struct(v, [*attr_path, key])
3197
+ else:
3198
+ try:
3199
+ adj.verify_static_return_value(v)
3200
+ except ValueError as e:
3201
+ raise ValueError(
3202
+ f"the returned Warp struct contains a data type that cannot be constructed inside Warp kernels: {e} at {value._cls.key}.{'.'.join(attr_path)}"
3203
+ ) from e
3204
+
3205
+ if issubclass(type(value), StructInstance):
3206
+ return verify_struct(value, [])
3207
+
3208
+ raise ValueError(f"value of type {type(value)} cannot be constructed inside Warp kernels")
3209
+
3210
+ # find the source code string of an AST node
3211
+ @staticmethod
3212
+ def extract_node_source_from_lines(source_lines, node) -> str | None:
3213
+ if not hasattr(node, "lineno") or not hasattr(node, "col_offset"):
3214
+ return None
3215
+
3216
+ start_line = node.lineno - 1 # line numbers start at 1
3217
+ start_col = node.col_offset
3218
+
3219
+ if hasattr(node, "end_lineno") and hasattr(node, "end_col_offset"):
3220
+ end_line = node.end_lineno - 1
3221
+ end_col = node.end_col_offset
3222
+ else:
3223
+ # fallback for Python versions before 3.8
3224
+ # we have to find the end line and column manually
3225
+ end_line = start_line
3226
+ end_col = start_col
3227
+ parenthesis_count = 1
3228
+ for lineno in range(start_line, len(source_lines)):
3229
+ if lineno == start_line:
3230
+ c_start = start_col
3231
+ else:
3232
+ c_start = 0
3233
+ line = source_lines[lineno]
3234
+ for i in range(c_start, len(line)):
3235
+ c = line[i]
3236
+ if c == "(":
3237
+ parenthesis_count += 1
3238
+ elif c == ")":
3239
+ parenthesis_count -= 1
3240
+ if parenthesis_count == 0:
3241
+ end_col = i
3242
+ end_line = lineno
3243
+ break
3244
+ if parenthesis_count == 0:
3245
+ break
3246
+
3247
+ if start_line == end_line:
3248
+ # single-line expression
3249
+ return source_lines[start_line][start_col:end_col]
3250
+ else:
3251
+ # multi-line expression
3252
+ lines = []
3253
+ # first line (from start_col to the end)
3254
+ lines.append(source_lines[start_line][start_col:])
3255
+ # middle lines (entire lines)
3256
+ lines.extend(source_lines[start_line + 1 : end_line])
3257
+ # last line (from the start to end_col)
3258
+ lines.append(source_lines[end_line][:end_col])
3259
+ return "".join(lines).strip()
3260
+
3261
+ @staticmethod
3262
+ def extract_lambda_source(func, only_body=False) -> str | None:
3263
+ try:
3264
+ source_lines = inspect.getsourcelines(func)[0]
3265
+ source_lines[0] = source_lines[0][source_lines[0].index("lambda") :]
3266
+ except OSError as e:
3267
+ raise WarpCodegenError(
3268
+ "Could not access lambda function source code. Please use a named function instead."
3269
+ ) from e
3270
+ source = "".join(source_lines)
3271
+ source = source[source.index("lambda") :].rstrip()
3272
+ # Remove trailing unbalanced parentheses
3273
+ while source.count("(") < source.count(")"):
3274
+ source = source[:-1]
3275
+ # extract lambda expression up until a comma, e.g. in the case of
3276
+ # "map(lambda a: (a + 2.0, a + 3.0), a, return_kernel=True)"
3277
+ si = max(source.rfind(")"), source.find(":"))
3278
+ ci = source.find(",", si)
3279
+ if ci != -1:
3280
+ source = source[:ci]
3281
+ tree = ast.parse(source)
3282
+ lambda_source = None
3283
+ for node in ast.walk(tree):
3284
+ if isinstance(node, ast.Lambda):
3285
+ if only_body:
3286
+ # extract the body of the lambda function
3287
+ lambda_source = Adjoint.extract_node_source_from_lines(source_lines, node.body)
3288
+ else:
3289
+ # extract the entire lambda function
3290
+ lambda_source = Adjoint.extract_node_source_from_lines(source_lines, node)
3291
+ break
3292
+ return lambda_source
3293
+
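+ # Illustrative behavior: given the source
+ #   map(lambda a: (a + 2.0, a + 3.0), a, return_kernel=True)
+ # extract_lambda_source returns "lambda a: (a + 2.0, a + 3.0)", or just
+ # "(a + 2.0, a + 3.0)" when only_body=True.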
3294
+ def extract_node_source(adj, node) -> str | None:
3295
+ return adj.extract_node_source_from_lines(adj.source_lines, node)
3296
+
3297
+ # handles a wp.static() expression and returns the resulting object and a string representing the code
3298
+ # of the static expression
3299
+ def evaluate_static_expression(adj, node) -> tuple[Any, str]:
3300
+ if len(node.args) == 1:
3301
+ static_code = adj.extract_node_source(node.args[0])
3302
+ elif len(node.keywords) == 1:
3303
+ static_code = adj.extract_node_source(node.keywords[0])
3304
+ else:
3305
+ raise WarpCodegenError("warp.static() requires a single argument or keyword")
3306
+ if static_code is None:
3307
+ raise WarpCodegenError("Error extracting source code from wp.static() expression")
3308
+
3309
+ # Since this is an expression, it can safely be collapsed onto a single line.
3310
+ static_code = static_code.replace("\n", "")
3311
+ code_to_eval = static_code # code to be evaluated
3312
+
3313
+ vars_dict = adj.get_static_evaluation_context()
3314
+ # add constant variables to the static call context
3315
+ constant_vars = {k: v.constant for k, v in adj.symbols.items() if isinstance(v, Var) and v.constant is not None}
3316
+ vars_dict.update(constant_vars)
3317
+
3318
+ # Replace all constant `len()` expressions with their value.
3319
+ if "len" in static_code:
3320
+ len_expr_ctx = vars_dict.copy()
3321
+ constant_types = {k: v.type for k, v in adj.symbols.items() if isinstance(v, Var) and v.type is not None}
3322
+ len_expr_ctx.update(constant_types)
3323
+ len_expr_ctx.update({"len": warp._src.types.type_length})
3324
+
3325
+ # We want to replace the expression code in-place,
3326
+ # so reparse it to get the correct column info.
3327
+ len_value_locs: list[tuple[int, int, int]] = []
3328
+ expr_tree = ast.parse(static_code)
3329
+ assert len(expr_tree.body) == 1 and isinstance(expr_tree.body[0], ast.Expr)
3330
+ expr_root = expr_tree.body[0].value
3331
+ for expr_node in ast.walk(expr_root):
3332
+ if (
3333
+ isinstance(expr_node, ast.Call)
3334
+ and getattr(expr_node.func, "id", None) == "len"
3335
+ and len(expr_node.args) == 1
3336
+ ):
3337
+ len_expr = static_code[expr_node.col_offset : expr_node.end_col_offset]
3338
+ try:
3339
+ len_value = eval(len_expr, len_expr_ctx)
3340
+ except Exception:
3341
+ pass
3342
+ else:
3343
+ len_value_locs.append((len_value, expr_node.col_offset, expr_node.end_col_offset))
3344
+
3345
+ if len_value_locs:
3346
+ new_static_code = ""
3347
+ loc = 0
3348
+ for value, start, end in len_value_locs:
3349
+ new_static_code += f"{static_code[loc:start]}{value}"
3350
+ loc = end
3351
+
3352
+ new_static_code += static_code[len_value_locs[-1][2] :]
3353
+ code_to_eval = new_static_code
3354
+
3355
+ try:
3356
+ value = eval(code_to_eval, vars_dict)
3357
+ if isinstance(value, (enum.IntEnum, enum.IntFlag)):
3358
+ value = int(value)
3359
+ if warp._src.config.verbose:
3360
+ print(f"Evaluated static command: {static_code} = {value}")
3361
+ except NameError as e:
3362
+ raise WarpCodegenError(
3363
+ f"Error evaluating static expression: {e}. Make sure all variables used in the static expression are constant."
3364
+ ) from e
3365
+ except Exception as e:
3366
+ raise WarpCodegenError(
3367
+ f"Error evaluating static expression: {e} while evaluating the following code generated from the static expression:\n{static_code}"
3368
+ ) from e
3369
+
3370
+ try:
3371
+ adj.verify_static_return_value(value)
3372
+ except ValueError as e:
3373
+ raise WarpCodegenError(
3374
+ f"Static expression returns an unsupported value: {e} while evaluating the following code generated from the static expression:\n{static_code}"
3375
+ ) from e
3376
+
3377
+ return value, static_code
3378
+
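+ # Minimal usage sketch of the evaluation above (hypothetical user code):
+ #
+ #   SCALE = 2.0   # global, known at module build time
+ #
+ #   @wp.kernel
+ #   def k(a: wp.array(dtype=float)):
+ #       i = wp.tid()
+ #       a[i] = a[i] * wp.static(SCALE * 3.0)   # folded to the constant 6.0
+ #
+ # Constant len() calls are folded first, so with v of type wp.vec3 in scope,
+ # wp.static(len(v) - 1) is rewritten to wp.static(3 - 1) before eval().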
3379
+ # try to replace wp.static() expressions by their evaluated value if the
3380
+ # expression can be evaluated
3381
+ def replace_static_expressions(adj):
3382
+ class StaticExpressionReplacer(ast.NodeTransformer):
3383
+ def visit_Call(self, node):
3384
+ func, _ = adj.resolve_static_expression(node.func, eval_types=False)
3385
+ if adj.is_static_expression(func):
3386
+ try:
3387
+ # the static expression will execute as long as it is valid and
3388
+ # only depends on global or captured variables
3389
+ obj, code = adj.evaluate_static_expression(node)
3390
+ if code is not None:
3391
+ adj.static_expressions[code] = obj
3392
+ if isinstance(obj, warp._src.context.Function):
3393
+ name_node = ast.Name("__warp_func__")
3394
+ # we add a pointer to the Warp function here so that we can refer to it later at
3395
+ # codegen time (note that the function key itself is not sufficient to uniquely
3396
+ # identify the function, as the function may be redefined between the current time
3397
+ # of wp.static() declaration and the time of codegen during module building)
3398
+ name_node.warp_func = obj
3399
+ return ast.copy_location(name_node, node)
3400
+ else:
3401
+ return ast.copy_location(ast.Constant(value=obj), node)
3402
+ except Exception:
3403
+ # Ignoring failing static expressions should generally not be an issue because only
3404
+ # one of these cases should be possible:
3405
+ # 1) the static expression itself is invalid code, in which case the module cannot be
3406
+ # built at all,
3407
+ # 2) the static expression contains a reference to a local (even if constant) variable
3408
+ # (and is therefore not executable and raises this exception), in which
3409
+ # case changing the constant, or the code affecting this constant, would lead to
3410
+ # a different module hash anyway.
3411
+ # In any case, we mark this Adjoint to have unresolvable static expressions.
3412
+ # This will trigger a code generation step even if the module hash is unchanged.
3413
+ adj.has_unresolved_static_expressions = True
3414
+ pass
3415
+
3416
+ return self.generic_visit(node)
3417
+
3418
+ adj.tree = StaticExpressionReplacer().visit(adj.tree)
3419
+
3420
+ # Evaluates a static expression that does not depend on runtime values
3421
+ # if eval_types is True, try resolving the path using evaluated type information as well
3422
+ def resolve_static_expression(adj, root_node, eval_types=True):
3423
+ attributes = []
3424
+
3425
+ node = root_node
3426
+ while isinstance(node, ast.Attribute):
3427
+ attributes.append(node.attr)
3428
+ node = node.value
3429
+
3430
+ if eval_types and isinstance(node, ast.Call) and isinstance(node.func, ast.Name):
3431
+ # support for operators returning modules
3432
+ # i.e. operator_name(*operator_args).x.y.z
3433
+ operator_args = node.args
3434
+ operator_name = node.func.id
3435
+
3436
+ if operator_name == "type":
3437
+ if len(operator_args) != 1:
3438
+ raise WarpCodegenError(f"type() operator expects exactly one argument, got {len(operator_args)}")
3439
+
3440
+ # type() operator
3441
+ var = adj.eval(operator_args[0])
3442
+
3443
+ if isinstance(var, Var):
3444
+ var_type = strip_reference(var.type)
3445
+ # Allow accessing type attributes, for instance array.dtype
3446
+ while attributes:
3447
+ attr_name = attributes.pop()
3448
+ var_type, prev_type = adj.resolve_type_attribute(var_type, attr_name), var_type
3449
+
3450
+ if var_type is None:
3451
+ raise WarpCodegenAttributeError(
3452
+ f"{attr_name} is not an attribute of {type_repr(prev_type)}"
3453
+ )
3454
+
3455
+ return var_type, [str(var_type)]
3456
+ else:
3457
+ raise WarpCodegenError(f"Cannot deduce the type of {var}")
3458
+
3459
+ # reverse list since ast presents it in backward order
3460
+ path = [*reversed(attributes)]
3461
+ if isinstance(node, ast.Name):
3462
+ path.insert(0, node.id)
3463
+
3464
+ # Try resolving path from captured context
3465
+ captured_obj = adj.resolve_path(path)
3466
+ if captured_obj is not None:
3467
+ return captured_obj, path
3468
+
3469
+ return None, path
3470
+
3471
+ def resolve_external_reference(adj, name: str):
3472
+ try:
3473
+ # look up in closure variables
3474
+ idx = adj.func.__code__.co_freevars.index(name)
3475
+ obj = adj.func.__closure__[idx].cell_contents
3476
+ except ValueError:
3477
+ # look up in global variables
3478
+ obj = adj.func.__globals__.get(name)
3479
+ return obj
3480
+
3481
+ # annotate generated code with the original source code line
3482
+ def set_lineno(adj, lineno):
3483
+ if adj.lineno is None or adj.lineno != lineno:
3484
+ line = lineno + adj.fun_lineno
3485
+ source = adj.source_lines[lineno].strip().ljust(80 - len(adj.indentation), " ")
3486
+ adj.add_forward(f"// {source} <L {line}>")
3487
+ adj.add_reverse(f"// adj: {source} <L {line}>")
3488
+ adj.lineno = lineno
3489
+
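+ # Illustrative output: for Python source line 42 containing `x = a + b`,
+ # the forward and reverse streams are annotated with
+ #
+ #   // x = a + b <L 42>
+ #   // adj: x = a + b <L 42>
+ #
+ # (the source text is padded to a fixed width in the actual output).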
3490
+ def get_node_source(adj, node):
3491
+ # return the Python code corresponding to the given AST node
3492
+ return ast.get_source_segment(adj.source, node)
3493
+
3494
+ def get_references(adj) -> tuple[dict[str, Any], dict[Any, Any], dict[warp._src.context.Function, Any]]:
3495
+ """Traverses ``adj.tree`` and returns referenced constants, types, and user-defined functions."""
3496
+
3497
+ local_variables = set() # Track local variables appearing on the LHS so we know when variables are shadowed
3498
+
3499
+ constants: dict[str, Any] = {}
3500
+ types: dict[Struct | type, Any] = {}
3501
+ functions: dict[warp._src.context.Function, Any] = {}
3502
+
3503
+ for node in ast.walk(adj.tree):
3504
+ if isinstance(node, ast.Name) and node.id not in local_variables:
3505
+ # look up in closure/global variables
3506
+ obj = adj.resolve_external_reference(node.id)
3507
+ if warp._src.types.is_value(obj):
3508
+ constants[node.id] = obj
3509
+
3510
+ elif isinstance(node, ast.Attribute):
3511
+ obj, path = adj.resolve_static_expression(node, eval_types=False)
3512
+ if warp._src.types.is_value(obj):
3513
+ constants[".".join(path)] = obj
3514
+
3515
+ elif isinstance(node, ast.Call):
3516
+ func, _ = adj.resolve_static_expression(node.func, eval_types=False)
3517
+ if isinstance(func, warp._src.context.Function) and not func.is_builtin():
3518
+ # calling user-defined function
3519
+ functions[func] = None
3520
+ elif isinstance(func, Struct):
3521
+ # calling struct constructor
3522
+ types[func] = None
3523
+ elif isinstance(func, type) and warp._src.types.type_is_value(func):
3524
+ # calling value type constructor
3525
+ types[func] = None
3526
+
3527
+ elif isinstance(node, ast.Assign):
3528
+ # Add the LHS names to the local_variables so we know any subsequent uses are shadowed
3529
+ lhs = node.targets[0]
3530
+ if isinstance(lhs, ast.Tuple):
3531
+ for v in lhs.elts:
3532
+ if isinstance(v, ast.Name):
3533
+ local_variables.add(v.id)
3534
+ elif isinstance(lhs, ast.Name):
3535
+ local_variables.add(lhs.id)
3536
+
3537
+ return constants, types, functions
3538
+
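+ # Illustrative sketch (hypothetical user code): for
+ #
+ #   GRAVITY = -9.8
+ #
+ #   @wp.kernel
+ #   def k(a: wp.array(dtype=float)):
+ #       a[wp.tid()] = GRAVITY
+ #
+ # get_references() reports GRAVITY under constants; an assignment to GRAVITY
+ # inside the kernel would mark it as a local and suppress the entry.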
3539
+
3540
+ # ----------------
3541
+ # code generation
3542
+
3543
+ cpu_module_header = """
3544
+ #define WP_TILE_BLOCK_DIM {block_dim}
3545
+ #define WP_NO_CRT
3546
+ #include "builtin.h"
3547
+
3548
+ // avoid namespacing the float type when casting to float, since wp::float(x) is not valid in C++
3549
+ #define float(x) cast_float(x)
3550
+ #define adj_float(x, adj_x, adj_ret) adj_cast_float(x, adj_x, adj_ret)
3551
+
3552
+ #define int(x) cast_int(x)
3553
+ #define adj_int(x, adj_x, adj_ret) adj_cast_int(x, adj_x, adj_ret)
3554
+
3555
+ #define builtin_tid1d() wp::tid(task_index, dim)
3556
+ #define builtin_tid2d(x, y) wp::tid(x, y, task_index, dim)
3557
+ #define builtin_tid3d(x, y, z) wp::tid(x, y, z, task_index, dim)
3558
+ #define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, task_index, dim)
3559
+
3560
+ #define builtin_block_dim() wp::block_dim()
3561
+
3562
+ """
3563
+
3564
+ cuda_module_header = """
3565
+ #define WP_TILE_BLOCK_DIM {block_dim}
3566
+ #define WP_NO_CRT
3567
+ #include "builtin.h"
3568
+
3569
+ // Map wp.breakpoint() to a device brkpt at the call site so cuda-gdb attributes the stop to the generated .cu line
3570
+ #if defined(__CUDACC__) && !defined(_MSC_VER)
3571
+ #define __debugbreak() __brkpt()
3572
+ #endif
3573
+
3574
+ // avoid namespacing the float type when casting to float, since wp::float(x) is not valid in C++
3575
+ #define float(x) cast_float(x)
3576
+ #define adj_float(x, adj_x, adj_ret) adj_cast_float(x, adj_x, adj_ret)
3577
+
3578
+ #define int(x) cast_int(x)
3579
+ #define adj_int(x, adj_x, adj_ret) adj_cast_int(x, adj_x, adj_ret)
3580
+
3581
+ #define builtin_tid1d() wp::tid(_idx, dim)
3582
+ #define builtin_tid2d(x, y) wp::tid(x, y, _idx, dim)
3583
+ #define builtin_tid3d(x, y, z) wp::tid(x, y, z, _idx, dim)
3584
+ #define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, _idx, dim)
3585
+
3586
+ #define builtin_block_dim() wp::block_dim()
3587
+
3588
+ """
3589
+
3590
+ struct_template = """
3591
+ struct {name}
3592
+ {{
3593
+ {struct_body}
3594
+
3595
+ {defaulted_constructor_def}
3596
+ CUDA_CALLABLE {name}({forward_args})
3597
+ {forward_initializers}
3598
+ {{
3599
+ }}
3600
+
3601
+ CUDA_CALLABLE {name}& operator += (const {name}& rhs)
3602
+ {{{prefix_add_body}
3603
+ return *this;}}
3604
+
3605
+ }};
3606
+
3607
+ static CUDA_CALLABLE void adj_{name}({reverse_args})
3608
+ {{
3609
+ {reverse_body}}}
3610
+
3611
+ // Required when compiling adjoints.
3612
+ CUDA_CALLABLE {name} add(const {name}& a, const {name}& b)
3613
+ {{
3614
+ return {name}();
3615
+ }}
3616
+
3617
+ CUDA_CALLABLE void adj_atomic_add({name}* p, {name} t)
3618
+ {{
3619
+ {atomic_add_body}}}
3620
+
3621
+
3622
+ """
3623
+
3624
+ cpu_forward_function_template = """
3625
+ // {filename}:{lineno}
3626
+ static {return_type} {name}(
3627
+ {forward_args})
3628
+ {{
3629
+ {forward_body}}}
3630
+
3631
+ """
3632
+
3633
+ cpu_reverse_function_template = """
3634
+ // {filename}:{lineno}
3635
+ static void adj_{name}(
3636
+ {reverse_args})
3637
+ {{
3638
+ {reverse_body}}}
3639
+
3640
+ """
3641
+
3642
+ cuda_forward_function_template = """
3643
+ // {filename}:{lineno}
3644
+ {line_directive}static CUDA_CALLABLE {return_type} {name}(
3645
+ {forward_args})
3646
+ {{
3647
+ {forward_body}{line_directive}}}
3648
+
3649
+ """
3650
+
3651
+ cuda_reverse_function_template = """
3652
+ // {filename}:{lineno}
3653
+ {line_directive}static CUDA_CALLABLE void adj_{name}(
3654
+ {reverse_args})
3655
+ {{
3656
+ {reverse_body}{line_directive}}}
3657
+
3658
+ """
3659
+
3660
+ cuda_kernel_template_forward = """
3661
+
3662
+ {line_directive}extern "C" __global__ void {name}_cuda_kernel_forward(
3663
+ {forward_args})
3664
+ {{
3665
+ {line_directive} wp::tile_shared_storage_t tile_mem;
3666
+
3667
+ {line_directive} for (size_t _idx = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
3668
+ {line_directive} _idx < dim.size;
3669
+ {line_directive} _idx += static_cast<size_t>(blockDim.x) * static_cast<size_t>(gridDim.x))
3670
+ {{
3671
+ // reset shared memory allocator
3672
+ {line_directive} wp::tile_shared_storage_t::init();
3673
+
3674
+ {forward_body}{line_directive} }}
3675
+ {line_directive}}}
3676
+
3677
+ """
3678
+
3679
+ cuda_kernel_template_backward = """
3680
+
3681
+ {line_directive}extern "C" __global__ void {name}_cuda_kernel_backward(
3682
+ {reverse_args})
3683
+ {{
3684
+ {line_directive} wp::tile_shared_storage_t tile_mem;
3685
+
3686
+ {line_directive} for (size_t _idx = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
3687
+ {line_directive} _idx < dim.size;
3688
+ {line_directive} _idx += static_cast<size_t>(blockDim.x) * static_cast<size_t>(gridDim.x))
3689
+ {{
3690
+ // reset shared memory allocator
3691
+ {line_directive} wp::tile_shared_storage_t::init();
3692
+
3693
+ {reverse_body}{line_directive} }}
3694
+ {line_directive}}}
3695
+
3696
+ """
3697
+
3698
+ cpu_kernel_template_forward = """
3699
+
3700
+ void {name}_cpu_kernel_forward(
3701
+ {forward_args},
3702
+ wp_args_{name} *_wp_args)
3703
+ {{
3704
+ {forward_body}}}
3705
+
3706
+ """
3707
+
3708
+ cpu_kernel_template_backward = """
3709
+
3710
+ void {name}_cpu_kernel_backward(
3711
+ {reverse_args},
3712
+ wp_args_{name} *_wp_args,
3713
+ wp_args_{name} *_wp_adj_args)
3714
+ {{
3715
+ {reverse_body}}}
3716
+
3717
+ """
3718
+
3719
+ cpu_module_template_forward = """
3720
+
3721
+ extern "C" {{
3722
+
3723
+ // Python CPU entry points
3724
+ WP_API void {name}_cpu_forward(
3725
+ wp::launch_bounds_t dim,
3726
+ wp_args_{name} *_wp_args)
3727
+ {{
3728
+ wp::tile_shared_storage_t tile_mem;
3729
+ #if defined(WP_ENABLE_TILES_IN_STACK_MEMORY)
3730
+ wp::shared_tile_storage = &tile_mem;
3731
+ #endif
3732
+
3733
+ for (size_t task_index = 0; task_index < dim.size; ++task_index)
3734
+ {{
3735
+ {name}_cpu_kernel_forward(dim, task_index, _wp_args);
3736
+ }}
3737
+ }}
3738
+
3739
+ }} // extern C
3740
+
3741
+ """
3742
+
3743
+ cpu_module_template_backward = """
3744
+
3745
+ extern "C" {{
3746
+
3747
+ WP_API void {name}_cpu_backward(
3748
+ wp::launch_bounds_t dim,
3749
+ wp_args_{name} *_wp_args,
3750
+ wp_args_{name} *_wp_adj_args)
3751
+ {{
3752
+ wp::tile_shared_storage_t tile_mem;
3753
+ #if defined(WP_ENABLE_TILES_IN_STACK_MEMORY)
3754
+ wp::shared_tile_storage = &tile_mem;
3755
+ #endif
3756
+
3757
+ for (size_t task_index = 0; task_index < dim.size; ++task_index)
3758
+ {{
3759
+ {name}_cpu_kernel_backward(dim, task_index, _wp_args, _wp_adj_args);
3760
+ }}
3761
+ }}
3762
+
3763
+ }} // extern C
3764
+
3765
+ """
3766
+
3767
+
3768
+ # converts a constant Python value to equivalent C-repr
3769
+ def constant_str(value):
3770
+ value_type = type(value)
3771
+
3772
+ if value_type == bool or value_type == builtins.bool:
3773
+ if value:
3774
+ return "true"
3775
+ else:
3776
+ return "false"
3777
+
3778
+ elif value_type == str:
3779
+ # ensure constant strings are correctly escaped
3780
+ return '"' + str(value.encode("unicode-escape").decode()) + '"'
3781
+
3782
+ elif isinstance(value, ctypes.Array):
3783
+ if value_type._wp_scalar_type_ == float16:
3784
+ # special case for float16, which is stored as uint16 in the ctypes.Array
3785
+ from warp._src.context import runtime
3786
+
3787
+ scalar_value = runtime.core.wp_half_bits_to_float
3788
+ else:
3789
+
3790
+ def scalar_value(x):
3791
+ return x
3792
+
3793
+ # list of scalar initializer values
3794
+ initlist = []
3795
+ for i in range(value._length_):
3796
+ x = ctypes.Array.__getitem__(value, i)
3797
+ initlist.append(str(scalar_value(x)).lower())
3798
+
3799
+ if value._wp_scalar_type_ is bool:
3800
+ dtypestr = f"wp::initializer_array<{value._length_},{value._wp_scalar_type_.__name__}>"
3801
+ else:
3802
+ dtypestr = f"wp::initializer_array<{value._length_},wp::{value._wp_scalar_type_.__name__}>"
3803
+
3804
+ # construct value from initializer array, e.g. wp::initializer_array<4,wp::float32>{1.0, 2.0, 3.0, 4.0}
3805
+ return f"{dtypestr}{{{', '.join(initlist)}}}"
3806
+
3807
+ elif value_type in warp._src.types.scalar_types:
3808
+ # make sure we emit the value of objects, e.g. uint32
3809
+ return str(value.value)
3810
+
3811
+ elif issubclass(value_type, StructInstance):
3812
+ # constant struct instance
3813
+ arg_strs = []
3814
+ for key, var in value._cls.vars.items():
3815
+ attr = getattr(value, key)
3816
+ arg_strs.append(f"{Var.type_to_ctype(var.type)}({constant_str(attr)})")
3817
+ arg_str = ", ".join(arg_strs)
3818
+ return f"{value.native_name}({arg_str})"
3819
+
3820
+ elif value == math.inf:
3821
+ return "INFINITY"
3822
+
3823
+ elif math.isnan(value):
3824
+ return "NAN"
3825
+
3826
+ else:
3827
+ # otherwise just convert constant to string
3828
+ return str(value)
3829
+
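+ # Illustrative mappings produced by constant_str():
+ #
+ #   constant_str(True)                    -> "true"
+ #   constant_str("a\nb")                  -> '"a\\nb"'
+ #   constant_str(warp.uint32(7))          -> "7"          (scalar types emit .value)
+ #   constant_str(math.inf)                -> "INFINITY"
+ #   constant_str(warp.vec3(1.0, 2.0, 3.0))
+ #       -> "wp::initializer_array<3,wp::float32>{1.0, 2.0, 3.0}"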
3830
+
3831
+ def indent(args, stops=1):
3832
+ sep = ",\n"
3833
+ for _i in range(stops):
3834
+ sep += " "
3835
+
3836
+ # return sep + args.replace(", ", "," + sep)
3837
+ return sep.join(args)
3838
+
3839
+
3840
+ # generates a C function name based on the Python function name
3841
+ def make_full_qualified_name(func: Union[str, Callable]) -> str:
3842
+ if not isinstance(func, str):
3843
+ func = func.__qualname__
3844
+ return re.sub("[^0-9a-zA-Z_]+", "", func.replace(".", "__"))
3845
+
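+ # Illustrative mangling:
+ #
+ #   make_full_qualified_name("MyClass.my_kernel")  ->  "MyClass__my_kernel"
+ #   make_full_qualified_name("outer.<locals>.f")   ->  "outer__locals__f"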
3846
+
3847
+ def codegen_struct(struct, device="cpu", indent_size=4):
3848
+ name = struct.native_name
3849
+
3850
+ body = []
3851
+ indent_block = " " * indent_size
3852
+
3853
+ if len(struct.vars) > 0:
3854
+ for label, var in struct.vars.items():
3855
+ body.append(var.ctype() + " " + label + ";\n")
3856
+ else:
3857
+ # for empty structs, emit a dummy attribute to avoid any compiler-specific alignment issues
3858
+ body.append("char _dummy_;\n")
3859
+
3860
+ forward_args = []
3861
+ reverse_args = []
3862
+
3863
+ forward_initializers = []
3864
+ reverse_body = []
3865
+ atomic_add_body = []
3866
+ prefix_add_body = []
3867
+
3868
+ # forward args
3869
+ for label, var in struct.vars.items():
3870
+ var_ctype = var.ctype()
3871
+ default_arg_def = " = {}" if forward_args else ""
3872
+ forward_args.append(f"{var_ctype} const& {label}{default_arg_def}")
3873
+ reverse_args.append(f"{var_ctype} const&")
3874
+
3875
+ namespace = "wp::" if var_ctype.startswith("wp::") or var_ctype == "bool" else ""
3876
+ atomic_add_body.append(f"{indent_block}{namespace}adj_atomic_add(&p->{label}, t.{label});\n")
3877
+
3878
+ prefix = f"{indent_block}," if forward_initializers else ":"
3879
+ forward_initializers.append(f"{indent_block}{prefix} {label}{{{label}}}\n")
3880
+
3881
+ # prefix-add operator
3882
+ for label, var in struct.vars.items():
3883
+ if not is_array(var.type):
3884
+ prefix_add_body.append(f"{indent_block}{label} += rhs.{label};\n")
3885
+
3886
+ # reverse args
3887
+ for label, var in struct.vars.items():
3888
+ reverse_args.append(var.ctype() + " & adj_" + label)
3889
+ if is_array(var.type):
3890
+ reverse_body.append(f"{indent_block}adj_{label} = adj_ret.{label};\n")
3891
+ else:
3892
+ reverse_body.append(f"{indent_block}adj_{label} += adj_ret.{label};\n")
3893
+
3894
+ reverse_args.append(name + " & adj_ret")
3895
+
3896
+ # emit an explicitly defaulted default constructor when the generated constructor takes arguments
3897
+ defaulted_constructor_def = f"{name}() = default;" if forward_args else ""
3898
+
3899
+ return struct_template.format(
3900
+ name=name,
3901
+ struct_body="".join([indent_block + l for l in body]),
3902
+ forward_args=indent(forward_args),
3903
+ forward_initializers="".join(forward_initializers),
3904
+ reverse_args=indent(reverse_args),
3905
+ reverse_body="".join(reverse_body),
3906
+ prefix_add_body="".join(prefix_add_body),
3907
+ atomic_add_body="".join(atomic_add_body),
3908
+ defaulted_constructor_def=defaulted_constructor_def,
3909
+ )
3910
+
3911
+
3912
+ def codegen_func_forward(adj, func_type="kernel", device="cpu"):
3913
+ if device == "cpu":
3914
+ indent = 4
3915
+ elif device == "cuda":
3916
+ if func_type == "kernel":
3917
+ indent = 8
3918
+ else:
3919
+ indent = 4
3920
+ else:
3921
+ raise ValueError(f"Device {device} not supported for codegen")
3922
+
3923
+ indent_block = " " * indent
3924
+
3925
+ lines = []
3926
+
3927
+ # argument vars
3928
+ if device == "cpu" and func_type == "kernel":
3929
+ lines += ["//---------\n"]
3930
+ lines += ["// argument vars\n"]
3931
+
3932
+ for var in adj.args:
3933
+ lines += [f"{var.ctype()} {var.emit()} = _wp_args->{var.label};\n"]
3934
+
3935
+ # primal vars
3936
+ lines += ["//---------\n"]
3937
+ lines += ["// primal vars\n"]
3938
+
3939
+ for var in adj.variables:
3940
+ if is_tile(var.type):
3941
+ lines += [f"{var.ctype()} {var.emit()} = {var.type.cinit(requires_grad=False)};\n"]
3942
+ elif var.constant is None:
3943
+ lines += [f"{var.ctype()} {var.emit()};\n"]
3944
+ else:
3945
+ lines += [f"const {var.ctype()} {var.emit()} = {constant_str(var.constant)};\n"]
3946
+
3947
+ if line_directive := adj.get_line_directive(lines[-1], var.relative_lineno):
3948
+ lines.insert(-1, f"{line_directive}\n")
3949
+
3950
+ # forward pass
3951
+ lines += ["//---------\n"]
3952
+ lines += ["// forward\n"]
3953
+
3954
+ for f in adj.blocks[0].body_forward:
3955
+ if func_type == "kernel" and device == "cuda" and f.lstrip().startswith("return;"):
3956
+ # Use of grid-stride loops in CUDA kernels requires that we convert return; to continue;
3957
+ lines += [f.replace("return;", "continue;") + "\n"]
3958
+ else:
3959
+ lines += [f + "\n"]
3960
+
3961
+ return "".join(l.lstrip() if l.lstrip().startswith("#line") else indent_block + l for l in lines)
3962
+
3963
+
3964
+ def codegen_func_reverse(adj, func_type="kernel", device="cpu"):
3965
+ if device == "cpu":
3966
+ indent = 4
3967
+ elif device == "cuda":
3968
+ if func_type == "kernel":
3969
+ indent = 8
3970
+ else:
3971
+ indent = 4
3972
+ else:
3973
+ raise ValueError(f"Device {device} not supported for codegen")
3974
+
3975
+ indent_block = " " * indent
3976
+
3977
+ lines = []
3978
+
3979
+ # argument vars
3980
+ if device == "cpu" and func_type == "kernel":
3981
+ lines += ["//---------\n"]
3982
+ lines += ["// argument vars\n"]
3983
+
3984
+ for var in adj.args:
3985
+ lines += [f"{var.ctype()} {var.emit()} = _wp_args->{var.label};\n"]
3986
+
3987
+ for var in adj.args:
3988
+ lines += [f"{var.ctype()} {var.emit_adj()} = _wp_adj_args->{var.label};\n"]
3989
+
3990
+ # primal vars
3991
+ lines += ["//---------\n"]
3992
+ lines += ["// primal vars\n"]
3993
+
3994
+ for var in adj.variables:
3995
+ if is_tile(var.type):
3996
+ lines += [f"{var.ctype()} {var.emit()} = {var.type.cinit(requires_grad=True)};\n"]
3997
+ elif var.constant is None:
3998
+ lines += [f"{var.ctype()} {var.emit()};\n"]
3999
+ else:
4000
+ lines += [f"const {var.ctype()} {var.emit()} = {constant_str(var.constant)};\n"]
4001
+
4002
+ if line_directive := adj.get_line_directive(lines[-1], var.relative_lineno):
4003
+ lines.insert(-1, f"{line_directive}\n")
4004
+
4005
+ # dual vars
4006
+ lines += ["//---------\n"]
4007
+ lines += ["// dual vars\n"]
4008
+
4009
+ for var in adj.variables:
4010
+ name = var.emit_adj()
4011
+ ctype = var.ctype(value_type=True)
4012
+
4013
+ if is_tile(var.type):
4014
+ if var.type.storage == "register":
4015
+ lines += [
4016
+ f"{var.type.ctype()} {name}(0.0);\n"
4017
+ ] # reverse mode tiles alias the forward vars since shared tiles store both primal/dual vars together
4018
+ elif var.type.storage == "shared":
4019
+ lines += [
4020
+ f"{var.type.ctype()}& {name} = {var.emit()};\n"
4021
+ ] # reverse mode tiles alias the forward vars since shared tiles store both primal/dual vars together
4022
+ else:
4023
+ lines += [f"{ctype} {name} = {{}};\n"]
4024
+
4025
+ if line_directive := adj.get_line_directive(lines[-1], var.relative_lineno):
4026
+ lines.insert(-1, f"{line_directive}\n")
4027
+
4028
+ # forward pass
4029
+ lines += ["//---------\n"]
4030
+ lines += ["// forward\n"]
4031
+
4032
+ for f in adj.blocks[0].body_replay:
4033
+ lines += [f + "\n"]
4034
+
4035
+ # reverse pass
4036
+ lines += ["//---------\n"]
4037
+ lines += ["// reverse\n"]
4038
+
4039
+ for l in reversed(adj.blocks[0].body_reverse):
4040
+ lines += [l + "\n"]
4041
+
4042
+ # In grid-stride kernels the reverse body is in a for loop
4043
+ if device == "cuda" and func_type == "kernel":
4044
+ lines += ["continue;\n"]
4045
+ else:
4046
+ lines += ["return;\n"]
4047
+
4048
+ return "".join(l.lstrip() if l.lstrip().startswith("#line") else indent_block + l for l in lines)
4049
+
4050
+
4051
+ def codegen_func(adj, c_func_name: str, device="cpu", options=None):
4052
+ if options is None:
4053
+ options = {}
4054
+
4055
+ if adj.return_var is not None and "return" in adj.arg_types:
4056
+ if get_origin(adj.arg_types["return"]) is tuple:
4057
+ if len(get_args(adj.arg_types["return"])) != len(adj.return_var):
4058
+ raise WarpCodegenError(
4059
+ f"The function `{adj.fun_name}` has its return type "
4060
+ f"annotated as a tuple of {len(get_args(adj.arg_types['return']))} elements "
4061
+ f"but the code returns {len(adj.return_var)} values."
4062
+ )
4063
+ elif not types_equal(adj.arg_types["return"], tuple(x.type for x in adj.return_var), match_generic=True):
4064
+ raise WarpCodegenError(
4065
+ f"The function `{adj.fun_name}` has its return type "
4066
+ f"annotated as `{warp._src.context.type_str(adj.arg_types['return'])}` "
4067
+ f"but the code returns a tuple with types `({', '.join(warp._src.context.type_str(x.type) for x in adj.return_var)})`."
4068
+ )
4069
+ elif len(adj.return_var) > 1 and get_origin(adj.arg_types["return"]) is not tuple:
4070
+ raise WarpCodegenError(
4071
+ f"The function `{adj.fun_name}` has its return type "
4072
+ f"annotated as `{warp._src.context.type_str(adj.arg_types['return'])}` "
4073
+ f"but the code returns {len(adj.return_var)} values."
4074
+ )
4075
+ elif not types_equal(adj.arg_types["return"], adj.return_var[0].type):
4076
+ raise WarpCodegenError(
4077
+ f"The function `{adj.fun_name}` has its return type "
4078
+ f"annotated as `{warp._src.context.type_str(adj.arg_types['return'])}` "
4079
+ f"but the code returns a value of type `{warp._src.context.type_str(adj.return_var[0].type)}`."
4080
+ )
4081
+ elif (
4082
+ isinstance(adj.return_var[0].type, warp._src.types.fixedarray)
4083
+ and type(adj.arg_types["return"]) is warp._src.types.array
4084
+ ):
4085
+ # If the return statement yields a `fixedarray` while the function is annotated
4086
+ # to return a standard `array`, then raise an error since the `fixedarray` storage
4087
+ # allocated on the stack will be freed once the function exits, meaning that the
4088
+ # resulting `array` instance will point to invalid data.
4089
+ raise WarpCodegenError(
4090
+ f"The function `{adj.fun_name}` returns a fixed-size array "
4091
+ f"whereas it has its return type annotated as "
4092
+ f"`{warp._src.context.type_str(adj.arg_types['return'])}`."
4093
+ )
4094
+
+     # Build line directive for function definition (subtract 1 to account for 1-indexing of AST line numbers)
+     # This is used as a catch-all C-to-Python source line mapping for any code that does not have
+     # a direct mapping to a Python source line.
+     func_line_directive = ""
+     if line_directive := adj.get_line_directive("", adj.fun_def_lineno - 1):
+         func_line_directive = f"{line_directive}\n"
+
+     # forward header
+     if adj.return_var is not None and len(adj.return_var) == 1:
+         return_type = adj.return_var[0].ctype()
+     else:
+         return_type = "void"
+
+     has_multiple_outputs = adj.return_var is not None and len(adj.return_var) != 1
+
+     forward_args = []
+     reverse_args = []
+
+     # forward args
+     for i, arg in enumerate(adj.args):
+         s = f"{arg.ctype()} {arg.emit()}"
+         forward_args.append(s)
+         if not adj.custom_reverse_mode or i < adj.custom_reverse_num_input_args:
+             reverse_args.append(s)
+     if has_multiple_outputs:
+         for i, arg in enumerate(adj.return_var):
+             forward_args.append(arg.ctype() + " & ret_" + str(i))
+             reverse_args.append(arg.ctype() + " & ret_" + str(i))
+
+     # reverse args
+     for i, arg in enumerate(adj.args):
+         if adj.custom_reverse_mode and i >= adj.custom_reverse_num_input_args:
+             break
+         # indexed array gradients are regular arrays
+         if isinstance(arg.type, indexedarray):
+             _arg = Var(arg.label, array(dtype=arg.type.dtype, ndim=arg.type.ndim))
+             reverse_args.append(_arg.ctype() + " & adj_" + arg.label)
+         else:
+             reverse_args.append(arg.ctype() + " & adj_" + arg.label)
+     if has_multiple_outputs:
+         for i, arg in enumerate(adj.return_var):
+             reverse_args.append(arg.ctype() + " & adj_ret_" + str(i))
+     elif return_type != "void":
+         reverse_args.append(return_type + " & adj_ret")
+     # custom output reverse args (user-declared)
+     if adj.custom_reverse_mode:
+         for arg in adj.args[adj.custom_reverse_num_input_args :]:
+             reverse_args.append(f"{arg.ctype()} & {arg.emit()}")
+
+     if device == "cpu":
+         forward_template = cpu_forward_function_template
+         reverse_template = cpu_reverse_function_template
+     elif device == "cuda":
+         forward_template = cuda_forward_function_template
+         reverse_template = cuda_reverse_function_template
+     else:
+         raise ValueError(f"Device {device} is not supported")
+
+     # codegen body
+     forward_body = codegen_func_forward(adj, func_type="function", device=device)
+
+     s = ""
+     if not adj.skip_forward_codegen:
+         s += forward_template.format(
+             name=c_func_name,
+             return_type=return_type,
+             forward_args=indent(forward_args),
+             forward_body=forward_body,
+             filename=adj.filename,
+             lineno=adj.fun_lineno,
+             line_directive=func_line_directive,
+         )
+
+     if not adj.skip_reverse_codegen:
+         if adj.custom_reverse_mode:
+             reverse_body = "\t// user-defined adjoint code\n" + forward_body
+         else:
+             if options.get("enable_backward", True) and adj.used_by_backward_kernel:
+                 reverse_body = codegen_func_reverse(adj, func_type="function", device=device)
+             else:
+                 reverse_body = '\t// reverse mode disabled (module option "enable_backward" is False or no dependent kernel found with "enable_backward")\n'
+         s += reverse_template.format(
+             name=c_func_name,
+             return_type=return_type,
+             reverse_args=indent(reverse_args),
+             forward_body=forward_body,
+             reverse_body=reverse_body,
+             filename=adj.filename,
+             lineno=adj.fun_lineno,
+             line_directive=func_line_directive,
+         )
+
+     return s
+
+
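The `custom_reverse_mode` branches above exist to support user-provided adjoints. For reference, a minimal sketch of the user-side pattern that triggers them, based on Warp's custom-gradient decorator (function names are illustrative):

import warp as wp

@wp.func
def square(x: float) -> float:
    return x * x

# With a user-defined adjoint, codegen keeps the forward body and prepends
# "// user-defined adjoint code" instead of generating a reverse pass.
@wp.func_grad(square)
def adj_square(x: float, adj_ret: float):
    wp.adjoint[x] += 2.0 * x * adj_ret

In this mode, arguments past `custom_reverse_num_input_args` are the user-declared adjoint outputs, which is why they are appended to `reverse_args` as-is rather than as `adj_`-prefixed gradients.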
+ def codegen_snippet(adj, name, snippet, adj_snippet, replay_snippet):
+     if adj.return_var is not None and len(adj.return_var) == 1:
+         return_type = adj.return_var[0].ctype()
+     else:
+         return_type = "void"
+
+     forward_args = []
+     reverse_args = []
+
+     # forward args
+     for _i, arg in enumerate(adj.args):
+         s = f"{arg.ctype()} {arg.emit().replace('var_', '')}"
+         forward_args.append(s)
+         reverse_args.append(s)
+
+     # reverse args
+     for _i, arg in enumerate(adj.args):
+         if isinstance(arg.type, indexedarray):
+             _arg = Var(arg.label, array(dtype=arg.type.dtype, ndim=arg.type.ndim))
+             reverse_args.append(_arg.ctype() + " & adj_" + arg.label)
+         else:
+             reverse_args.append(arg.ctype() + " & adj_" + arg.label)
+     if return_type != "void":
+         reverse_args.append(return_type + " & adj_ret")
+
+     forward_template = cuda_forward_function_template
+     replay_template = cuda_forward_function_template
+     reverse_template = cuda_reverse_function_template
+
+     s = ""
+     s += forward_template.format(
+         name=name,
+         return_type=return_type,
+         forward_args=indent(forward_args),
+         forward_body=snippet,
+         filename=adj.filename,
+         lineno=adj.fun_lineno,
+         line_directive="",
+     )
+
+     if replay_snippet is not None:
+         s += replay_template.format(
+             name="replay_" + name,
+             return_type=return_type,
+             forward_args=indent(forward_args),
+             forward_body=replay_snippet,
+             filename=adj.filename,
+             lineno=adj.fun_lineno,
+             line_directive="",
+         )
+
+     if adj_snippet:
+         reverse_body = adj_snippet
+     else:
+         reverse_body = ""
+
+     s += reverse_template.format(
+         name=name,
+         return_type=return_type,
+         reverse_args=indent(reverse_args),
+         forward_body=snippet,
+         reverse_body=reverse_body,
+         filename=adj.filename,
+         lineno=adj.fun_lineno,
+         line_directive="",
+     )
+
+     return s
+
+
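`codegen_snippet` is the back end for native code snippets, where the forward, adjoint, and optional replay bodies are raw CUDA strings supplied by the user rather than generated code. Note that the forward arguments drop the `var_` prefix (`arg.emit().replace('var_', '')`) so the snippet can refer to parameters by their Python names. A sketch of the user-side declaration following Warp's `@wp.func_native` pattern (argument names and snippet contents are illustrative):

import warp as wp

snippet = """
out[tid] = a * x[tid];
"""

adj_snippet = """
adj_x[tid] += a * adj_out[tid];
adj_a += x[tid] * adj_out[tid];
"""

@wp.func_native(snippet, adj_snippet)
def scale(a: float, x: wp.array(dtype=float), out: wp.array(dtype=float), tid: int):
    ...

A `replay_snippet`, when given, is emitted as a separate `replay_<name>` forward function for use when the tape is replayed.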
+ def codegen_kernel(kernel, device, options):
+     # Update the module's options with the ones defined on the kernel, if any.
+     options = dict(options)
+     options.update(kernel.options)
+
+     adj = kernel.adj
+
+     args_struct = ""
+     if device == "cpu":
+         args_struct = f"struct wp_args_{kernel.get_mangled_name()} {{\n"
+         for i in adj.args:
+             args_struct += f" {i.ctype()} {i.label};\n"
+         args_struct += "};\n"
+
+     # Build line directive for function definition (subtract 1 to account for 1-indexing of AST line numbers)
+     # This is used as a catch-all C-to-Python source line mapping for any code that does not have
+     # a direct mapping to a Python source line.
+     func_line_directive = ""
+     if line_directive := adj.get_line_directive("", adj.fun_def_lineno - 1):
+         func_line_directive = f"{line_directive}\n"
+
+     if device == "cpu":
+         template_forward = cpu_kernel_template_forward
+         template_backward = cpu_kernel_template_backward
+     elif device == "cuda":
+         template_forward = cuda_kernel_template_forward
+         template_backward = cuda_kernel_template_backward
+     else:
+         raise ValueError(f"Device {device} is not supported")
+
+     template = ""
+     template_fmt_args = {
+         "name": kernel.get_mangled_name(),
+     }
+
+     # build forward signature
+     forward_args = ["wp::launch_bounds_t dim"]
+     if device == "cpu":
+         forward_args.append("size_t task_index")
+     else:
+         for arg in adj.args:
+             forward_args.append(arg.ctype() + " var_" + arg.label)
+
+     forward_body = codegen_func_forward(adj, func_type="kernel", device=device)
+     template_fmt_args.update(
+         {
+             "forward_args": indent(forward_args),
+             "forward_body": forward_body,
+             "line_directive": func_line_directive,
+         }
+     )
+     template += template_forward
+
+     if options["enable_backward"]:
+         # build reverse signature
+         reverse_args = ["wp::launch_bounds_t dim"]
+         if device == "cpu":
+             reverse_args.append("size_t task_index")
+         else:
+             for arg in adj.args:
+                 reverse_args.append(arg.ctype() + " var_" + arg.label)
+             for arg in adj.args:
+                 # indexed array gradients are regular arrays
+                 if isinstance(arg.type, indexedarray):
+                     _arg = Var(arg.label, array(dtype=arg.type.dtype, ndim=arg.type.ndim))
+                     reverse_args.append(_arg.ctype() + " adj_" + arg.label)
+                 else:
+                     reverse_args.append(arg.ctype() + " adj_" + arg.label)
+
+         reverse_body = codegen_func_reverse(adj, func_type="kernel", device=device)
+         template_fmt_args.update(
+             {
+                 "reverse_args": indent(reverse_args),
+                 "reverse_body": reverse_body,
+             }
+         )
+         template += template_backward
+
+     s = template.format(**template_fmt_args)
+     return args_struct + s
+
+
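`codegen_kernel` emits the backward kernel only when `enable_backward` is still true after merging the module options with the kernel's own. For reference, a minimal sketch of turning backward codegen off using Warp's module-options API (the per-kernel override follows the same option name):

import warp as wp

# Skip backward code generation for every kernel defined in this module.
wp.set_module_options({"enable_backward": False})

@wp.kernel
def saxpy(a: float, x: wp.array(dtype=float), y: wp.array(dtype=float)):
    tid = wp.tid()
    y[tid] = a * x[tid] + y[tid]

When disabled, only the forward template is formatted, which shrinks the generated module and shortens compile times.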
+ def codegen_module(kernel, device, options):
+     if device != "cpu":
+         return ""
+
+     # Update the module's options with the ones defined on the kernel, if any.
+     options = dict(options)
+     options.update(kernel.options)
+
+     template = ""
+     template_fmt_args = {
+         "name": kernel.get_mangled_name(),
+     }
+
+     template += cpu_module_template_forward
+
+     if options["enable_backward"]:
+         template += cpu_module_template_backward
+
+     s = template.format(**template_fmt_args)
+     return s
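`codegen_module` only matters on the CPU path: it formats the exported forward (and, when enabled, backward) entry points that a launch goes through together with the `wp_args_...` struct built in `codegen_kernel`, while CUDA kernels are launched directly, hence the early return. A minimal end-to-end launch for reference:

import warp as wp

@wp.kernel
def inc(x: wp.array(dtype=float)):
    tid = wp.tid()
    x[tid] = x[tid] + 1.0

x = wp.zeros(8, dtype=float, device="cpu")
wp.launch(inc, dim=8, inputs=[x], device="cpu")
print(x.numpy())  # [1. 1. 1. 1. 1. 1. 1. 1.]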