PyPI - warp-lang - Versions diffs - 1.6.2__py3-none-macosx_10_13_universal2.whl → 1.7.1__py3-none-macosx_10_13_universal2.whl - Mend

warp-lang 1.6.2__py3-none-macosx_10_13_universal2.whl → 1.7.1__py3-none-macosx_10_13_universal2.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (191)

warp/__init__.py +7 -1
warp/autograd.py +12 -2
warp/bin/libwarp-clang.dylib +0 -0
warp/bin/libwarp.dylib +0 -0
warp/build.py +410 -0
warp/build_dll.py +6 -14
warp/builtins.py +463 -372
warp/codegen.py +196 -124
warp/config.py +42 -6
warp/context.py +496 -271
warp/dlpack.py +8 -6
warp/examples/assets/nonuniform.usd +0 -0
warp/examples/assets/nvidia_logo.png +0 -0
warp/examples/benchmarks/benchmark_cloth.py +1 -1
warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
warp/examples/core/example_sample_mesh.py +300 -0
warp/examples/distributed/example_jacobi_mpi.py +507 -0
warp/examples/fem/example_apic_fluid.py +1 -1
warp/examples/fem/example_burgers.py +2 -2
warp/examples/fem/example_deformed_geometry.py +1 -1
warp/examples/fem/example_distortion_energy.py +1 -1
warp/examples/fem/example_magnetostatics.py +6 -6
warp/examples/fem/utils.py +9 -3
warp/examples/interop/example_jax_callable.py +116 -0
warp/examples/interop/example_jax_ffi_callback.py +132 -0
warp/examples/interop/example_jax_kernel.py +205 -0
warp/examples/optim/example_fluid_checkpoint.py +497 -0
warp/examples/tile/example_tile_matmul.py +2 -4
warp/fem/__init__.py +11 -1
warp/fem/adaptivity.py +4 -4
warp/fem/field/field.py +11 -1
warp/fem/field/nodal_field.py +56 -88
warp/fem/field/virtual.py +62 -23
warp/fem/geometry/adaptive_nanogrid.py +16 -13
warp/fem/geometry/closest_point.py +1 -1
warp/fem/geometry/deformed_geometry.py +5 -2
warp/fem/geometry/geometry.py +5 -0
warp/fem/geometry/grid_2d.py +12 -12
warp/fem/geometry/grid_3d.py +12 -15
warp/fem/geometry/hexmesh.py +5 -7
warp/fem/geometry/nanogrid.py +9 -11
warp/fem/geometry/quadmesh.py +13 -13
warp/fem/geometry/tetmesh.py +3 -4
warp/fem/geometry/trimesh.py +7 -20
warp/fem/integrate.py +262 -93
warp/fem/linalg.py +5 -5
warp/fem/quadrature/pic_quadrature.py +37 -22
warp/fem/quadrature/quadrature.py +194 -25
warp/fem/space/__init__.py +1 -1
warp/fem/space/basis_function_space.py +4 -2
warp/fem/space/basis_space.py +25 -18
warp/fem/space/hexmesh_function_space.py +2 -2
warp/fem/space/partition.py +6 -2
warp/fem/space/quadmesh_function_space.py +8 -8
warp/fem/space/shape/cube_shape_function.py +23 -23
warp/fem/space/shape/square_shape_function.py +12 -12
warp/fem/space/shape/triangle_shape_function.py +1 -1
warp/fem/space/tetmesh_function_space.py +3 -3
warp/fem/space/trimesh_function_space.py +2 -2
warp/fem/utils.py +12 -6
warp/jax.py +14 -1
warp/jax_experimental/__init__.py +16 -0
warp/{jax_experimental.py → jax_experimental/custom_call.py} +28 -29
warp/jax_experimental/ffi.py +702 -0
warp/jax_experimental/xla_ffi.py +602 -0
warp/math.py +89 -0
warp/native/array.h +13 -0
warp/native/builtin.h +29 -3
warp/native/bvh.cpp +3 -1
warp/native/bvh.cu +42 -14
warp/native/bvh.h +2 -1
warp/native/clang/clang.cpp +30 -3
warp/native/cuda_util.cpp +14 -0
warp/native/cuda_util.h +2 -0
warp/native/exports.h +68 -63
warp/native/intersect.h +26 -26
warp/native/intersect_adj.h +33 -33
warp/native/marching.cu +1 -1
warp/native/mat.h +513 -9
warp/native/mesh.h +10 -10
warp/native/quat.h +99 -11
warp/native/rand.h +6 -0
warp/native/sort.cpp +122 -59
warp/native/sort.cu +152 -15
warp/native/sort.h +8 -1
warp/native/sparse.cpp +43 -22
warp/native/sparse.cu +52 -17
warp/native/svd.h +116 -0
warp/native/tile.h +312 -116
warp/native/tile_reduce.h +46 -3
warp/native/vec.h +68 -7
warp/native/volume.cpp +85 -113
warp/native/volume_builder.cu +25 -10
warp/native/volume_builder.h +6 -0
warp/native/warp.cpp +5 -6
warp/native/warp.cu +100 -11
warp/native/warp.h +19 -10
warp/optim/linear.py +10 -10
warp/render/render_opengl.py +19 -17
warp/render/render_usd.py +93 -3
warp/sim/articulation.py +4 -4
warp/sim/collide.py +32 -19
warp/sim/import_mjcf.py +449 -155
warp/sim/import_urdf.py +32 -12
warp/sim/inertia.py +189 -156
warp/sim/integrator_euler.py +8 -5
warp/sim/integrator_featherstone.py +3 -10
warp/sim/integrator_vbd.py +207 -2
warp/sim/integrator_xpbd.py +8 -5
warp/sim/model.py +71 -25
warp/sim/render.py +4 -0
warp/sim/utils.py +2 -2
warp/sparse.py +642 -555
warp/stubs.py +217 -20
warp/tests/__main__.py +0 -15
warp/tests/assets/torus.usda +1 -1
warp/tests/cuda/__init__.py +0 -0
warp/tests/{test_mempool.py → cuda/test_mempool.py} +39 -0
warp/tests/{test_streams.py → cuda/test_streams.py} +71 -0
warp/tests/geometry/__init__.py +0 -0
warp/tests/{test_mesh_query_point.py → geometry/test_mesh_query_point.py} +66 -63
warp/tests/{test_mesh_query_ray.py → geometry/test_mesh_query_ray.py} +1 -1
warp/tests/{test_volume.py → geometry/test_volume.py} +41 -6
warp/tests/interop/__init__.py +0 -0
warp/tests/{test_dlpack.py → interop/test_dlpack.py} +28 -5
warp/tests/sim/__init__.py +0 -0
warp/tests/{disabled_kinematics.py → sim/disabled_kinematics.py} +9 -10
warp/tests/{test_collision.py → sim/test_collision.py} +236 -205
warp/tests/sim/test_inertia.py +161 -0
warp/tests/{test_model.py → sim/test_model.py} +40 -0
warp/tests/{flaky_test_sim_grad.py → sim/test_sim_grad.py} +4 -0
warp/tests/{test_sim_kinematics.py → sim/test_sim_kinematics.py} +2 -1
warp/tests/sim/test_vbd.py +597 -0
warp/tests/sim/test_xpbd.py +399 -0
warp/tests/test_bool.py +1 -1
warp/tests/test_codegen.py +24 -3
warp/tests/test_examples.py +40 -38
warp/tests/test_fem.py +98 -14
warp/tests/test_linear_solvers.py +0 -11
warp/tests/test_mat.py +577 -156
warp/tests/test_mat_scalar_ops.py +4 -4
warp/tests/test_overwrite.py +0 -60
warp/tests/test_quat.py +356 -151
warp/tests/test_rand.py +44 -37
warp/tests/test_sparse.py +47 -6
warp/tests/test_spatial.py +75 -0
warp/tests/test_static.py +1 -1
warp/tests/test_utils.py +84 -4
warp/tests/test_vec.py +336 -178
warp/tests/tile/__init__.py +0 -0
warp/tests/{test_tile.py → tile/test_tile.py} +136 -51
warp/tests/{test_tile_load.py → tile/test_tile_load.py} +98 -1
warp/tests/{test_tile_mathdx.py → tile/test_tile_mathdx.py} +9 -6
warp/tests/{test_tile_mlp.py → tile/test_tile_mlp.py} +25 -14
warp/tests/{test_tile_reduce.py → tile/test_tile_reduce.py} +60 -1
warp/tests/{test_tile_view.py → tile/test_tile_view.py} +1 -1
warp/tests/unittest_serial.py +1 -0
warp/tests/unittest_suites.py +45 -62
warp/tests/unittest_utils.py +2 -1
warp/thirdparty/unittest_parallel.py +3 -1
warp/types.py +175 -666
warp/utils.py +137 -72
{warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/METADATA +46 -12
{warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/RECORD +184 -171
{warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/WHEEL +1 -1
{warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info/licenses}/LICENSE.md +0 -26
warp/examples/optim/example_walker.py +0 -317
warp/native/cutlass_gemm.cpp +0 -43
warp/native/cutlass_gemm.cu +0 -382
warp/tests/test_matmul.py +0 -511
warp/tests/test_matmul_lite.py +0 -411
warp/tests/test_vbd.py +0 -386
warp/tests/unused_test_misc.py +0 -77
/warp/tests/{test_async.py → cuda/test_async.py} +0 -0
/warp/tests/{test_ipc.py → cuda/test_ipc.py} +0 -0
/warp/tests/{test_multigpu.py → cuda/test_multigpu.py} +0 -0
/warp/tests/{test_peer.py → cuda/test_peer.py} +0 -0
/warp/tests/{test_pinned.py → cuda/test_pinned.py} +0 -0
/warp/tests/{test_bvh.py → geometry/test_bvh.py} +0 -0
/warp/tests/{test_hash_grid.py → geometry/test_hash_grid.py} +0 -0
/warp/tests/{test_marching_cubes.py → geometry/test_marching_cubes.py} +0 -0
/warp/tests/{test_mesh.py → geometry/test_mesh.py} +0 -0
/warp/tests/{test_mesh_query_aabb.py → geometry/test_mesh_query_aabb.py} +0 -0
/warp/tests/{test_volume_write.py → geometry/test_volume_write.py} +0 -0
/warp/tests/{test_jax.py → interop/test_jax.py} +0 -0
/warp/tests/{test_paddle.py → interop/test_paddle.py} +0 -0
/warp/tests/{test_torch.py → interop/test_torch.py} +0 -0
/warp/tests/{test_coloring.py → sim/test_coloring.py} +0 -0
/warp/tests/{test_sim_grad_bounce_linear.py → sim/test_sim_grad_bounce_linear.py} +0 -0
/warp/tests/{test_tile_shared_memory.py → tile/test_tile_shared_memory.py} +0 -0
{warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/top_level.txt +0 -0

warp/codegen.py CHANGED

@@ -26,7 +26,7 @@ import re
 import sys
 import textwrap
 import types
-from typing import Any, Callable, Dict, Mapping, Optional, Sequence
+from typing import Any, Callable, Dict, Mapping, Optional, Sequence, get_args, get_origin
 import warp.config
 from warp.types import *
@@ -57,7 +57,7 @@ class WarpCodegenKeyError(KeyError):
 # map operator to function name
-builtin_operators = {}
+builtin_operators: Dict[type[ast.AST], str] = {}
 # see https://www.ics.uci.edu/~pattis/ICS-31/lectures/opexp.pdf for a
 # nice overview of python operators
@@ -122,16 +122,6 @@ def get_closure_cell_contents(obj):
     return None
-def get_type_origin(tp):
-    # Compatible version of `typing.get_origin()` for Python 3.7 and older.
-    return getattr(tp, "__origin__", None)
-def get_type_args(tp):
-    # Compatible version of `typing.get_args()` for Python 3.7 and older.
-    return getattr(tp, "__args__", ())
 def eval_annotations(annotations: Mapping[str, Any], obj: Any) -> Mapping[str, Any]:
     """Un-stringize annotations caused by `from __future__ import annotations` of PEP 563."""
     # Implementation backported from `inspect.get_annotations()` for Python 3.9 and older.
@@ -212,7 +202,7 @@ def get_full_arg_spec(func: Callable) -> inspect.FullArgSpec:
     return spec._replace(annotations=eval_annotations(spec.annotations, func))
-def struct_instance_repr_recursive(inst: StructInstance, depth: int) -> str:
+def struct_instance_repr_recursive(inst: StructInstance, depth: int, use_repr: bool) -> str:
     indent = "\t"
     # handle empty structs
@@ -226,9 +216,12 @@ def struct_instance_repr_recursive(inst: StructInstance, depth: int) -> str:
         field_value = getattr(inst, field_name, None)
         if isinstance(field_value, StructInstance):
-            field_value = struct_instance_repr_recursive(field_value, depth + 1)
+            field_value = struct_instance_repr_recursive(field_value, depth + 1, use_repr)
-        lines.append(f"{indent * (depth + 1)}{field_name}={field_value},")
+        if use_repr:
+            lines.append(f"{indent * (depth + 1)}{field_name}={field_value!r},")
+        else:
+            lines.append(f"{indent * (depth + 1)}{field_name}={field_value!s},")
     lines.append(f"{indent * depth})")
     return "\n".join(lines)
@@ -351,7 +344,10 @@ class StructInstance:
         return self._ctype
     def __repr__(self):
-        return struct_instance_repr_recursive(self, 0)
+        return struct_instance_repr_recursive(self, 0, use_repr=True)
+    def __str__(self):
+        return struct_instance_repr_recursive(self, 0, use_repr=False)
     def to(self, device):
         """Copies this struct with all array members moved onto the given device.
@@ -415,12 +411,14 @@ class StructInstance:
 class Struct:
-    def __init__(self, cls, key, module):
+    hash: bytes
+    def __init__(self, cls: type, key: str, module: warp.context.Module):
         self.cls = cls
         self.module = module
         self.key = key
+        self.vars: Dict[str, Var] = {}
-        self.vars = {}
         annotations = get_annotations(self.cls)
         for label, type in annotations.items():
             self.vars[label] = Var(label, type)
@@ -591,11 +589,11 @@ class Reference:
         self.value_type = value_type
-def is_reference(type):
+def is_reference(type: Any) -> builtins.bool:
     return isinstance(type, Reference)
-def strip_reference(arg):
+def strip_reference(arg: Any) -> Any:
     if is_reference(arg):
         return arg.value_type
     else:
@@ -623,7 +621,15 @@ def compute_type_str(base_name, template_params):
 class Var:
-    def __init__(self, label, type, requires_grad=False, constant=None, prefix=True):
+    def __init__(
+        self,
+        label: str,
+        type: type,
+        requires_grad: builtins.bool = False,
+        constant: Optional[builtins.bool] = None,
+        prefix: builtins.bool = True,
+        relative_lineno: Optional[int] = None,
+    ):
         # convert built-in types to wp types
         if type == float:
             type = float32
@@ -646,11 +652,14 @@ class Var:
         # used to associate a view array Var with its parent array Var
         self.parent = None
+        # Used to associate the variable with the Python statement that resulted in it being created.
+        self.relative_lineno = relative_lineno
     def __str__(self):
         return self.label
     @staticmethod
-    def type_to_ctype(t, value_type=False):
+    def type_to_ctype(t: type, value_type: builtins.bool = False) -> str:
         if is_array(t):
             if hasattr(t.dtype, "_wp_generic_type_str_"):
                 dtypestr = compute_type_str(f"wp::{t.dtype._wp_generic_type_str_}", t.dtype._wp_type_params_)
@@ -681,7 +690,7 @@ class Var:
         else:
             return f"wp::{t.__name__}"
-    def ctype(self, value_type=False):
+    def ctype(self, value_type: builtins.bool = False) -> str:
         return Var.type_to_ctype(self.type, value_type)
     def emit(self, prefix: str = "var"):
@@ -803,7 +812,7 @@ def func_match_args(func, arg_types, kwarg_types):
     return True
-def get_arg_type(arg: Union[Var, Any]):
+def get_arg_type(arg: Union[Var, Any]) -> type:
     if isinstance(arg, str):
         return str
@@ -819,7 +828,7 @@ def get_arg_type(arg: Union[Var, Any]):
     return type(arg)
-def get_arg_value(arg: Union[Var, Any]):
+def get_arg_value(arg: Any) -> Any:
     if isinstance(arg, Sequence):
         return tuple(get_arg_value(x) for x in arg)
@@ -867,6 +876,9 @@ class Adjoint:
                 "please save it on a file and use `importlib` if needed."
             ) from e
+        # Indicates where the function definition starts (excludes decorators)
+        adj.fun_def_lineno = None
         # get function source code
         adj.source = inspect.getsource(func)
         # ensures that indented class methods can be parsed as kernels
@@ -941,9 +953,6 @@ class Adjoint:
         # for unit testing errors being spit out from kernels.
         adj.skip_build = False
-        # Collect the LTOIR required at link-time
-        adj.ltoirs = []
     # allocate extra space for a function call that requires its
     # own shared memory space, we treat shared memory as a stack
     # where each function pushes and pops space off, the extra
@@ -1133,7 +1142,7 @@ class Adjoint:
         name = str(index)
         # allocate new variable
-        v = Var(name, type=type, constant=constant)
+        v = Var(name, type=type, constant=constant, relative_lineno=adj.lineno)
         adj.variables.append(v)
@@ -1158,11 +1167,44 @@ class Adjoint:
         return var
-    # append a statement to the forward pass
-    def add_forward(adj, statement, replay=None, skip_replay=False):
+    def get_line_directive(adj, statement: str, relative_lineno: Optional[int] = None) -> Optional[str]:
+        """Get a line directive for the given statement.
+        Args:
+            statement: The statement to get the line directive for.
+            relative_lineno: The line number of the statement relative to the function.
+        Returns:
+            A line directive for the given statement, or None if no line directive is needed.
+        """
+        # lineinfo is enabled by default in debug mode regardless of the builder option, don't want to unnecessarily
+        # emit line directives in generated code if it's not being compiled with line information
+        lineinfo_enabled = (
+            adj.builder_options.get("lineinfo", False) or adj.builder_options.get("mode", "release") == "debug"
+        )
+        if relative_lineno is not None and lineinfo_enabled and warp.config.line_directives:
+            is_comment = statement.strip().startswith("//")
+            if not is_comment:
+                line = relative_lineno + adj.fun_lineno
+                # Convert backslashes to forward slashes for CUDA compatibility
+                normalized_path = adj.filename.replace("\\", "/")
+                return f'#line {line} "{normalized_path}"'
+        return None
+    def add_forward(adj, statement: str, replay: Optional[str] = None, skip_replay: builtins.bool = False) -> None:
+        """Append a statement to the forward pass."""
+        if line_directive := adj.get_line_directive(statement, adj.lineno):
+            adj.blocks[-1].body_forward.append(line_directive)
         adj.blocks[-1].body_forward.append(adj.indentation + statement)
         if not skip_replay:
+            if line_directive:
+                adj.blocks[-1].body_replay.append(line_directive)
             if replay:
                 # if custom replay specified then output it
                 adj.blocks[-1].body_replay.append(adj.indentation + replay)
@@ -1171,9 +1213,14 @@ class Adjoint:
                 adj.blocks[-1].body_replay.append(adj.indentation + statement)
     # append a statement to the reverse pass
-    def add_reverse(adj, statement):
+    def add_reverse(adj, statement: str) -> None:
+        """Append a statement to the reverse pass."""
         adj.blocks[-1].body_reverse.append(adj.indentation + statement)
+        if line_directive := adj.get_line_directive(statement, adj.lineno):
+            adj.blocks[-1].body_reverse.append(line_directive)
     def add_constant(adj, n):
         output = adj.add_var(type=type(n), constant=n)
         return output
@@ -1281,7 +1328,7 @@ class Adjoint:
         # Bind the positional and keyword arguments to the function's signature
         # in order to process them as Python does it.
-        bound_args = func.signature.bind(*args, **kwargs)
+        bound_args: inspect.BoundArguments = func.signature.bind(*args, **kwargs)
         # Type args are the “compile time” argument values we get from codegen.
         # For example, when calling `wp.vec3f(...)` from within a kernel,
@@ -1451,6 +1498,8 @@ class Adjoint:
     def add_return(adj, var):
         if var is None or len(var) == 0:
+            # NOTE: If this kernel gets compiled for a CUDA device, then we need
+            # to convert the return; into a continue; in codegen_func_forward()
             adj.add_forward("return;", f"goto label{adj.label_count};")
         elif len(var) == 1:
             adj.add_forward(f"return {var[0].emit()};", f"goto label{adj.label_count};")
@@ -1624,6 +1673,8 @@ class Adjoint:
         adj.blocks[-1].body_reverse.extend(reversed(reverse))
     def emit_FunctionDef(adj, node):
+        adj.fun_def_lineno = node.lineno
         for f in node.body:
             # Skip variable creation for standalone constants, including docstrings
             if isinstance(f, ast.Expr) and isinstance(f.value, ast.Constant):
@@ -1688,7 +1739,7 @@ class Adjoint:
             if var1 != var2:
                 # insert a phi function that selects var1, var2 based on cond
-                out = adj.add_builtin_call("select", [cond, var1, var2])
+                out = adj.add_builtin_call("where", [cond, var2, var1])
                 adj.symbols[sym] = out
         symbols_prev = adj.symbols.copy()
@@ -1712,7 +1763,7 @@ class Adjoint:
             if var1 != var2:
                 # insert a phi function that selects var1, var2 based on cond
                 # note the reversed order of vars since we want to use !cond as our select
-                out = adj.add_builtin_call("select", [cond, var2, var1])
+                out = adj.add_builtin_call("where", [cond, var1, var2])
                 adj.symbols[sym] = out
     def emit_Compare(adj, node):
@@ -1856,25 +1907,6 @@ class Adjoint:
                 ) from e
             raise WarpCodegenAttributeError(f"Error, `{node.attr}` is not an attribute of '{aggregate}'") from e
-    def emit_String(adj, node):
-        # string constant
-        return adj.add_constant(node.s)
-    def emit_Num(adj, node):
-        # lookup constant, if it has already been assigned then return existing var
-        key = (node.n, type(node.n))
-        if key in adj.symbols:
-            return adj.symbols[key]
-        else:
-            out = adj.add_constant(node.n)
-            adj.symbols[key] = out
-            return out
-    def emit_Ellipsis(adj, node):
-        # stubbed @wp.native_func
-        return
     def emit_Assert(adj, node):
         # eval condition
         cond = adj.eval(node.test)
@@ -1886,24 +1918,11 @@ class Adjoint:
         adj.add_forward(f'assert(("{escaped_segment}",{cond.emit()}));')
-    def emit_NameConstant(adj, node):
-        if node.value:
-            return adj.add_constant(node.value)
-        elif node.value is None:
-            raise WarpCodegenTypeError("None type unsupported")
-        else:
-            return adj.add_constant(False)
     def emit_Constant(adj, node):
-        if isinstance(node, ast.Str):
-            return adj.emit_String(node)
-        elif isinstance(node, ast.Num):
-            return adj.emit_Num(node)
-        elif isinstance(node, ast.Ellipsis):
-            return adj.emit_Ellipsis(node)
+        if node.value is None:
+            raise WarpCodegenTypeError("None type unsupported")
         else:
-            assert isinstance(node, ast.NameConstant) or isinstance(node, ast.Constant)
-            return adj.emit_NameConstant(node)
+            return adj.add_constant(node.value)
     def emit_BinOp(adj, node):
         # evaluate binary operator arguments
@@ -1997,10 +2016,11 @@ class Adjoint:
         adj.end_while()
     def eval_num(adj, a):
-        if isinstance(a, ast.Num):
-            return True, a.n
-        if isinstance(a, ast.UnaryOp) and isinstance(a.op, ast.USub) and isinstance(a.operand, ast.Num):
-            return True, -a.operand.n
+        if isinstance(a, ast.Constant):
+            return True, a.value
+        if isinstance(a, ast.UnaryOp) and isinstance(a.op, ast.USub) and isinstance(a.operand, ast.Constant):
+            # Negative constant
+            return True, -a.operand.value
         # try and resolve the expression to an object
         # e.g.: wp.constant in the globals scope
@@ -2530,8 +2550,8 @@ class Adjoint:
                             f"Warning: mutating {node_source} in function {adj.fun_name} at {adj.filename}:{lineno}: this is a non-differentiable operation.\n{line}\n"
                         )
                 else:
-                    if adj.builder_options.get("enable_backward", True):
-                        out = adj.add_builtin_call("assign", [target, *indices, rhs])
+                    if warp.config.enable_vector_component_overwrites:
+                        out = adj.add_builtin_call("assign_copy", [target, *indices, rhs])
                         # re-point target symbol to out var
                         for id in adj.symbols:
@@ -2539,8 +2559,7 @@ class Adjoint:
                                 adj.symbols[id] = out
                                 break
                     else:
-                        attr = adj.add_builtin_call("index", [target, *indices])
-                        adj.add_builtin_call("store", [attr, rhs])
+                        adj.add_builtin_call("assign_inplace", [target, *indices, rhs])
             else:
                 raise WarpCodegenError(
@@ -2583,8 +2602,8 @@ class Adjoint:
                     attr = adj.add_builtin_call("indexref", [aggregate, index])
                     adj.add_builtin_call("store", [attr, rhs])
                 else:
-                    if adj.builder_options.get("enable_backward", True):
-                        out = adj.add_builtin_call("assign", [aggregate, index, rhs])
+                    if warp.config.enable_vector_component_overwrites:
+                        out = adj.add_builtin_call("assign_copy", [aggregate, index, rhs])
                         # re-point target symbol to out var
                         for id in adj.symbols:
@@ -2592,8 +2611,7 @@ class Adjoint:
                                 adj.symbols[id] = out
                                 break
                     else:
-                        attr = adj.add_builtin_call("index", [aggregate, index])
-                        adj.add_builtin_call("store", [attr, rhs])
+                        adj.add_builtin_call("assign_inplace", [aggregate, index, rhs])
             else:
                 attr = adj.emit_Attribute(lhs)
@@ -2699,10 +2717,12 @@ class Adjoint:
             elif type_is_vector(target_type) or type_is_quaternion(target_type) or type_is_matrix(target_type):
                 if isinstance(node.op, ast.Add):
-                    adj.add_builtin_call("augassign_add", [target, *indices, rhs])
+                    adj.add_builtin_call("add_inplace", [target, *indices, rhs])
                 elif isinstance(node.op, ast.Sub):
-                    adj.add_builtin_call("augassign_sub", [target, *indices, rhs])
+                    adj.add_builtin_call("sub_inplace", [target, *indices, rhs])
                 else:
+                    if warp.config.verbose:
+                        print(f"Warning: in-place op {node.op} is not differentiable")
                     make_new_assign_statement()
                     return
@@ -2732,9 +2752,6 @@ class Adjoint:
         ast.BoolOp: emit_BoolOp,
         ast.Name: emit_Name,
         ast.Attribute: emit_Attribute,
-        ast.Str: emit_String,  # Deprecated in 3.8; use Constant
-        ast.Num: emit_Num,  # Deprecated in 3.8; use Constant
-        ast.NameConstant: emit_NameConstant,  # Deprecated in 3.8; use Constant
         ast.Constant: emit_Constant,
         ast.BinOp: emit_BinOp,
         ast.UnaryOp: emit_UnaryOp,
@@ -2744,14 +2761,13 @@ class Adjoint:
         ast.Continue: emit_Continue,
         ast.Expr: emit_Expr,
         ast.Call: emit_Call,
-        ast.Index: emit_Index,  # Deprecated in 3.8; Use the index value directly instead.
+        ast.Index: emit_Index,  # Deprecated in 3.9
         ast.Subscript: emit_Subscript,
         ast.Assign: emit_Assign,
         ast.Return: emit_Return,
         ast.AugAssign: emit_AugAssign,
         ast.Tuple: emit_Tuple,
         ast.Pass: emit_Pass,
-        ast.Ellipsis: emit_Ellipsis,
         ast.Assert: emit_Assert,
     }
@@ -2947,12 +2963,16 @@ class Adjoint:
             # We want to replace the expression code in-place,
             # so reparse it to get the correct column info.
-            len_value_locs = []
+            len_value_locs: List[Tuple[int, int, int]] = []
             expr_tree = ast.parse(static_code)
             assert len(expr_tree.body) == 1 and isinstance(expr_tree.body[0], ast.Expr)
             expr_root = expr_tree.body[0].value
             for expr_node in ast.walk(expr_root):
-                if isinstance(expr_node, ast.Call) and expr_node.func.id == "len" and len(expr_node.args) == 1:
+                if (
+                    isinstance(expr_node, ast.Call)
+                    and getattr(expr_node.func, "id", None) == "len"
+                    and len(expr_node.args) == 1
+                ):
                     len_expr = static_code[expr_node.col_offset : expr_node.end_col_offset]
                     try:
                         len_value = eval(len_expr, len_expr_ctx)
@@ -3110,9 +3130,9 @@ class Adjoint:
         local_variables = set()  # Track local variables appearing on the LHS so we know when variables are shadowed
-        constants = {}
-        types = {}
-        functions = {}
+        constants: Dict[str, Any] = {}
+        types: Dict[Union[Struct, type], Any] = {}
+        functions: Dict[warp.context.Function, Any] = {}
         for node in ast.walk(adj.tree):
             if isinstance(node, ast.Name) and node.id not in local_variables:
@@ -3155,7 +3175,7 @@ class Adjoint:
 # code generation
 cpu_module_header = """
-#define WP_TILE_BLOCK_DIM {tile_size}
+#define WP_TILE_BLOCK_DIM {block_dim}
 #define WP_NO_CRT
 #include "builtin.h"
@@ -3174,7 +3194,7 @@ cpu_module_header = """
 """
 cuda_module_header = """
-#define WP_TILE_BLOCK_DIM {tile_size}
+#define WP_TILE_BLOCK_DIM {block_dim}
 #define WP_NO_CRT
 #include "builtin.h"
@@ -3197,6 +3217,7 @@ struct {name}
 {{
 {struct_body}
+    {defaulted_constructor_def}
     CUDA_CALLABLE {name}({forward_args})
     {forward_initializers}
     {{
@@ -3239,53 +3260,53 @@ static void adj_{name}(
 cuda_forward_function_template = """
 // {filename}:{lineno}
-static CUDA_CALLABLE {return_type} {name}(
+{line_directive}static CUDA_CALLABLE {return_type} {name}(
     {forward_args})
 {{
-{forward_body}}}
+{forward_body}{line_directive}}}
 """
 cuda_reverse_function_template = """
 // {filename}:{lineno}
-static CUDA_CALLABLE void adj_{name}(
+{line_directive}static CUDA_CALLABLE void adj_{name}(
     {reverse_args})
 {{
-{reverse_body}}}
+{reverse_body}{line_directive}}}
 """
 cuda_kernel_template_forward = """
-extern "C" __global__ void {name}_cuda_kernel_forward(
+{line_directive}extern "C" __global__ void {name}_cuda_kernel_forward(
     {forward_args})
 {{
-    for (size_t _idx = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
-         _idx < dim.size;
-         _idx += static_cast<size_t>(blockDim.x) * static_cast<size_t>(gridDim.x))
+{line_directive}    for (size_t _idx = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
+{line_directive}         _idx < dim.size;
+{line_directive}         _idx += static_cast<size_t>(blockDim.x) * static_cast<size_t>(gridDim.x))
     {{
         // reset shared memory allocator
-        wp::tile_alloc_shared(0, true);
+{line_directive}        wp::tile_alloc_shared(0, true);
-{forward_body}    }}
-}}
+{forward_body}{line_directive}    }}
+{line_directive}}}
 """
 cuda_kernel_template_backward = """
-extern "C" __global__ void {name}_cuda_kernel_backward(
+{line_directive}extern "C" __global__ void {name}_cuda_kernel_backward(
     {reverse_args})
 {{
-    for (size_t _idx = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
-         _idx < dim.size;
-         _idx += static_cast<size_t>(blockDim.x) * static_cast<size_t>(gridDim.x))
+{line_directive}    for (size_t _idx = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
+{line_directive}         _idx < dim.size;
+{line_directive}         _idx += static_cast<size_t>(blockDim.x) * static_cast<size_t>(gridDim.x))
     {{
         // reset shared memory allocator
-        wp::tile_alloc_shared(0, true);
+{line_directive}        wp::tile_alloc_shared(0, true);
-{reverse_body}    }}
-}}
+{reverse_body}{line_directive}    }}
+{line_directive}}}
 """
@@ -3315,10 +3336,17 @@ extern "C" {{
 WP_API void {name}_cpu_forward(
     {forward_args})
 {{
-    for (size_t task_index = 0; task_index < dim.size; ++task_index)
+for (size_t task_index = 0; task_index < dim.size; ++task_index)
     {{
+        // init shared memory allocator
+        wp::tile_alloc_shared(0, true);
         {name}_cpu_kernel_forward(
             {forward_params});
+        // check shared memory allocator
+        wp::tile_alloc_shared(0, false, true);
     }}
 }}
@@ -3335,8 +3363,14 @@ WP_API void {name}_cpu_backward(
 {{
     for (size_t task_index = 0; task_index < dim.size; ++task_index)
     {{
+        // initialize shared memory allocator
+        wp::tile_alloc_shared(0, true);
         {name}_cpu_kernel_backward(
             {reverse_params});
+        // check shared memory allocator
+        wp::tile_alloc_shared(0, false, true);
     }}
 }}
@@ -3418,7 +3452,7 @@ def indent(args, stops=1):
 # generates a C function name based on the python function name
-def make_full_qualified_name(func):
+def make_full_qualified_name(func: Union[str, Callable]) -> str:
     if not isinstance(func, str):
         func = func.__qualname__
     return re.sub("[^0-9a-zA-Z_]+", "", func.replace(".", "__"))
@@ -3448,7 +3482,8 @@ def codegen_struct(struct, device="cpu", indent_size=4):
     # forward args
     for label, var in struct.vars.items():
         var_ctype = var.ctype()
-        forward_args.append(f"{var_ctype} const& {label} = {{}}")
+        default_arg_def = " = {}" if forward_args else ""
+        forward_args.append(f"{var_ctype} const& {label}{default_arg_def}")
         reverse_args.append(f"{var_ctype} const&")
         namespace = "wp::" if var_ctype.startswith("wp::") or var_ctype == "bool" else ""
@@ -3472,6 +3507,9 @@ def codegen_struct(struct, device="cpu", indent_size=4):
     reverse_args.append(name + " & adj_ret")
+    # explicitly defaulted default constructor if no default constructor has been defined
+    defaulted_constructor_def = f"{name}() = default;" if forward_args else ""
     return struct_template.format(
         name=name,
         struct_body="".join([indent_block + l for l in body]),
@@ -3481,6 +3519,7 @@ def codegen_struct(struct, device="cpu", indent_size=4):
         reverse_body="".join(reverse_body),
         prefix_add_body="".join(prefix_add_body),
         atomic_add_body="".join(atomic_add_body),
+        defaulted_constructor_def=defaulted_constructor_def,
     )
@@ -3510,14 +3549,21 @@ def codegen_func_forward(adj, func_type="kernel", device="cpu"):
         else:
             lines += [f"const {var.ctype()} {var.emit()} = {constant_str(var.constant)};\n"]
+        if line_directive := adj.get_line_directive(lines[-1], var.relative_lineno):
+            lines.insert(-1, f"{line_directive}\n")
     # forward pass
     lines += ["//---------\n"]
     lines += ["// forward\n"]
     for f in adj.blocks[0].body_forward:
-        lines += [f + "\n"]
+        if func_type == "kernel" and device == "cuda" and f.lstrip().startswith("return;"):
+            # Use of grid-stride loops in CUDA kernels requires that we convert return; to continue;
+            lines += [f.replace("return;", "continue;") + "\n"]
+        else:
+            lines += [f + "\n"]
-    return "".join([indent_block + l for l in lines])
+    return "".join(l.lstrip() if l.lstrip().startswith("#line") else indent_block + l for l in lines)
 def codegen_func_reverse(adj, func_type="kernel", device="cpu"):
@@ -3547,6 +3593,9 @@ def codegen_func_reverse(adj, func_type="kernel", device="cpu"):
         else:
             lines += [f"const {var.ctype()} {var.emit()} = {constant_str(var.constant)};\n"]
+        if line_directive := adj.get_line_directive(lines[-1], var.relative_lineno):
+            lines.insert(-1, f"{line_directive}\n")
     # dual vars
     lines += ["//---------\n"]
     lines += ["// dual vars\n"]
@@ -3567,6 +3616,9 @@ def codegen_func_reverse(adj, func_type="kernel", device="cpu"):
         else:
             lines += [f"{ctype} {name} = {{}};\n"]
+        if line_directive := adj.get_line_directive(lines[-1], var.relative_lineno):
+            lines.insert(-1, f"{line_directive}\n")
     # forward pass
     lines += ["//---------\n"]
     lines += ["// forward\n"]
@@ -3587,7 +3639,7 @@ def codegen_func_reverse(adj, func_type="kernel", device="cpu"):
     else:
         lines += ["return;\n"]
-    return "".join([indent_block + l for l in lines])
+    return "".join(l.lstrip() if l.lstrip().startswith("#line") else indent_block + l for l in lines)
 def codegen_func(adj, c_func_name: str, device="cpu", options=None):
@@ -3595,11 +3647,11 @@ def codegen_func(adj, c_func_name: str, device="cpu", options=None):
         options = {}
     if adj.return_var is not None and "return" in adj.arg_types:
-        if get_type_origin(adj.arg_types["return"]) is tuple:
-            if len(get_type_args(adj.arg_types["return"])) != len(adj.return_var):
+        if get_origin(adj.arg_types["return"]) is tuple:
+            if len(get_args(adj.arg_types["return"])) != len(adj.return_var):
                 raise WarpCodegenError(
                     f"The function `{adj.fun_name}` has its return type "
-                    f"annotated as a tuple of {len(get_type_args(adj.arg_types['return']))} elements "
+                    f"annotated as a tuple of {len(get_args(adj.arg_types['return']))} elements "
                     f"but the code returns {len(adj.return_var)} values."
                 )
             elif not types_equal(adj.arg_types["return"], tuple(x.type for x in adj.return_var)):
@@ -3608,7 +3660,7 @@ def codegen_func(adj, c_func_name: str, device="cpu", options=None):
                     f"annotated as `{warp.context.type_str(adj.arg_types['return'])}` "
                     f"but the code returns a tuple with types `({', '.join(warp.context.type_str(x.type) for x in adj.return_var)})`."
                 )
-        elif len(adj.return_var) > 1 and get_type_origin(adj.arg_types["return"]) is not tuple:
+        elif len(adj.return_var) > 1 and get_origin(adj.arg_types["return"]) is not tuple:
             raise WarpCodegenError(
                 f"The function `{adj.fun_name}` has its return type "
                 f"annotated as `{warp.context.type_str(adj.arg_types['return'])}` "
@@ -3621,6 +3673,13 @@ def codegen_func(adj, c_func_name: str, device="cpu", options=None):
                 f"but the code returns a value of type `{warp.context.type_str(adj.return_var[0].type)}`."
             )
+    # Build line directive for function definition (subtract 1 to account for 1-indexing of AST line numbers)
+    # This is used as a catch-all C-to-Python source line mapping for any code that does not have
+    # a direct mapping to a Python source line.
+    func_line_directive = ""
+    if line_directive := adj.get_line_directive("", adj.fun_def_lineno - 1):
+        func_line_directive = f"{line_directive}\n"
     # forward header
     if adj.return_var is not None and len(adj.return_var) == 1:
         return_type = adj.return_var[0].ctype()
@@ -3684,6 +3743,7 @@ def codegen_func(adj, c_func_name: str, device="cpu", options=None):
             forward_body=forward_body,
             filename=adj.filename,
             lineno=adj.fun_lineno,
+            line_directive=func_line_directive,
         )
     if not adj.skip_reverse_codegen:
@@ -3702,6 +3762,7 @@ def codegen_func(adj, c_func_name: str, device="cpu", options=None):
             reverse_body=reverse_body,
             filename=adj.filename,
             lineno=adj.fun_lineno,
+            line_directive=func_line_directive,
         )
     return s
@@ -3744,6 +3805,7 @@ def codegen_snippet(adj, name, snippet, adj_snippet, replay_snippet):
         forward_body=snippet,
         filename=adj.filename,
         lineno=adj.fun_lineno,
+        line_directive="",
     )
     if replay_snippet is not None:
@@ -3754,6 +3816,7 @@ def codegen_snippet(adj, name, snippet, adj_snippet, replay_snippet):
             forward_body=replay_snippet,
             filename=adj.filename,
             lineno=adj.fun_lineno,
+            line_directive="",
         )
     if adj_snippet:
@@ -3769,6 +3832,7 @@ def codegen_snippet(adj, name, snippet, adj_snippet, replay_snippet):
         reverse_body=reverse_body,
         filename=adj.filename,
         lineno=adj.fun_lineno,
+        line_directive="",
     )
     return s
@@ -3781,6 +3845,13 @@ def codegen_kernel(kernel, device, options):
     adj = kernel.adj
+    # Build line directive for function definition (subtract 1 to account for 1-indexing of AST line numbers)
+    # This is used as a catch-all C-to-Python source line mapping for any code that does not have
+    # a direct mapping to a Python source line.
+    func_line_directive = ""
+    if line_directive := adj.get_line_directive("", adj.fun_def_lineno - 1):
+        func_line_directive = f"{line_directive}\n"
     if device == "cpu":
         template_forward = cpu_kernel_template_forward
         template_backward = cpu_kernel_template_backward
@@ -3808,6 +3879,7 @@ def codegen_kernel(kernel, device, options):
         {
             "forward_args": indent(forward_args),
             "forward_body": forward_body,
+            "line_directive": func_line_directive,
         }
     )
     template += template_forward