warp-lang 1.1.0-py3-none-manylinux2014_x86_64.whl → 1.2.1-py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic.

Files changed (218)
  1. warp/bin/warp-clang.so +0 -0
  2. warp/bin/warp.so +0 -0
  3. warp/build.py +10 -37
  4. warp/build_dll.py +2 -2
  5. warp/builtins.py +274 -6
  6. warp/codegen.py +51 -4
  7. warp/config.py +2 -2
  8. warp/constants.py +4 -0
  9. warp/context.py +422 -203
  10. warp/examples/benchmarks/benchmark_api.py +0 -2
  11. warp/examples/benchmarks/benchmark_cloth_warp.py +0 -1
  12. warp/examples/benchmarks/benchmark_launches.py +0 -2
  13. warp/examples/core/example_dem.py +0 -2
  14. warp/examples/core/example_fluid.py +0 -2
  15. warp/examples/core/example_graph_capture.py +0 -2
  16. warp/examples/core/example_marching_cubes.py +0 -2
  17. warp/examples/core/example_mesh.py +0 -2
  18. warp/examples/core/example_mesh_intersect.py +0 -2
  19. warp/examples/core/example_nvdb.py +0 -2
  20. warp/examples/core/example_raycast.py +0 -2
  21. warp/examples/core/example_raymarch.py +0 -2
  22. warp/examples/core/example_render_opengl.py +0 -2
  23. warp/examples/core/example_sph.py +0 -2
  24. warp/examples/core/example_torch.py +0 -3
  25. warp/examples/core/example_wave.py +0 -2
  26. warp/examples/fem/example_apic_fluid.py +140 -115
  27. warp/examples/fem/example_burgers.py +262 -0
  28. warp/examples/fem/example_convection_diffusion.py +0 -2
  29. warp/examples/fem/example_convection_diffusion_dg.py +0 -2
  30. warp/examples/fem/example_deformed_geometry.py +0 -2
  31. warp/examples/fem/example_diffusion.py +0 -2
  32. warp/examples/fem/example_diffusion_3d.py +5 -4
  33. warp/examples/fem/example_diffusion_mgpu.py +0 -2
  34. warp/examples/fem/example_mixed_elasticity.py +0 -2
  35. warp/examples/fem/example_navier_stokes.py +0 -2
  36. warp/examples/fem/example_stokes.py +0 -2
  37. warp/examples/fem/example_stokes_transfer.py +0 -2
  38. warp/examples/optim/example_bounce.py +0 -2
  39. warp/examples/optim/example_cloth_throw.py +0 -2
  40. warp/examples/optim/example_diffray.py +0 -2
  41. warp/examples/optim/example_drone.py +0 -2
  42. warp/examples/optim/example_inverse_kinematics.py +0 -2
  43. warp/examples/optim/example_inverse_kinematics_torch.py +0 -2
  44. warp/examples/optim/example_spring_cage.py +0 -2
  45. warp/examples/optim/example_trajectory.py +0 -2
  46. warp/examples/optim/example_walker.py +0 -2
  47. warp/examples/sim/example_cartpole.py +0 -2
  48. warp/examples/sim/example_cloth.py +0 -2
  49. warp/examples/sim/example_granular.py +0 -2
  50. warp/examples/sim/example_granular_collision_sdf.py +0 -2
  51. warp/examples/sim/example_jacobian_ik.py +0 -2
  52. warp/examples/sim/example_particle_chain.py +0 -2
  53. warp/examples/sim/example_quadruped.py +0 -2
  54. warp/examples/sim/example_rigid_chain.py +0 -2
  55. warp/examples/sim/example_rigid_contact.py +0 -2
  56. warp/examples/sim/example_rigid_force.py +0 -2
  57. warp/examples/sim/example_rigid_gyroscopic.py +0 -2
  58. warp/examples/sim/example_rigid_soft_contact.py +0 -2
  59. warp/examples/sim/example_soft_body.py +0 -2
  60. warp/fem/__init__.py +1 -0
  61. warp/fem/cache.py +3 -1
  62. warp/fem/geometry/__init__.py +1 -0
  63. warp/fem/geometry/element.py +4 -0
  64. warp/fem/geometry/grid_3d.py +0 -4
  65. warp/fem/geometry/nanogrid.py +455 -0
  66. warp/fem/integrate.py +63 -9
  67. warp/fem/space/__init__.py +43 -158
  68. warp/fem/space/basis_space.py +34 -0
  69. warp/fem/space/collocated_function_space.py +1 -1
  70. warp/fem/space/grid_2d_function_space.py +13 -132
  71. warp/fem/space/grid_3d_function_space.py +16 -154
  72. warp/fem/space/hexmesh_function_space.py +37 -134
  73. warp/fem/space/nanogrid_function_space.py +202 -0
  74. warp/fem/space/quadmesh_2d_function_space.py +12 -119
  75. warp/fem/space/restriction.py +4 -1
  76. warp/fem/space/shape/__init__.py +77 -0
  77. warp/fem/space/shape/cube_shape_function.py +5 -15
  78. warp/fem/space/tetmesh_function_space.py +6 -76
  79. warp/fem/space/trimesh_2d_function_space.py +6 -76
  80. warp/native/array.h +12 -3
  81. warp/native/builtin.h +48 -5
  82. warp/native/bvh.cpp +14 -10
  83. warp/native/bvh.cu +23 -15
  84. warp/native/bvh.h +1 -0
  85. warp/native/clang/clang.cpp +2 -1
  86. warp/native/crt.cpp +11 -1
  87. warp/native/crt.h +18 -1
  88. warp/native/exports.h +187 -0
  89. warp/native/mat.h +47 -0
  90. warp/native/mesh.cpp +1 -1
  91. warp/native/mesh.cu +1 -2
  92. warp/native/nanovdb/GridHandle.h +366 -0
  93. warp/native/nanovdb/HostBuffer.h +590 -0
  94. warp/native/nanovdb/NanoVDB.h +3999 -2157
  95. warp/native/nanovdb/PNanoVDB.h +936 -99
  96. warp/native/quat.h +28 -1
  97. warp/native/rand.h +5 -1
  98. warp/native/vec.h +45 -1
  99. warp/native/volume.cpp +335 -103
  100. warp/native/volume.cu +39 -13
  101. warp/native/volume.h +725 -303
  102. warp/native/volume_builder.cu +381 -360
  103. warp/native/volume_builder.h +16 -1
  104. warp/native/volume_impl.h +61 -0
  105. warp/native/warp.cu +8 -2
  106. warp/native/warp.h +15 -7
  107. warp/render/render_opengl.py +191 -52
  108. warp/sim/integrator_featherstone.py +10 -3
  109. warp/sim/integrator_xpbd.py +16 -22
  110. warp/sparse.py +89 -27
  111. warp/stubs.py +83 -0
  112. warp/tests/assets/test_index_grid.nvdb +0 -0
  113. warp/tests/aux_test_dependent.py +0 -2
  114. warp/tests/aux_test_grad_customs.py +0 -2
  115. warp/tests/aux_test_reference.py +0 -2
  116. warp/tests/aux_test_reference_reference.py +0 -2
  117. warp/tests/aux_test_square.py +0 -2
  118. warp/tests/disabled_kinematics.py +0 -2
  119. warp/tests/test_adam.py +0 -2
  120. warp/tests/test_arithmetic.py +0 -36
  121. warp/tests/test_array.py +9 -11
  122. warp/tests/test_array_reduce.py +0 -2
  123. warp/tests/test_async.py +0 -2
  124. warp/tests/test_atomic.py +0 -2
  125. warp/tests/test_bool.py +58 -50
  126. warp/tests/test_builtins_resolution.py +0 -2
  127. warp/tests/test_bvh.py +0 -2
  128. warp/tests/test_closest_point_edge_edge.py +0 -1
  129. warp/tests/test_codegen.py +0 -4
  130. warp/tests/test_compile_consts.py +130 -10
  131. warp/tests/test_conditional.py +0 -2
  132. warp/tests/test_copy.py +0 -2
  133. warp/tests/test_ctypes.py +6 -8
  134. warp/tests/test_dense.py +0 -2
  135. warp/tests/test_devices.py +0 -2
  136. warp/tests/test_dlpack.py +9 -11
  137. warp/tests/test_examples.py +42 -39
  138. warp/tests/test_fabricarray.py +0 -3
  139. warp/tests/test_fast_math.py +0 -2
  140. warp/tests/test_fem.py +75 -54
  141. warp/tests/test_fp16.py +0 -2
  142. warp/tests/test_func.py +0 -2
  143. warp/tests/test_generics.py +27 -2
  144. warp/tests/test_grad.py +147 -8
  145. warp/tests/test_grad_customs.py +0 -2
  146. warp/tests/test_hash_grid.py +1 -3
  147. warp/tests/test_import.py +0 -2
  148. warp/tests/test_indexedarray.py +0 -2
  149. warp/tests/test_intersect.py +0 -2
  150. warp/tests/test_jax.py +0 -2
  151. warp/tests/test_large.py +11 -9
  152. warp/tests/test_launch.py +0 -2
  153. warp/tests/test_lerp.py +10 -54
  154. warp/tests/test_linear_solvers.py +3 -5
  155. warp/tests/test_lvalue.py +0 -2
  156. warp/tests/test_marching_cubes.py +0 -2
  157. warp/tests/test_mat.py +0 -2
  158. warp/tests/test_mat_lite.py +0 -2
  159. warp/tests/test_mat_scalar_ops.py +0 -2
  160. warp/tests/test_math.py +0 -2
  161. warp/tests/test_matmul.py +35 -37
  162. warp/tests/test_matmul_lite.py +29 -31
  163. warp/tests/test_mempool.py +0 -2
  164. warp/tests/test_mesh.py +0 -3
  165. warp/tests/test_mesh_query_aabb.py +0 -2
  166. warp/tests/test_mesh_query_point.py +0 -2
  167. warp/tests/test_mesh_query_ray.py +0 -2
  168. warp/tests/test_mlp.py +0 -2
  169. warp/tests/test_model.py +0 -2
  170. warp/tests/test_module_hashing.py +111 -0
  171. warp/tests/test_modules_lite.py +0 -3
  172. warp/tests/test_multigpu.py +0 -2
  173. warp/tests/test_noise.py +0 -4
  174. warp/tests/test_operators.py +0 -2
  175. warp/tests/test_options.py +0 -2
  176. warp/tests/test_peer.py +0 -2
  177. warp/tests/test_pinned.py +0 -2
  178. warp/tests/test_print.py +0 -2
  179. warp/tests/test_quat.py +0 -2
  180. warp/tests/test_rand.py +41 -5
  181. warp/tests/test_reload.py +0 -10
  182. warp/tests/test_rounding.py +0 -2
  183. warp/tests/test_runlength_encode.py +0 -2
  184. warp/tests/test_sim_grad.py +0 -2
  185. warp/tests/test_sim_kinematics.py +0 -2
  186. warp/tests/test_smoothstep.py +0 -2
  187. warp/tests/test_snippet.py +0 -2
  188. warp/tests/test_sparse.py +0 -2
  189. warp/tests/test_spatial.py +0 -2
  190. warp/tests/test_special_values.py +362 -0
  191. warp/tests/test_streams.py +0 -2
  192. warp/tests/test_struct.py +0 -2
  193. warp/tests/test_tape.py +0 -2
  194. warp/tests/test_torch.py +0 -2
  195. warp/tests/test_transient_module.py +0 -2
  196. warp/tests/test_types.py +0 -2
  197. warp/tests/test_utils.py +0 -2
  198. warp/tests/test_vec.py +0 -2
  199. warp/tests/test_vec_lite.py +0 -2
  200. warp/tests/test_vec_scalar_ops.py +0 -2
  201. warp/tests/test_verify_fp.py +0 -2
  202. warp/tests/test_volume.py +237 -13
  203. warp/tests/test_volume_write.py +86 -3
  204. warp/tests/unittest_serial.py +10 -9
  205. warp/tests/unittest_suites.py +6 -2
  206. warp/tests/unittest_utils.py +2 -171
  207. warp/tests/unused_test_misc.py +0 -2
  208. warp/tests/walkthrough_debug.py +1 -1
  209. warp/thirdparty/unittest_parallel.py +37 -40
  210. warp/types.py +526 -85
  211. {warp_lang-1.1.0.dist-info → warp_lang-1.2.1.dist-info}/METADATA +61 -31
  212. warp_lang-1.2.1.dist-info/RECORD +359 -0
  213. warp/examples/fem/example_convection_diffusion_dg0.py +0 -204
  214. warp/native/nanovdb/PNanoVDBWrite.h +0 -295
  215. warp_lang-1.1.0.dist-info/RECORD +0 -352
  216. {warp_lang-1.1.0.dist-info → warp_lang-1.2.1.dist-info}/LICENSE.md +0 -0
  217. {warp_lang-1.1.0.dist-info → warp_lang-1.2.1.dist-info}/WHEEL +0 -0
  218. {warp_lang-1.1.0.dist-info → warp_lang-1.2.1.dist-info}/top_level.txt +0 -0
warp/context.py CHANGED
@@ -6,6 +6,7 @@
  # license agreement from NVIDIA CORPORATION is strictly prohibited.

  import ast
+ import builtins
  import ctypes
  import functools
  import hashlib
@@ -18,7 +19,8 @@ import platform
  import sys
  import types
  from copy import copy as shallowcopy
- from types import ModuleType
+ from pathlib import Path
+ from struct import pack as struct_pack
  from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union

  import numpy as np
@@ -345,6 +347,8 @@ class Function:
  def call_builtin(func: Function, *params) -> Tuple[bool, Any]:
  uses_non_warp_array_type = False

+ warp.context.init()
+
  # Retrieve the built-in function from Warp's dll.
  c_func = getattr(warp.context.runtime.core, func.mangled_name)

@@ -1168,7 +1172,7 @@ def get_module(name):
  # clear out old kernels, funcs, struct definitions
  old_module.kernels = {}
  old_module.functions = {}
- old_module.constants = []
+ old_module.constants = {}
  old_module.structs = {}
  old_module.loader = parent_loader

@@ -1315,7 +1319,7 @@ class Module:

  self.kernels = {}
  self.functions = {}
- self.constants = []
+ self.constants = {} # Any constants referenced in this module including those defined in other modules
  self.structs = {}

  self.cpu_module = None
@@ -1442,7 +1446,13 @@ class Module:
  if isinstance(arg.type, warp.codegen.Struct) and arg.type.module is not None:
  add_ref(arg.type.module)

- def hash_module(self):
+ def hash_module(self, recompute_content_hash=False):
+ """Recursively compute and return a hash for the module.
+
+ If ``recompute_content_hash`` is False, each module's previously
+ computed ``content_hash`` will be used.
+ """
+
  def get_annotations(obj: Any) -> Mapping[str, Any]:
  """Alternative to `inspect.get_annotations()` for Python 3.9 and older."""
  # See https://docs.python.org/3/howto/annotations.html#accessing-the-annotations-dict-of-an-object-in-python-3-9-and-older
@@ -1461,10 +1471,13 @@ class Module:
  # The visited set tracks modules already visited to avoid circular references.

  # check if we need to update the content hash
- if not module.content_hash:
+ if not module.content_hash or recompute_content_hash:
  # recompute content hash
  ch = hashlib.sha256()

+ # Start with an empty constants dictionary in case any have been removed
+ module.constants = {}
+
  # struct source
  for struct in module.structs.values():
  s = ",".join(
@@ -1474,28 +1487,34 @@ class Module:
  ch.update(bytes(s, "utf-8"))

  # functions source
- for func in module.functions.values():
- s = func.adj.source
- ch.update(bytes(s, "utf-8"))
-
- if func.custom_grad_func:
- s = func.custom_grad_func.adj.source
- ch.update(bytes(s, "utf-8"))
- if func.custom_replay_func:
- s = func.custom_replay_func.adj.source
- if func.replay_snippet:
- s = func.replay_snippet
- if func.native_snippet:
- s = func.native_snippet
- ch.update(bytes(s, "utf-8"))
- if func.adj_native_snippet:
- s = func.adj_native_snippet
+ for function in module.functions.values():
+ # include all concrete and generic overloads
+ overloads = itertools.chain(function.user_overloads.items(), function.user_templates.items())
+ for sig, func in overloads:
+ # signature
+ ch.update(bytes(sig, "utf-8"))
+
+ # source
+ s = func.adj.source
  ch.update(bytes(s, "utf-8"))

- # cache func arg types
- for arg, arg_type in func.adj.arg_types.items():
- s = f"{arg}: {get_type_name(arg_type)}"
- ch.update(bytes(s, "utf-8"))
+ if func.custom_grad_func:
+ s = func.custom_grad_func.adj.source
+ ch.update(bytes(s, "utf-8"))
+ if func.custom_replay_func:
+ s = func.custom_replay_func.adj.source
+ if func.replay_snippet:
+ s = func.replay_snippet
+ if func.native_snippet:
+ s = func.native_snippet
+ ch.update(bytes(s, "utf-8"))
+ if func.adj_native_snippet:
+ s = func.adj_native_snippet
+ ch.update(bytes(s, "utf-8"))
+
+ # Populate constants referenced in this function
+ if func.adj:
+ module.constants.update(func.adj.get_constant_references())

  # kernel source
  for kernel in module.kernels.values():
@@ -1511,6 +1530,34 @@ class Module:
  for sig in sorted(kernel.overloads.keys()):
  ch.update(bytes(sig, "utf-8"))

+ # Populate constants referenced in this kernel
+ module.constants.update(kernel.adj.get_constant_references())
+
+ # constants referenced in this module
+ for constant_name, constant_value in module.constants.items():
+ ch.update(bytes(constant_name, "utf-8"))
+
+ # hash the constant value
+ if isinstance(constant_value, builtins.bool):
+ # This needs to come before the check for `int` since all boolean
+ # values are also instances of `int`.
+ ch.update(struct_pack("?", constant_value))
+ elif isinstance(constant_value, int):
+ ch.update(struct_pack("<q", constant_value))
+ elif isinstance(constant_value, float):
+ ch.update(struct_pack("<d", constant_value))
+ elif isinstance(constant_value, warp.types.float16):
+ # float16 is a special case
+ p = ctypes.pointer(ctypes.c_float(constant_value.value))
+ ch.update(p.contents)
+ elif isinstance(constant_value, tuple(warp.types.scalar_types)):
+ p = ctypes.pointer(constant_value._type_(constant_value.value))
+ ch.update(p.contents)
+ elif isinstance(constant_value, ctypes.Array):
+ ch.update(bytes(constant_value))
+ else:
+ raise RuntimeError(f"Invalid constant type: {type(constant_value)}")
+
  module.content_hash = ch.digest()

  h = hashlib.sha256()
@@ -1529,10 +1576,6 @@ class Module:

  h.update(bytes(warp.config.mode, "utf-8"))

- # compile-time constants (global)
- if warp.types._constant_hash:
- h.update(warp.types._constant_hash.digest())
-
  # recurse on references
  visited.add(module)

@@ -1546,7 +1589,7 @@

  return hash_recursive(self, visited=set())

- def load(self, device):
+ def load(self, device) -> bool:
  from warp.utils import ScopedTimer

  device = get_device(device)
@@ -1570,68 +1613,19 @@ class Module:
  if not warp.is_cuda_available():
  raise RuntimeError("Failed to build CUDA module because CUDA is not available")

- with ScopedTimer(f"Module {self.name} load on device '{device}'", active=not warp.config.quiet):
- build_path = warp.build.kernel_bin_dir
- gen_path = warp.build.kernel_gen_dir
-
- if not os.path.exists(build_path):
- os.makedirs(build_path)
- if not os.path.exists(gen_path):
- os.makedirs(gen_path)
+ module_name = "wp_" + self.name
+ module_hash = self.hash_module()

- module_name = "wp_" + self.name
- module_path = os.path.join(build_path, module_name)
- module_hash = self.hash_module()
-
- builder = ModuleBuilder(self, self.options)
+ # use a unique module path using the module short hash
+ module_dir = os.path.join(warp.config.kernel_cache_dir, f"{module_name}_{module_hash.hex()[:7]}")

+ with ScopedTimer(
+ f"Module {self.name} {module_hash.hex()[:7]} load on device '{device}'", active=not warp.config.quiet
+ ):
+ # -----------------------------------------------------------
+ # determine output paths
  if device.is_cpu:
- obj_path = os.path.join(build_path, module_name)
- obj_path = obj_path + ".o"
- cpu_hash_path = module_path + ".cpu.hash"
-
- # check cache
- if warp.config.cache_kernels and os.path.isfile(cpu_hash_path) and os.path.isfile(obj_path):
- with open(cpu_hash_path, "rb") as f:
- cache_hash = f.read()
-
- if cache_hash == module_hash:
- runtime.llvm.load_obj(obj_path.encode("utf-8"), module_name.encode("utf-8"))
- self.cpu_module = module_name
- return True
-
- # build
- try:
- cpp_path = os.path.join(gen_path, module_name + ".cpp")
-
- # write cpp sources
- cpp_source = builder.codegen("cpu")
-
- cpp_file = open(cpp_path, "w")
- cpp_file.write(cpp_source)
- cpp_file.close()
-
- # build object code
- with ScopedTimer("Compile x86", active=warp.config.verbose):
- warp.build.build_cpu(
- obj_path,
- cpp_path,
- mode=self.options["mode"],
- fast_math=self.options["fast_math"],
- verify_fp=warp.config.verify_fp,
- )
-
- # update cpu hash
- with open(cpu_hash_path, "wb") as f:
- f.write(module_hash)
-
- # load the object code
- runtime.llvm.load_obj(obj_path.encode("utf-8"), module_name.encode("utf-8"))
- self.cpu_module = module_name
-
- except Exception as e:
- self.cpu_build_failed = True
- raise (e)
+ output_name = "module_codegen.o"

  elif device.is_cuda:
  # determine whether to use PTX or CUBIN
@@ -1650,62 +1644,138 @@ class Module:

  if use_ptx:
  output_arch = min(device.arch, warp.config.ptx_target_arch)
- output_path = module_path + f".sm{output_arch}.ptx"
+ output_name = f"module_codegen.sm{output_arch}.ptx"
  else:
  output_arch = device.arch
- output_path = module_path + f".sm{output_arch}.cubin"
+ output_name = f"module_codegen.sm{output_arch}.cubin"
+
+ # final object binary path
+ binary_path = os.path.join(module_dir, output_name)
+
+ # -----------------------------------------------------------
+ # check cache and build if necessary
+
+ build_dir = None
+
+ if not os.path.exists(binary_path) or not warp.config.cache_kernels:
+ builder = ModuleBuilder(self, self.options)
+
+ # create a temporary (process unique) dir for build outputs before moving to the binary dir
+ build_dir = os.path.join(
+ warp.config.kernel_cache_dir, f"{module_name}_{module_hash.hex()[:7]}_p{os.getpid()}"
+ )
+
+ # dir may exist from previous attempts / runs / archs
+ Path(build_dir).mkdir(parents=True, exist_ok=True)
+
+ # build CPU
+ if device.is_cpu:
+ # build
+ try:
+ source_code_path = os.path.join(build_dir, "module_codegen.cpp")
+
+ # write cpp sources
+ cpp_source = builder.codegen("cpu")
+
+ with open(source_code_path, "w") as cpp_file:
+ cpp_file.write(cpp_source)

- cuda_hash_path = module_path + f".sm{output_arch}.hash"
+ output_path = os.path.join(build_dir, output_name)

- # check cache
- if warp.config.cache_kernels and os.path.isfile(cuda_hash_path) and os.path.isfile(output_path):
- with open(cuda_hash_path, "rb") as f:
- cache_hash = f.read()
+ # build object code
+ with ScopedTimer("Compile x86", active=warp.config.verbose):
+ warp.build.build_cpu(
+ output_path,
+ source_code_path,
+ mode=self.options["mode"],
+ fast_math=self.options["fast_math"],
+ verify_fp=warp.config.verify_fp,
+ )
+
+ except Exception as e:
+ self.cpu_build_failed = True
+ raise (e)
+
+ elif device.is_cuda:
+ # build
+ try:
+ source_code_path = os.path.join(build_dir, "module_codegen.cu")
+
+ # write cuda sources
+ cu_source = builder.codegen("cuda")
+
+ with open(source_code_path, "w") as cu_file:
+ cu_file.write(cu_source)
+
+ output_path = os.path.join(build_dir, output_name)
+
+ # generate PTX or CUBIN
+ with ScopedTimer("Compile CUDA", active=warp.config.verbose):
+ warp.build.build_cuda(
+ source_code_path,
+ output_arch,
+ output_path,
+ config=self.options["mode"],
+ fast_math=self.options["fast_math"],
+ verify_fp=warp.config.verify_fp,
+ )
+
+ except Exception as e:
+ self.cuda_build_failed = True
+ raise (e)

- if cache_hash == module_hash:
- cuda_module = warp.build.load_cuda(output_path, device)
- if cuda_module is not None:
- self.cuda_modules[device.context] = cuda_module
- return True
+ # -----------------------------------------------------------
+ # update cache

- # build
  try:
- cu_path = os.path.join(gen_path, module_name + ".cu")
-
- # write cuda sources
- cu_source = builder.codegen("cuda")
-
- cu_file = open(cu_path, "w")
- cu_file.write(cu_source)
- cu_file.close()
-
- # generate PTX or CUBIN
- with ScopedTimer("Compile CUDA", active=warp.config.verbose):
- warp.build.build_cuda(
- cu_path,
- output_arch,
- output_path,
- config=self.options["mode"],
- fast_math=self.options["fast_math"],
- verify_fp=warp.config.verify_fp,
- )
-
- # update cuda hash
- with open(cuda_hash_path, "wb") as f:
- f.write(module_hash)
-
- # load the module
- cuda_module = warp.build.load_cuda(output_path, device)
- if cuda_module is not None:
- self.cuda_modules[device.context] = cuda_module
- else:
- raise Exception(f"Failed to load CUDA module '{self.name}'")
+ # Copy process-specific build directory to a process-independent location
+ os.rename(build_dir, module_dir)
+ except (OSError, FileExistsError):
+ # another process likely updated the module dir first
+ pass

- except Exception as e:
- self.cuda_build_failed = True
- raise (e)
+ if os.path.exists(module_dir):
+ if not os.path.exists(binary_path):
+ # copy our output file to the destination module
+ # this is necessary in case different processes
+ # have different GPU architectures / devices
+ try:
+ os.rename(output_path, binary_path)
+ except (OSError, FileExistsError):
+ # another process likely updated the module dir first
+ pass

- return True
+ try:
+ final_source_path = os.path.join(module_dir, os.path.basename(source_code_path))
+ if not os.path.exists(final_source_path):
+ os.rename(source_code_path, final_source_path)
+ except (OSError, FileExistsError):
+ # another process likely updated the module dir first
+ pass
+ except Exception as e:
+ # We don't need source_code_path to be copied successfully to proceed, so warn and keep running
+ warp.utils.warn(f"Exception when renaming {source_code_path}: {e}")
+
+ # -----------------------------------------------------------
+ # Load CPU or CUDA binary
+ if device.is_cpu:
+ runtime.llvm.load_obj(binary_path.encode("utf-8"), module_name.encode("utf-8"))
+ self.cpu_module = module_name
+
+ elif device.is_cuda:
+ cuda_module = warp.build.load_cuda(binary_path, device)
+ if cuda_module is not None:
+ self.cuda_modules[device.context] = cuda_module
+ else:
+ raise Exception(f"Failed to load CUDA module '{self.name}'")
+
+ if build_dir:
+ import shutil
+
+ # clean up build_dir used for this process regardless
+ shutil.rmtree(build_dir, ignore_errors=True)
+
+ return True

  def unload(self):
  if self.cpu_module:
@@ -2578,22 +2648,36 @@ class Runtime:
  ]
  self.core.cutlass_gemm.restype = ctypes.c_bool

- self.core.volume_create_host.argtypes = [ctypes.c_void_p, ctypes.c_uint64]
+ self.core.volume_create_host.argtypes = [ctypes.c_void_p, ctypes.c_uint64, ctypes.c_bool, ctypes.c_bool]
  self.core.volume_create_host.restype = ctypes.c_uint64
- self.core.volume_get_buffer_info_host.argtypes = [
+ self.core.volume_get_tiles_host.argtypes = [
  ctypes.c_uint64,
- ctypes.POINTER(ctypes.c_void_p),
- ctypes.POINTER(ctypes.c_uint64),
+ ctypes.c_void_p,
  ]
- self.core.volume_get_tiles_host.argtypes = [
+ self.core.volume_get_voxels_host.argtypes = [
  ctypes.c_uint64,
- ctypes.POINTER(ctypes.c_void_p),
- ctypes.POINTER(ctypes.c_uint64),
+ ctypes.c_void_p,
  ]
  self.core.volume_destroy_host.argtypes = [ctypes.c_uint64]

- self.core.volume_create_device.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint64]
+ self.core.volume_create_device.argtypes = [
+ ctypes.c_void_p,
+ ctypes.c_void_p,
+ ctypes.c_uint64,
+ ctypes.c_bool,
+ ctypes.c_bool,
+ ]
  self.core.volume_create_device.restype = ctypes.c_uint64
+ self.core.volume_get_tiles_device.argtypes = [
+ ctypes.c_uint64,
+ ctypes.c_void_p,
+ ]
+ self.core.volume_get_voxels_device.argtypes = [
+ ctypes.c_uint64,
+ ctypes.c_void_p,
+ ]
+ self.core.volume_destroy_device.argtypes = [ctypes.c_uint64]
+
  self.core.volume_f_from_tiles_device.argtypes = [
  ctypes.c_void_p,
  ctypes.c_void_p,
@@ -2632,24 +2716,68 @@ class Runtime:
  ctypes.c_bool,
  ]
  self.core.volume_i_from_tiles_device.restype = ctypes.c_uint64
- self.core.volume_get_buffer_info_device.argtypes = [
- ctypes.c_uint64,
- ctypes.POINTER(ctypes.c_void_p),
- ctypes.POINTER(ctypes.c_uint64),
+ self.core.volume_index_from_tiles_device.argtypes = [
+ ctypes.c_void_p,
+ ctypes.c_void_p,
+ ctypes.c_int,
+ ctypes.c_float,
+ ctypes.c_float,
+ ctypes.c_float,
+ ctypes.c_float,
+ ctypes.c_bool,
  ]
- self.core.volume_get_tiles_device.argtypes = [
+ self.core.volume_index_from_tiles_device.restype = ctypes.c_uint64
+ self.core.volume_from_active_voxels_device.argtypes = [
+ ctypes.c_void_p,
+ ctypes.c_void_p,
+ ctypes.c_int,
+ ctypes.c_float,
+ ctypes.c_float,
+ ctypes.c_float,
+ ctypes.c_float,
+ ctypes.c_bool,
+ ]
+ self.core.volume_from_active_voxels_device.restype = ctypes.c_uint64
+
+ self.core.volume_get_buffer_info.argtypes = [
  ctypes.c_uint64,
  ctypes.POINTER(ctypes.c_void_p),
  ctypes.POINTER(ctypes.c_uint64),
  ]
- self.core.volume_destroy_device.argtypes = [ctypes.c_uint64]
-
  self.core.volume_get_voxel_size.argtypes = [
  ctypes.c_uint64,
  ctypes.POINTER(ctypes.c_float),
  ctypes.POINTER(ctypes.c_float),
  ctypes.POINTER(ctypes.c_float),
  ]
+ self.core.volume_get_tile_and_voxel_count.argtypes = [
+ ctypes.c_uint64,
+ ctypes.POINTER(ctypes.c_uint32),
+ ctypes.POINTER(ctypes.c_uint64),
+ ]
+ self.core.volume_get_grid_info.argtypes = [
+ ctypes.c_uint64,
+ ctypes.POINTER(ctypes.c_uint64),
+ ctypes.POINTER(ctypes.c_uint32),
+ ctypes.POINTER(ctypes.c_uint32),
+ ctypes.c_float * 3,
+ ctypes.c_float * 9,
+ ctypes.c_char * 16,
+ ]
+ self.core.volume_get_grid_info.restype = ctypes.c_char_p
+ self.core.volume_get_blind_data_count.argtypes = [
+ ctypes.c_uint64,
+ ]
+ self.core.volume_get_blind_data_count.restype = ctypes.c_uint64
+ self.core.volume_get_blind_data_info.argtypes = [
+ ctypes.c_uint64,
+ ctypes.c_uint32,
+ ctypes.POINTER(ctypes.c_void_p),
+ ctypes.POINTER(ctypes.c_uint64),
+ ctypes.POINTER(ctypes.c_uint32),
+ ctypes.c_char * 16,
+ ]
+ self.core.volume_get_blind_data_info.restype = ctypes.c_char_p

  bsr_matrix_from_triplets_argtypes = [
  ctypes.c_int,
@@ -3194,12 +3322,10 @@ class Runtime:
  raise RuntimeError(f"CUDA error detected: {err}")


- def assert_initialized():
- assert runtime is not None, "Warp not initialized, call wp.init() before use"
-
-
  # global entry points
  def is_cpu_available():
+ init()
+
  return runtime.llvm


@@ -3221,7 +3347,7 @@ def is_cuda_driver_initialized() -> bool:

  This can be helpful in cases in which ``cuInit()`` was called before a fork.
  """
- assert_initialized()
+ init()

  return runtime.core.cuda_driver_is_initialized()

@@ -3229,7 +3355,7 @@ def is_cuda_driver_initialized() -> bool:
  def get_devices() -> List[Device]:
  """Returns a list of devices supported in this environment."""

- assert_initialized()
+ init()

  devices = []
  if is_cpu_available():
@@ -3242,7 +3368,7 @@ def get_devices() -> List[Device]:
  def get_cuda_device_count() -> int:
  """Returns the number of CUDA devices supported in this environment."""

- assert_initialized()
+ init()

  return len(runtime.cuda_devices)

@@ -3250,7 +3376,7 @@ def get_cuda_device_count() -> int:
  def get_cuda_device(ordinal: Union[int, None] = None) -> Device:
  """Returns the CUDA device with the given ordinal or the current CUDA device if ordinal is None."""

- assert_initialized()
+ init()

  if ordinal is None:
  return runtime.get_current_cuda_device()
@@ -3261,7 +3387,7 @@ def get_cuda_device(ordinal: Union[int, None] = None) -> Device:
  def get_cuda_devices() -> List[Device]:
  """Returns a list of CUDA devices supported in this environment."""

- assert_initialized()
+ init()

  return runtime.cuda_devices

@@ -3269,7 +3395,7 @@ def get_cuda_devices() -> List[Device]:
  def get_preferred_device() -> Device:
  """Returns the preferred compute device, CUDA if available and CPU otherwise."""

- assert_initialized()
+ init()

  if is_cuda_available():
  return runtime.cuda_devices[0]
@@ -3282,7 +3408,7 @@ def get_preferred_device() -> Device:
  def get_device(ident: Devicelike = None) -> Device:
  """Returns the device identified by the argument."""

- assert_initialized()
+ init()

  return runtime.get_device(ident)

@@ -3290,7 +3416,7 @@ def get_device(ident: Devicelike = None) -> Device:
  def set_device(ident: Devicelike):
  """Sets the target device identified by the argument."""

- assert_initialized()
+ init()

  device = runtime.get_device(ident)
  runtime.set_default_device(device)
@@ -3311,7 +3437,7 @@ def map_cuda_device(alias: str, context: ctypes.c_void_p = None) -> Device:
  The associated wp.Device.
  """

- assert_initialized()
+ init()

  return runtime.map_cuda_device(alias, context)

@@ -3319,7 +3445,7 @@ def map_cuda_device(alias: str, context: ctypes.c_void_p = None) -> Device:
  def unmap_cuda_device(alias: str):
  """Remove a CUDA device with the given alias."""

- assert_initialized()
+ init()

  runtime.unmap_cuda_device(alias)

@@ -3327,7 +3453,7 @@ def unmap_cuda_device(alias: str):
  def is_mempool_supported(device: Devicelike):
  """Check if CUDA memory pool allocators are available on the device."""

- assert_initialized()
+ init()

  device = runtime.get_device(device)

@@ -3337,7 +3463,7 @@ def is_mempool_supported(device: Devicelike):
  def is_mempool_enabled(device: Devicelike):
  """Check if CUDA memory pool allocators are enabled on the device."""

- assert_initialized()
+ init()

  device = runtime.get_device(device)

@@ -3357,7 +3483,7 @@ def set_mempool_enabled(device: Devicelike, enable: bool):
  prior to graph capture.
  """

- assert_initialized()
+ init()

  device = runtime.get_device(device)

@@ -3387,7 +3513,7 @@ def set_mempool_release_threshold(device: Devicelike, threshold: Union[int, floa
  For example, 1024**3 means one GiB of memory.
  """

- assert_initialized()
+ init()

  device = runtime.get_device(device)

@@ -3409,7 +3535,7 @@ def set_mempool_release_threshold(device: Devicelike, threshold: Union[int, floa
  def get_mempool_release_threshold(device: Devicelike):
  """Get the CUDA memory pool release threshold on the device."""

- assert_initialized()
+ init()

  device = runtime.get_device(device)

@@ -3432,7 +3558,7 @@ def is_peer_access_supported(target_device: Devicelike, peer_device: Devicelike)
  A Boolean value indicating if this peer access is supported by the system.
  """

- assert_initialized()
+ init()

  target_device = runtime.get_device(target_device)
  peer_device = runtime.get_device(peer_device)
@@ -3453,7 +3579,7 @@ def is_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike):
  A Boolean value indicating if this peer access is currently enabled.
  """

- assert_initialized()
+ init()

  target_device = runtime.get_device(target_device)
  peer_device = runtime.get_device(peer_device)
@@ -3474,7 +3600,7 @@ def set_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike,
  CUDA pooled allocators, use `set_mempool_access_enabled()`.
  """

- assert_initialized()
+ init()

  target_device = runtime.get_device(target_device)
  peer_device = runtime.get_device(peer_device)
@@ -3505,7 +3631,10 @@ def is_mempool_access_supported(target_device: Devicelike, peer_device: Deviceli
  A Boolean value indicating if this memory pool access is supported by the system.
  """

- assert_initialized()
+ init()
+
+ target_device = runtime.get_device(target_device)
+ peer_device = runtime.get_device(peer_device)

  return target_device.is_mempool_supported and is_peer_access_supported(target_device, peer_device)

@@ -3520,7 +3649,7 @@ def is_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelike
  A Boolean value indicating if this peer access is currently enabled.
  """

- assert_initialized()
+ init()

  target_device = runtime.get_device(target_device)
  peer_device = runtime.get_device(peer_device)
@@ -3538,7 +3667,7 @@ def set_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelik
  default CUDA allocators, use `set_peer_access_enabled()`.
  """

- assert_initialized()
+ init()

  target_device = runtime.get_device(target_device)
  peer_device = runtime.get_device(peer_device)
@@ -3640,34 +3769,87 @@ def wait_stream(stream: Stream, event: Event = None):

  class RegisteredGLBuffer:
  """
- Helper object to register a GL buffer with CUDA so that it can be mapped to a Warp array.
+ Helper class to register a GL buffer with CUDA so that it can be mapped to a Warp array.
+
+ Example usage::
+
+ import warp as wp
+ import numpy as np
+ from pyglet.gl import *
+
+ wp.init()
+
+ # create a GL buffer
+ gl_buffer_id = GLuint()
+ glGenBuffers(1, gl_buffer_id)
+
+ # copy some data to the GL buffer
+ glBindBuffer(GL_ARRAY_BUFFER, gl_buffer_id)
+ gl_data = np.arange(1024, dtype=np.float32)
+ glBufferData(GL_ARRAY_BUFFER, gl_data.nbytes, gl_data.ctypes.data, GL_DYNAMIC_DRAW)
+ glBindBuffer(GL_ARRAY_BUFFER, 0)
+
+ # register the GL buffer with CUDA
+ cuda_gl_buffer = wp.RegisteredGLBuffer(gl_buffer_id)
+
+ # map the GL buffer to a Warp array
+ arr = cuda_gl_buffer.map(dtype=wp.float32, shape=(1024,))
+ # launch a Warp kernel to manipulate or read the array
+ wp.launch(my_kernel, dim=1024, inputs=[arr])
+ # unmap the GL buffer
+ cuda_gl_buffer.unmap()
  """

- # Specifies no hints about how this resource will be used.
- # It is therefore assumed that this resource will be
- # read from and written to by CUDA. This is the default value.
  NONE = 0x00
+ """
+ Flag that specifies no hints about how this resource will be used.
+ It is therefore assumed that this resource will be
+ read from and written to by CUDA. This is the default value.
+ """

- # Specifies that CUDA will not write to this resource.
  READ_ONLY = 0x01
+ """
+ Flag that specifies that CUDA will not write to this resource.
+ """

- # Specifies that CUDA will not read from this resource and will write over the
- # entire contents of the resource, so none of the data previously
- # stored in the resource will be preserved.
  WRITE_DISCARD = 0x02
+ """
+ Flag that specifies that CUDA will not read from this resource and will write over the
+ entire contents of the resource, so none of the data previously
+ stored in the resource will be preserved.
+ """

- def __init__(self, gl_buffer_id: int, device: Devicelike = None, flags: int = NONE):
- """Create a new RegisteredGLBuffer object.
+ __fallback_warning_shown = False

+ def __init__(self, gl_buffer_id: int, device: Devicelike = None, flags: int = NONE, fallback_to_copy: bool = True):
+ """
  Args:
  gl_buffer_id: The OpenGL buffer id (GLuint).
  device: The device to register the buffer with. If None, the current device will be used.
- flags: A combination of the flags constants.
+ flags: A combination of the flags constants :attr:`NONE`, :attr:`READ_ONLY`, and :attr:`WRITE_DISCARD`.
+ fallback_to_copy: If True and CUDA/OpenGL interop is not available, fall back to copy operations between the Warp array and the OpenGL buffer. Otherwise, a ``RuntimeError`` will be raised.
+
+ Note:
+
+ The ``fallback_to_copy`` option (to use copy operations if CUDA graphics interop functionality is not available) requires pyglet version 2.0 or later. Install via ``pip install pyglet==2.*``.
  """
  self.gl_buffer_id = gl_buffer_id
  self.device = get_device(device)
  self.context = self.device.context
+ self.flags = flags
+ self.fallback_to_copy = fallback_to_copy
  self.resource = runtime.core.cuda_graphics_register_gl_buffer(self.context, gl_buffer_id, flags)
+ if self.resource is None:
+ if self.fallback_to_copy:
+ self.warp_buffer = None
+ self.warp_buffer_cpu = None
+ if not RegisteredGLBuffer.__fallback_warning_shown:
+ warp.utils.warn(
+ "Could not register GL buffer since CUDA/OpenGL interoperability is not available. Falling back to copy operations between the Warp array and the OpenGL buffer.",
+ )
+ RegisteredGLBuffer.__fallback_warning_shown = True
+ else:
+ raise RuntimeError(f"Failed to register OpenGL buffer {gl_buffer_id} with CUDA")

  def __del__(self):
  if not self.resource:
@@ -3687,18 +3869,48 @@ class RegisteredGLBuffer:
  Returns:
  A Warp array object representing the mapped OpenGL buffer.
  """
- runtime.core.cuda_graphics_map(self.context, self.resource)
- ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_size_t)
- ptr = ctypes.c_uint64(0)
- size = ctypes.c_size_t(0)
- runtime.core.cuda_graphics_device_ptr_and_size(
- self.context, self.resource, ctypes.byref(ptr), ctypes.byref(size)
- )
- return warp.array(ptr=ptr.value, dtype=dtype, shape=shape, device=self.device)
+ if self.resource is not None:
+ runtime.core.cuda_graphics_map(self.context, self.resource)
+ ptr = ctypes.c_uint64(0)
+ size = ctypes.c_size_t(0)
+ runtime.core.cuda_graphics_device_ptr_and_size(
+ self.context, self.resource, ctypes.byref(ptr), ctypes.byref(size)
+ )
+ return warp.array(ptr=ptr.value, dtype=dtype, shape=shape, device=self.device)
+ elif self.fallback_to_copy:
+ if self.warp_buffer is None or self.warp_buffer.dtype != dtype or self.warp_buffer.shape != shape:
+ self.warp_buffer = warp.empty(shape, dtype, device=self.device)
+ self.warp_buffer_cpu = warp.empty(shape, dtype, device="cpu", pinned=True)
+
+ if self.flags == self.READ_ONLY or self.flags == self.NONE:
+ # copy from OpenGL buffer to Warp array
+ from pyglet import gl
+
+ gl.glBindBuffer(gl.GL_ARRAY_BUFFER, self.gl_buffer_id)
+ nbytes = self.warp_buffer.size * warp.types.type_size_in_bytes(dtype)
+ gl.glGetBufferSubData(gl.GL_ARRAY_BUFFER, 0, nbytes, self.warp_buffer_cpu.ptr)
+ gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0)
+ warp.copy(self.warp_buffer, self.warp_buffer_cpu)
+ return self.warp_buffer
+
+ return None

  def unmap(self):
  """Unmap the OpenGL buffer."""
- runtime.core.cuda_graphics_unmap(self.context, self.resource)
+ if self.resource is not None:
+ runtime.core.cuda_graphics_unmap(self.context, self.resource)
+ elif self.fallback_to_copy:
+ if self.warp_buffer is None:
+ raise RuntimeError("RegisteredGLBuffer first has to be mapped")
+
+ if self.flags == self.WRITE_DISCARD or self.flags == self.NONE:
+ # copy from Warp array to OpenGL buffer
+ from pyglet import gl
+
+ gl.glBindBuffer(gl.GL_ARRAY_BUFFER, self.gl_buffer_id)
+ buffer = self.warp_buffer.numpy()
+ gl.glBufferData(gl.GL_ARRAY_BUFFER, buffer.nbytes, buffer.ctypes.data, gl.GL_DYNAMIC_DRAW)
+ gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0)


  def zeros(
@@ -4253,7 +4465,7 @@ def launch(
  If negative or zero, the maximum hardware value will be used.
  """

- assert_initialized()
+ init()

  # if stream is specified, use the associated device
  if stream is not None:
@@ -4496,7 +4708,7 @@ def force_load(device: Union[Device, str, List[Device], List[str]] = None, modul


  def load_module(
- module: Union[Module, ModuleType, str] = None, device: Union[Device, str] = None, recursive: bool = False
+ module: Union[Module, types.ModuleType, str] = None, device: Union[Device, str] = None, recursive: bool = False
  ):
  """Force user-defined module to be compiled and loaded

@@ -4514,7 +4726,7 @@ def load_module(
  module_name = module.__name__
  elif isinstance(module, Module):
  module_name = module.name
- elif isinstance(module, ModuleType):
+ elif isinstance(module, types.ModuleType):
  module_name = module.__name__
  elif isinstance(module, str):
  module_name = module
@@ -4863,13 +5075,20 @@ def copy(

  # copy gradient, if needed
  if hasattr(src, "grad") and src.grad is not None and hasattr(dest, "grad") and dest.grad is not None:
- copy(dest.grad, src.grad, stream=stream)
+ copy(dest.grad, src.grad, dest_offset=dest_offset, src_offset=src_offset, count=count, stream=stream)

  if runtime.tape:
- runtime.tape.record_func(backward=lambda: adj_copy(dest.grad, src.grad, stream=stream), arrays=[dest, src])
+ runtime.tape.record_func(
+ backward=lambda: adj_copy(
+ dest.grad, src.grad, dest_offset=dest_offset, src_offset=src_offset, count=count, stream=stream
+ ),
+ arrays=[dest, src],
+ )


- def adj_copy(adj_dest: warp.array, adj_src: warp.array, stream: Stream = None):
+ def adj_copy(
+ adj_dest: warp.array, adj_src: warp.array, dest_offset: int, src_offset: int, count: int, stream: Stream = None
+ ):
  """Copy adjoint operation for wp.copy() calls on the tape.

  Args:
@@ -4877,7 +5096,7 @@ def adj_copy(adj_dest: warp.array, adj_src: warp.array, stream: Stream = None):
  adj_src: Source array adjoint
  stream: The stream on which the copy was performed in the forward pass
  """
- copy(adj_src, adj_dest, stream=stream)
+ copy(adj_src, adj_dest, dest_offset=dest_offset, src_offset=src_offset, count=count, stream=stream)


  def type_str(t):
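
The last hunks above change how ``wp.copy()`` propagates gradients: the destination/source offsets and element count are now forwarded both to the gradient copy and to the adjoint recorded on the tape, and public entry points call ``init()`` lazily instead of asserting prior initialization. A minimal sketch of what this enables, assuming arbitrary array sizes and contents (illustrative only, not part of the package)::

    import warp as wp

    wp.init()  # optional here; in 1.2.x most entry points call init() lazily

    src = wp.array([1.0, 2.0, 3.0, 4.0], dtype=wp.float32, requires_grad=True)
    dst = wp.zeros(8, dtype=wp.float32, requires_grad=True)

    with wp.Tape() as tape:
        # copy two elements starting at src[1] into dst[4:6]; the recorded
        # adjoint copy now reuses the same dest_offset, src_offset, and count
        wp.copy(dst, src, dest_offset=4, src_offset=1, count=2)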