warp-lang 1.4.2 py3-none-win_amd64.whl → 1.5.0 py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (158)
  1. warp/__init__.py +4 -0
  2. warp/autograd.py +43 -8
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +21 -2
  6. warp/build_dll.py +23 -6
  7. warp/builtins.py +1783 -2
  8. warp/codegen.py +177 -45
  9. warp/config.py +2 -2
  10. warp/context.py +321 -73
  11. warp/examples/assets/pixel.jpg +0 -0
  12. warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
  13. warp/examples/benchmarks/benchmark_gemm.py +121 -0
  14. warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
  15. warp/examples/benchmarks/benchmark_tile.py +179 -0
  16. warp/examples/fem/example_adaptive_grid.py +37 -10
  17. warp/examples/fem/example_apic_fluid.py +3 -2
  18. warp/examples/fem/example_convection_diffusion_dg.py +4 -5
  19. warp/examples/fem/example_deformed_geometry.py +1 -1
  20. warp/examples/fem/example_diffusion_3d.py +47 -4
  21. warp/examples/fem/example_distortion_energy.py +220 -0
  22. warp/examples/fem/example_magnetostatics.py +127 -85
  23. warp/examples/fem/example_nonconforming_contact.py +5 -5
  24. warp/examples/fem/example_stokes.py +3 -1
  25. warp/examples/fem/example_streamlines.py +12 -19
  26. warp/examples/fem/utils.py +38 -15
  27. warp/examples/sim/example_cloth.py +2 -25
  28. warp/examples/sim/example_quadruped.py +2 -1
  29. warp/examples/tile/example_tile_convolution.py +58 -0
  30. warp/examples/tile/example_tile_fft.py +47 -0
  31. warp/examples/tile/example_tile_filtering.py +105 -0
  32. warp/examples/tile/example_tile_matmul.py +79 -0
  33. warp/examples/tile/example_tile_mlp.py +375 -0
  34. warp/fem/__init__.py +8 -0
  35. warp/fem/cache.py +16 -12
  36. warp/fem/dirichlet.py +1 -1
  37. warp/fem/domain.py +44 -1
  38. warp/fem/field/__init__.py +1 -2
  39. warp/fem/field/field.py +31 -19
  40. warp/fem/field/nodal_field.py +101 -49
  41. warp/fem/field/virtual.py +794 -0
  42. warp/fem/geometry/__init__.py +2 -2
  43. warp/fem/geometry/deformed_geometry.py +3 -105
  44. warp/fem/geometry/element.py +13 -0
  45. warp/fem/geometry/geometry.py +165 -5
  46. warp/fem/geometry/grid_2d.py +3 -6
  47. warp/fem/geometry/grid_3d.py +31 -28
  48. warp/fem/geometry/hexmesh.py +3 -46
  49. warp/fem/geometry/nanogrid.py +3 -2
  50. warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
  51. warp/fem/geometry/tetmesh.py +2 -43
  52. warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
  53. warp/fem/integrate.py +683 -261
  54. warp/fem/linalg.py +404 -0
  55. warp/fem/operator.py +101 -18
  56. warp/fem/polynomial.py +5 -5
  57. warp/fem/quadrature/quadrature.py +45 -21
  58. warp/fem/space/__init__.py +45 -11
  59. warp/fem/space/basis_function_space.py +451 -0
  60. warp/fem/space/basis_space.py +58 -11
  61. warp/fem/space/function_space.py +146 -5
  62. warp/fem/space/grid_2d_function_space.py +80 -66
  63. warp/fem/space/grid_3d_function_space.py +113 -68
  64. warp/fem/space/hexmesh_function_space.py +96 -108
  65. warp/fem/space/nanogrid_function_space.py +62 -110
  66. warp/fem/space/quadmesh_function_space.py +208 -0
  67. warp/fem/space/shape/__init__.py +45 -7
  68. warp/fem/space/shape/cube_shape_function.py +328 -54
  69. warp/fem/space/shape/shape_function.py +10 -1
  70. warp/fem/space/shape/square_shape_function.py +328 -60
  71. warp/fem/space/shape/tet_shape_function.py +269 -19
  72. warp/fem/space/shape/triangle_shape_function.py +238 -19
  73. warp/fem/space/tetmesh_function_space.py +69 -37
  74. warp/fem/space/topology.py +38 -0
  75. warp/fem/space/trimesh_function_space.py +179 -0
  76. warp/fem/utils.py +6 -331
  77. warp/jax_experimental.py +3 -1
  78. warp/native/array.h +15 -0
  79. warp/native/builtin.h +66 -26
  80. warp/native/bvh.h +4 -0
  81. warp/native/coloring.cpp +600 -0
  82. warp/native/cuda_util.cpp +14 -0
  83. warp/native/cuda_util.h +2 -1
  84. warp/native/fabric.h +8 -0
  85. warp/native/hashgrid.h +4 -0
  86. warp/native/marching.cu +8 -0
  87. warp/native/mat.h +14 -3
  88. warp/native/mathdx.cpp +59 -0
  89. warp/native/mesh.h +4 -0
  90. warp/native/range.h +13 -1
  91. warp/native/reduce.cpp +9 -1
  92. warp/native/reduce.cu +7 -0
  93. warp/native/runlength_encode.cpp +9 -1
  94. warp/native/runlength_encode.cu +7 -1
  95. warp/native/scan.cpp +8 -0
  96. warp/native/scan.cu +8 -0
  97. warp/native/scan.h +8 -1
  98. warp/native/sparse.cpp +8 -0
  99. warp/native/sparse.cu +8 -0
  100. warp/native/temp_buffer.h +7 -0
  101. warp/native/tile.h +1857 -0
  102. warp/native/tile_gemm.h +341 -0
  103. warp/native/tile_reduce.h +210 -0
  104. warp/native/volume_builder.cu +8 -0
  105. warp/native/volume_builder.h +8 -0
  106. warp/native/warp.cpp +10 -2
  107. warp/native/warp.cu +369 -15
  108. warp/native/warp.h +12 -2
  109. warp/optim/adam.py +39 -4
  110. warp/paddle.py +29 -12
  111. warp/render/render_opengl.py +137 -65
  112. warp/sim/graph_coloring.py +292 -0
  113. warp/sim/integrator_euler.py +4 -2
  114. warp/sim/integrator_featherstone.py +115 -44
  115. warp/sim/integrator_vbd.py +6 -0
  116. warp/sim/model.py +88 -15
  117. warp/stubs.py +569 -4
  118. warp/tape.py +12 -7
  119. warp/tests/assets/pixel.npy +0 -0
  120. warp/tests/aux_test_instancing_gc.py +18 -0
  121. warp/tests/test_array.py +39 -0
  122. warp/tests/test_codegen.py +81 -1
  123. warp/tests/test_codegen_instancing.py +30 -0
  124. warp/tests/test_collision.py +110 -0
  125. warp/tests/test_coloring.py +241 -0
  126. warp/tests/test_context.py +34 -0
  127. warp/tests/test_examples.py +18 -4
  128. warp/tests/test_fem.py +453 -113
  129. warp/tests/test_func.py +13 -0
  130. warp/tests/test_generics.py +52 -0
  131. warp/tests/test_iter.py +68 -0
  132. warp/tests/test_mat_scalar_ops.py +1 -1
  133. warp/tests/test_mesh_query_point.py +1 -1
  134. warp/tests/test_module_hashing.py +23 -0
  135. warp/tests/test_paddle.py +27 -87
  136. warp/tests/test_print.py +56 -1
  137. warp/tests/test_spatial.py +1 -1
  138. warp/tests/test_tile.py +700 -0
  139. warp/tests/test_tile_mathdx.py +144 -0
  140. warp/tests/test_tile_mlp.py +383 -0
  141. warp/tests/test_tile_reduce.py +374 -0
  142. warp/tests/test_tile_shared_memory.py +190 -0
  143. warp/tests/test_vbd.py +12 -20
  144. warp/tests/test_volume.py +43 -0
  145. warp/tests/unittest_suites.py +19 -2
  146. warp/tests/unittest_utils.py +4 -0
  147. warp/types.py +338 -72
  148. warp/utils.py +22 -1
  149. {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/METADATA +33 -7
  150. {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/RECORD +153 -126
  151. {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/WHEEL +1 -1
  152. warp/fem/field/test.py +0 -180
  153. warp/fem/field/trial.py +0 -183
  154. warp/fem/space/collocated_function_space.py +0 -102
  155. warp/fem/space/quadmesh_2d_function_space.py +0 -261
  156. warp/fem/space/trimesh_2d_function_space.py +0 -153
  157. {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/LICENSE.md +0 -0
  158. {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/top_level.txt +0 -0
warp/context.py CHANGED
@@ -12,6 +12,7 @@ import hashlib
  import inspect
  import io
  import itertools
+ import json
  import operator
  import os
  import platform
@@ -21,7 +22,7 @@ import typing
  import weakref
  from copy import copy as shallowcopy
  from pathlib import Path
- from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union

  import numpy as np

@@ -101,6 +102,7 @@ class Function:
  value_func=None,
  export_func=None,
  dispatch_func=None,
+ lto_dispatch_func=None,
  module=None,
  variadic=False,
  initializer_list_func=None,
@@ -137,6 +139,7 @@ class Function:
  self.value_func = value_func # a function that takes a list of args and a list of templates and returns the value type, e.g.: load(array, index) returns the type of value being loaded
  self.export_func = export_func
  self.dispatch_func = dispatch_func
+ self.lto_dispatch_func = lto_dispatch_func
  self.input_types = {}
  self.export = export
  self.doc = doc
@@ -619,10 +622,13 @@ def call_builtin(func: Function, *params) -> Tuple[bool, Any]:


  class KernelHooks:
- def __init__(self, forward, backward):
+ def __init__(self, forward, backward, forward_smem_bytes=0, backward_smem_bytes=0):
  self.forward = forward
  self.backward = backward

+ self.forward_smem_bytes = forward_smem_bytes
+ self.backward_smem_bytes = backward_smem_bytes
+

  # caches source and compiled entry points for a kernel (will be populated after module loads)
  class Kernel:
@@ -970,8 +976,17 @@ def struct(c):
  return s


- # overload a kernel with the given argument types
- def overload(kernel, arg_types=None):
+ def overload(kernel, arg_types=Union[None, Dict[str, Any], List[Any]]):
+ """Overload a generic kernel with the given argument types.
+
+ Can be called directly or used as a function decorator.
+
+ Args:
+ kernel: The generic kernel to be instantiated with concrete types.
+ arg_types: A list of concrete argument types for the kernel or a
+ dictionary specifying generic argument names as keys and concrete
+ types as variables.
+ """
  if isinstance(kernel, Kernel):
  # handle cases where user calls us directly, e.g. wp.overload(kernel, [args...])

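For context, a brief usage sketch of the two call styles this docstring describes; the kernel and concrete types below are illustrative and not taken from the diff:

    from typing import Any

    import warp as wp

    @wp.kernel
    def scale(x: wp.array(dtype=Any), s: Any):
        i = wp.tid()
        x[i] = x[i] * s

    # called directly with a list of concrete argument types...
    wp.overload(scale, [wp.array(dtype=wp.float32), wp.float32])

    # ...or with a dict mapping generic argument names to concrete types
    wp.overload(scale, {"x": wp.array(dtype=wp.float64), "s": wp.float64})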
@@ -1073,6 +1088,7 @@ def add_builtin(
  value_func=None,
  export_func=None,
  dispatch_func=None,
+ lto_dispatch_func=None,
  doc="",
  namespace="wp::",
  variadic=False,
@@ -1113,6 +1129,9 @@ def add_builtin(
  The arguments returned must be of type `codegen.Var`.
  If not provided, all arguments passed by the users when calling
  the built-in are passed as-is as runtime arguments to the C++ function.
+ lto_dispatch_func (Callable): Same as dispatch_func, but takes an 'option' dict
+ as extra argument (indicating tile_size and target architecture) and returns
+ an LTO-IR buffer as extra return value
  doc (str): Used to generate the Python's docstring and the HTML documentation.
  namespace: Namespace for the underlying C++ function.
  variadic (bool): Whether the function declares variadic arguments.
@@ -1252,6 +1271,7 @@ def add_builtin(
  value_func=value_func if return_type is Any else None,
  export_func=export_func,
  dispatch_func=dispatch_func,
+ lto_dispatch_func=lto_dispatch_func,
  doc=doc,
  namespace=namespace,
  variadic=variadic,
@@ -1274,6 +1294,7 @@ def add_builtin(
  value_func=value_func,
  export_func=export_func,
  dispatch_func=dispatch_func,
+ lto_dispatch_func=lto_dispatch_func,
  variadic=variadic,
  initializer_list_func=initializer_list_func,
  export=export,
@@ -1540,6 +1561,8 @@ class ModuleBuilder:
  self.options = options
  self.module = module
  self.deferred_functions = []
+ self.ltoirs = {} # map from lto symbol to lto binary
+ self.ltoirs_decl = {} # map from lto symbol to lto forward declaration

  if hasher is None:
  hasher = ModuleHasher(module)
@@ -1607,9 +1630,26 @@ class ModuleBuilder:
  # use dict to preserve import order
  self.functions[func] = None

+ def build_meta(self):
+ meta = {}
+
+ for kernel in self.kernels:
+ name = kernel.get_mangled_name()
+
+ meta[name + "_cuda_kernel_forward_smem_bytes"] = kernel.adj.get_total_required_shared()
+ meta[name + "_cuda_kernel_backward_smem_bytes"] = kernel.adj.get_total_required_shared() * 2
+
+ return meta
+
  def codegen(self, device):
  source = ""

+ # code-gen LTO forward declarations
+ source += 'extern "C" {\n'
+ for fwd in self.ltoirs_decl.values():
+ source += fwd + "\n"
+ source += "}\n"
+
  # code-gen structs
  visited_structs = set()
  for struct in self.structs.keys():
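To illustrate, build_meta() produces a flat dictionary keyed by mangled kernel entry-point names, which Module.load() below serializes with json.dump and ModuleExec.get_kernel_hooks() reads back; a hypothetical example (the kernel name is invented, and the backward figure is twice the forward one, per the code above):

    {
        "wp_example_kernel_0a1b2c_cuda_kernel_forward_smem_bytes": 4096,
        "wp_example_kernel_0a1b2c_cuda_kernel_backward_smem_bytes": 8192
    }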
@@ -1639,9 +1679,9 @@ class ModuleBuilder:

  # add headers
  if device == "cpu":
- source = warp.codegen.cpu_module_header + source
+ source = warp.codegen.cpu_module_header.format(tile_size=self.options["block_dim"]) + source
  else:
- source = warp.codegen.cuda_module_header + source
+ source = warp.codegen.cuda_module_header.format(tile_size=self.options["block_dim"]) + source

  return source

@@ -1660,11 +1700,12 @@ class ModuleExec:
  instance.handle = None
  return instance

- def __init__(self, handle, module_hash, device):
+ def __init__(self, handle, module_hash, device, meta):
  self.handle = handle
  self.module_hash = module_hash
  self.device = device
  self.kernel_hooks = {}
+ self.meta = meta

  # release the loaded module
  def __del__(self):
@@ -1678,19 +1719,50 @@ class ModuleExec:

  # lookup and cache kernel entry points
  def get_kernel_hooks(self, kernel):
- hooks = self.kernel_hooks.get(kernel)
+ # Use kernel.adj as a unique key for cache lookups instead of the kernel itself.
+ # This avoids holding a reference to the kernel and is faster than using
+ # a WeakKeyDictionary with kernels as keys.
+ hooks = self.kernel_hooks.get(kernel.adj)
  if hooks is not None:
  return hooks

  name = kernel.get_mangled_name()

  if self.device.is_cuda:
- forward = runtime.core.cuda_get_kernel(
- self.device.context, self.handle, (name + "_cuda_kernel_forward").encode("utf-8")
+ forward_name = name + "_cuda_kernel_forward"
+ forward_kernel = runtime.core.cuda_get_kernel(
+ self.device.context, self.handle, forward_name.encode("utf-8")
  )
- backward = runtime.core.cuda_get_kernel(
- self.device.context, self.handle, (name + "_cuda_kernel_backward").encode("utf-8")
+
+ backward_name = name + "_cuda_kernel_backward"
+ backward_kernel = runtime.core.cuda_get_kernel(
+ self.device.context, self.handle, backward_name.encode("utf-8")
  )
+
+ # look up the required shared memory size for each kernel from module metadata
+ forward_smem_bytes = self.meta[forward_name + "_smem_bytes"]
+ backward_smem_bytes = self.meta[backward_name + "_smem_bytes"]
+
+ # configure kernels maximum shared memory size
+ max_smem_bytes = runtime.core.cuda_get_max_shared_memory(self.device.context)
+
+ if not runtime.core.cuda_configure_kernel_shared_memory(forward_kernel, forward_smem_bytes):
+ print(
+ f"Warning: Failed to configure kernel dynamic shared memory for this device, tried to configure {forward_name} kernel for {forward_smem_bytes} bytes, but maximum available is {max_smem_bytes}"
+ )
+
+ options = dict(kernel.module.options)
+ options.update(kernel.options)
+
+ if options["enable_backward"] and not runtime.core.cuda_configure_kernel_shared_memory(
+ backward_kernel, backward_smem_bytes
+ ):
+ print(
+ f"Warning: Failed to configure kernel dynamic shared memory for this device, tried to configure {backward_name} kernel for {backward_smem_bytes} bytes, but maximum available is {max_smem_bytes}"
+ )
+
+ hooks = KernelHooks(forward_kernel, backward_kernel, forward_smem_bytes, backward_smem_bytes)
+
  else:
  func = ctypes.CFUNCTYPE(None)
  forward = (
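cuda_get_max_shared_memory and cuda_configure_kernel_shared_memory are native entry points in warp.dll; as a rough sketch of the kind of driver-API call the configure helper presumably wraps (an assumption for illustration, not Warp's actual implementation):

    import ctypes

    # CUDA driver attribute for opting a kernel into large dynamic shared memory
    CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8

    cuda = ctypes.CDLL("nvcuda.dll")  # libcuda.so.1 on Linux

    def configure_kernel_shared_memory(kernel_handle, smem_bytes):
        # returns True when the requested dynamic shared memory size is accepted
        status = cuda.cuFuncSetAttribute(
            ctypes.c_void_p(kernel_handle),
            CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
            smem_bytes,
        )
        return status == 0  # CUDA_SUCCESS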
@@ -1700,9 +1772,9 @@ class ModuleExec:

  func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_backward").encode("utf-8"))) or None
  )
- hooks = KernelHooks(forward, backward)
- self.kernel_hooks[kernel] = hooks
+ hooks = KernelHooks(forward, backward)

+ self.kernel_hooks[kernel.adj] = hooks
  return hooks


@@ -1712,7 +1784,8 @@ class ModuleExec:
  # build cache
  class Module:
  def __init__(self, name, loader):
- self.name = name
+ self.name = name if name is not None else "None"
+
  self.loader = loader

  # lookup the latest versions of kernels, functions, and structs by key
@@ -1720,12 +1793,14 @@ class Module:
  self.functions = {} # (key: function)
  self.structs = {} # (key: struct)

- # Set of all "live" kernels in this module.
+ # Set of all "live" kernels in this module, i.e., kernels that still have references.
+ # We keep a weak reference to every kernel ever created in this module and rely on Python GC
+ # to release kernels that no longer have any references (in user code or internal bookkeeping).
  # The difference between `live_kernels` and `kernels` is that `live_kernels` may contain
  # multiple kernels with the same key (which is essential to support closures), while `kernels`
  # only holds the latest kernel for each key. When the module is built, we compute the hash
  # of each kernel in `live_kernels` and filter out duplicates for codegen.
- self.live_kernels = weakref.WeakSet()
+ self._live_kernels = weakref.WeakSet()

  # executable modules currently loaded
  self.execs = {} # (device.context: ModuleExec)
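The private WeakSet relies on standard weakref semantics; a small standalone illustration in plain Python (unrelated to Warp's own Kernel type):

    import weakref

    class FakeKernel:
        pass

    live = weakref.WeakSet()

    k = FakeKernel()
    live.add(k)
    print(len(live))  # 1: the set holds only a weak reference

    del k             # drop the last strong reference
    print(len(live))  # 0: the entry vanishes once the object is collected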
@@ -1749,6 +1824,7 @@ class Module:
  "fast_math": False,
  "cuda_output": None, # supported values: "ptx", "cubin", or None (automatic)
  "mode": warp.config.mode,
+ "block_dim": 256,
  }

  # Module dependencies are determined by scanning each function
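Since "block_dim" is now an ordinary module option, it can presumably be overridden per module like any other entry in this dict before the module is built; a short sketch (the value 128 is arbitrary):

    import warp as wp

    # applies to kernels defined in the calling module
    wp.set_module_options({"block_dim": 128})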
@@ -1773,7 +1849,7 @@ class Module:
  self.kernels[kernel.key] = kernel

  # track all kernel objects, even if they are duplicates
- self.live_kernels.add(kernel)
+ self._live_kernels.add(kernel)

  self.find_references(kernel.adj)

@@ -1839,6 +1915,19 @@ class Module:
  # for a reload of module on next launch
  self.mark_modified()

+ @property
+ def live_kernels(self):
+ # Return a list of kernels that still have references.
+ # We return a regular list instead of the WeakSet to avoid undesirable issues
+ # if kernels are garbage collected before the caller is done using this list.
+ # Note that we should avoid retaining strong references to kernels unnecessarily
+ # so that Python GC can release kernels that no longer have user references.
+ # It is tempting to call gc.collect() here to force garbage collection,
+ # but this can have undesirable consequences (e.g., GC during graph capture),
+ # so we should avoid it as a general rule. Instead, we rely on Python's
+ # reference counting GC to collect kernels that have gone out of scope.
+ return list(self._live_kernels)
+
  # find kernel corresponding to a Python function
  def find_kernel(self, func):
  qualname = warp.codegen.make_full_qualified_name(func)
@@ -1879,9 +1968,17 @@ class Module:
  self.hasher = ModuleHasher(self)
  return self.hasher.get_module_hash()

- def load(self, device) -> ModuleExec:
+ def load(self, device, block_dim=None) -> ModuleExec:
  device = runtime.get_device(device)

+ # re-compile module if tile size (blockdim) changes
+ # todo: it would be better to have a method such as `module.get_kernel(block_dim=N)`
+ # that can return a single kernel instance with a given block size
+ if block_dim is not None:
+ if self.options["block_dim"] != block_dim:
+ self.unload()
+ self.options["block_dim"] = block_dim
+
  # compute the hash if needed
  if self.hasher is None:
  self.hasher = ModuleHasher(self)
@@ -1909,6 +2006,7 @@ class Module:
  # determine output paths
  if device.is_cpu:
  output_name = "module_codegen.o"
+ output_arch = None

  elif device.is_cuda:
  # determine whether to use PTX or CUBIN
@@ -1947,7 +2045,12 @@ class Module:
  or not warp.config.cache_kernels
  or warp.config.verify_autograd_array_access
  ):
- builder = ModuleBuilder(self, self.options, hasher=self.hasher)
+ builder_options = {
+ **self.options,
+ # Some of the Tile codegen, such as cuFFTDx and cuBLASDx, requires knowledge of the target arch
+ "output_arch": output_arch,
+ }
+ builder = ModuleBuilder(self, builder_options, hasher=self.hasher)

  # create a temporary (process unique) dir for build outputs before moving to the binary dir
  build_dir = os.path.join(
@@ -2010,6 +2113,7 @@ class Module:
  config=self.options["mode"],
  fast_math=self.options["fast_math"],
  verify_fp=warp.config.verify_fp,
+ ltoirs=builder.ltoirs.values(),
  )

  except Exception as e:
@@ -2017,6 +2121,15 @@ class Module:
  module_load_timer.extra_msg = " (error)"
  raise (e)

+ # ------------------------------------------------------------
+ # build meta data
+
+ meta = builder.build_meta()
+ meta_path = os.path.join(build_dir, "module_codegen.meta")
+
+ with open(meta_path, "w") as meta_file:
+ json.dump(meta, meta_file)
+
  # -----------------------------------------------------------
  # update cache

@@ -2053,18 +2166,23 @@ class Module:

  # -----------------------------------------------------------
  # Load CPU or CUDA binary
+
+ meta_path = os.path.join(module_dir, "module_codegen.meta")
+ with open(meta_path, "r") as meta_file:
+ meta = json.load(meta_file)
+
  if device.is_cpu:
  # LLVM modules are identified using strings, so we need to ensure uniqueness
  module_handle = f"{module_name}_{self.cpu_exec_id}"
  self.cpu_exec_id += 1
  runtime.llvm.load_obj(binary_path.encode("utf-8"), module_handle.encode("utf-8"))
- module_exec = ModuleExec(module_handle, module_hash, device)
+ module_exec = ModuleExec(module_handle, module_hash, device, meta)
  self.execs[None] = module_exec

  elif device.is_cuda:
  cuda_module = warp.build.load_cuda(binary_path, device)
  if cuda_module is not None:
- module_exec = ModuleExec(cuda_module, module_hash, device)
+ module_exec = ModuleExec(cuda_module, module_hash, device, meta)
  self.execs[device.context] = module_exec
  else:
  module_load_timer.extra_msg = " (error)"
@@ -2719,21 +2837,16 @@ class Graph:

  class Runtime:
  def __init__(self):
- if sys.version_info < (3, 7):
- raise RuntimeError("Warp requires Python 3.7 as a minimum")
+ if sys.version_info < (3, 8):
+ raise RuntimeError("Warp requires Python 3.8 as a minimum")
  if sys.version_info < (3, 9):
  warp.utils.warn(f"Python 3.9 or newer is recommended for running Warp, detected {sys.version_info}")

  bin_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "bin")

  if os.name == "nt":
- if sys.version_info >= (3, 8):
- # Python >= 3.8 this method to add dll search paths
- os.add_dll_directory(bin_path)
-
- else:
- # Python < 3.8 we add dll directory to path
- os.environ["PATH"] = bin_path + os.pathsep + os.environ["PATH"]
+ # Python >= 3.8 this method to add dll search paths
+ os.add_dll_directory(bin_path)

  warp_lib = os.path.join(bin_path, "warp.dll")
  llvm_lib = os.path.join(bin_path, "warp-clang.dll")
@@ -3205,6 +3318,8 @@ class Runtime:
  self.core.is_cuda_compatibility_enabled.restype = ctypes.c_int
  self.core.is_cutlass_enabled.argtypes = None
  self.core.is_cutlass_enabled.restype = ctypes.c_int
+ self.core.is_mathdx_enabled.argtypes = None
+ self.core.is_mathdx_enabled.restype = ctypes.c_int

  self.core.cuda_driver_version.argtypes = None
  self.core.cuda_driver_version.restype = ctypes.c_int
@@ -3329,17 +3444,58 @@ class Runtime:
  self.core.cuda_graph_destroy.restype = ctypes.c_bool

  self.core.cuda_compile_program.argtypes = [
- ctypes.c_char_p,
- ctypes.c_int,
- ctypes.c_char_p,
- ctypes.c_bool,
- ctypes.c_bool,
- ctypes.c_bool,
- ctypes.c_bool,
- ctypes.c_char_p,
+ ctypes.c_char_p, # cuda_src
+ ctypes.c_int, # arch
+ ctypes.c_char_p, # include_dir
+ ctypes.c_int, # num_cuda_include_dirs
+ ctypes.POINTER(ctypes.c_char_p), # cuda include dirs
+ ctypes.c_bool, # debug
+ ctypes.c_bool, # verbose
+ ctypes.c_bool, # verify_fp
+ ctypes.c_bool, # fast_math
+ ctypes.c_char_p, # output_path
+ ctypes.c_size_t, # num_ltoirs
+ ctypes.POINTER(ctypes.c_char_p), # ltoirs
+ ctypes.POINTER(ctypes.c_size_t), # ltoir_sizes
  ]
  self.core.cuda_compile_program.restype = ctypes.c_size_t

+ self.core.cuda_compile_fft.argtypes = [
+ ctypes.c_char_p, # lto
+ ctypes.c_char_p, # function name
+ ctypes.c_int, # num include dirs
+ ctypes.POINTER(ctypes.c_char_p), # include dirs
+ ctypes.c_char_p, # mathdx include dir
+ ctypes.c_int, # arch
+ ctypes.c_int, # size
+ ctypes.c_int, # ept
+ ctypes.c_int, # direction
+ ctypes.c_int, # precision
+ ctypes.POINTER(ctypes.c_int), # smem (out)
+ ]
+ self.core.cuda_compile_fft.restype = ctypes.c_bool
+
+ self.core.cuda_compile_dot.argtypes = [
+ ctypes.c_char_p, # lto
+ ctypes.c_char_p, # function name
+ ctypes.c_int, # num include dirs
+ ctypes.POINTER(ctypes.c_char_p), # include dirs
+ ctypes.c_char_p, # mathdx include dir
+ ctypes.c_int, # arch
+ ctypes.c_int, # M
+ ctypes.c_int, # N
+ ctypes.c_int, # K
+ ctypes.c_int, # a_precision
+ ctypes.c_int, # b_precision
+ ctypes.c_int, # c_precision
+ ctypes.c_int, # type
+ ctypes.c_int, # a_arrangement
+ ctypes.c_int, # b_arrangement
+ ctypes.c_int, # c_arrangement
+ ctypes.c_int, # num threads
+ ]
+ self.core.cuda_compile_dot.restype = ctypes.c_bool
+
  self.core.cuda_load_module.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
  self.core.cuda_load_module.restype = ctypes.c_void_p

@@ -3349,11 +3505,19 @@ class Runtime:
  self.core.cuda_get_kernel.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_char_p]
  self.core.cuda_get_kernel.restype = ctypes.c_void_p

+ self.core.cuda_get_max_shared_memory.argtypes = [ctypes.c_void_p]
+ self.core.cuda_get_max_shared_memory.restype = ctypes.c_int
+
+ self.core.cuda_configure_kernel_shared_memory.argtypes = [ctypes.c_void_p, ctypes.c_int]
+ self.core.cuda_configure_kernel_shared_memory.restype = ctypes.c_bool
+
  self.core.cuda_launch_kernel.argtypes = [
  ctypes.c_void_p,
  ctypes.c_void_p,
  ctypes.c_size_t,
  ctypes.c_int,
+ ctypes.c_int,
+ ctypes.c_int,
  ctypes.POINTER(ctypes.c_void_p),
  ctypes.c_void_p,
  ]
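For readers less familiar with ctypes, the POINTER(ctypes.c_char_p)/c_size_t pairs declared above (for example num_ltoirs, ltoirs, and ltoir_sizes) are typically populated like this; a generic illustration with made-up buffers, not Warp's actual call site:

    import ctypes

    ltoir_blobs = [b"lto-ir-blob-0", b"lto-ir-blob-1"]  # hypothetical LTO-IR buffers

    num_ltoirs = ctypes.c_size_t(len(ltoir_blobs))
    ltoirs = (ctypes.c_char_p * len(ltoir_blobs))(*ltoir_blobs)
    ltoir_sizes = (ctypes.c_size_t * len(ltoir_blobs))(*(len(b) for b in ltoir_blobs))

    # the three values are then passed as the trailing arguments of cuda_compile_program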
@@ -3382,6 +3546,23 @@ class Runtime:
  self.core.cuda_timing_end.argtypes = []
  self.core.cuda_timing_end.restype = None

+ self.core.graph_coloring.argtypes = [
+ ctypes.c_int,
+ warp.types.array_t,
+ ctypes.c_int,
+ warp.types.array_t,
+ ]
+ self.core.graph_coloring.restype = ctypes.c_int
+
+ self.core.balance_coloring.argtypes = [
+ ctypes.c_int,
+ warp.types.array_t,
+ ctypes.c_int,
+ ctypes.c_float,
+ warp.types.array_t,
+ ]
+ self.core.balance_coloring.restype = ctypes.c_float
+
  self.core.init.restype = ctypes.c_int

  except AttributeError as e:
@@ -3607,10 +3788,7 @@ class Runtime:

  def load_dll(self, dll_path):
  try:
- if sys.version_info >= (3, 8):
- dll = ctypes.CDLL(dll_path, winmode=0)
- else:
- dll = ctypes.CDLL(dll_path)
+ dll = ctypes.CDLL(dll_path, winmode=0)
  except OSError as e:
  if "GLIBCXX" in str(e):
  raise RuntimeError(
@@ -3751,7 +3929,7 @@ def is_cuda_available() -> bool:
  return get_cuda_device_count() > 0


- def is_device_available(device):
+ def is_device_available(device: Device) -> bool:
  return device in get_devices()


@@ -3811,7 +3989,7 @@ def get_cuda_devices() -> List[Device]:


  def get_preferred_device() -> Device:
- """Returns the preferred compute device, CUDA if available and CPU otherwise."""
+ """Returns the preferred compute device, ``cuda:0`` if available and ``cpu`` otherwise."""

  init()

@@ -3951,7 +4129,7 @@ def set_mempool_release_threshold(device: Devicelike, threshold: Union[int, floa


  def get_mempool_release_threshold(device: Devicelike) -> int:
- """Get the CUDA memory pool release threshold on the device."""
+ """Get the CUDA memory pool release threshold on the device in bytes."""

  init()

@@ -3970,7 +4148,7 @@ def is_peer_access_supported(target_device: Devicelike, peer_device: Devicelike)
  """Check if `peer_device` can directly access the memory of `target_device` on this system.

  This applies to memory allocated using default CUDA allocators. For memory allocated using
- CUDA pooled allocators, use `is_mempool_access_supported()`.
+ CUDA pooled allocators, use :func:`is_mempool_access_supported()`.

  Returns:
  A Boolean value indicating if this peer access is supported by the system.
@@ -3991,7 +4169,7 @@ def is_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike) -
  """Check if `peer_device` can currently access the memory of `target_device`.

  This applies to memory allocated using default CUDA allocators. For memory allocated using
- CUDA pooled allocators, use `is_mempool_access_enabled()`.
+ CUDA pooled allocators, use :func:`is_mempool_access_enabled()`.

  Returns:
  A Boolean value indicating if this peer access is currently enabled.
@@ -4015,7 +4193,7 @@ def set_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike,
  a negative impact on memory consumption and allocation performance.

  This applies to memory allocated using default CUDA allocators. For memory allocated using
- CUDA pooled allocators, use `set_mempool_access_enabled()`.
+ CUDA pooled allocators, use :func:`set_mempool_access_enabled()`.
  """

  init()
@@ -4043,7 +4221,8 @@ def set_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike,
  def is_mempool_access_supported(target_device: Devicelike, peer_device: Devicelike) -> bool:
  """Check if `peer_device` can directly access the memory pool of `target_device`.

- If mempool access is possible, it can be managed using `set_mempool_access_enabled()` and `is_mempool_access_enabled()`.
+ If mempool access is possible, it can be managed using :func:`set_mempool_access_enabled()`
+ and :func:`is_mempool_access_enabled()`.

  Returns:
  A Boolean value indicating if this memory pool access is supported by the system.
@@ -4061,7 +4240,7 @@ def is_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelike
  """Check if `peer_device` can currently access the memory pool of `target_device`.

  This applies to memory allocated using CUDA pooled allocators. For memory allocated using
- default CUDA allocators, use `is_peer_access_enabled()`.
+ default CUDA allocators, use :func:`is_peer_access_enabled()`.

  Returns:
  A Boolean value indicating if this peer access is currently enabled.
@@ -4082,7 +4261,7 @@ def set_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelik
  """Enable or disable access from `peer_device` to the memory pool of `target_device`.

  This applies to memory allocated using CUDA pooled allocators. For memory allocated using
- default CUDA allocators, use `set_peer_access_enabled()`.
+ default CUDA allocators, use :func:`set_peer_access_enabled()`.
  """

  init()
@@ -4791,7 +4970,9 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
  # represents all data required for a kernel launch
  # so that launches can be replayed quickly, use `wp.launch(..., record_cmd=True)`
  class Launch:
- def __init__(self, kernel, device, hooks=None, params=None, params_addr=None, bounds=None, max_blocks=0):
+ def __init__(
+ self, kernel, device, hooks=None, params=None, params_addr=None, bounds=None, max_blocks=0, block_dim=256
+ ):
  # retain the module executable so it doesn't get unloaded
  self.module_exec = kernel.module.load(device)
  if not self.module_exec:
@@ -4830,6 +5011,7 @@ class Launch:
  self.device = device
  self.bounds = bounds
  self.max_blocks = max_blocks
+ self.block_dim = block_dim

  def set_dim(self, dim):
  self.bounds = warp.types.launch_bounds_t(dim)
@@ -4911,6 +5093,8 @@ class Launch:
  self.hooks.forward,
  self.bounds.size,
  self.max_blocks,
+ self.block_dim,
+ self.hooks.forward_smem_bytes,
  self.params_addr,
  stream.cuda_stream,
  )
@@ -4929,6 +5113,7 @@ def launch(
  record_tape=True,
  record_cmd=False,
  max_blocks=0,
+ block_dim=256,
  ):
  """Launch a Warp kernel on the target device

@@ -4948,6 +5133,7 @@ def launch(
  record_cmd: When True the launch will be returned as a ``Launch`` command object, the launch will not occur until the user calls ``cmd.launch()``
  max_blocks: The maximum number of CUDA thread blocks to use. Only has an effect for CUDA kernel launches.
  If negative or zero, the maximum hardware value will be used.
+ block_dim: The number of threads per block.
  """

  init()
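A usage sketch of the new keyword (kernel and sizes are invented for illustration); note that, per Module.load() above, switching block_dim between launches of kernels from the same module forces the module to be unloaded and rebuilt for the new block size:

    import warp as wp

    @wp.kernel
    def saxpy(x: wp.array(dtype=float), y: wp.array(dtype=float), a: float):
        i = wp.tid()
        y[i] = a * x[i] + y[i]

    n = 1 << 20
    x = wp.ones(n, dtype=float)
    y = wp.zeros(n, dtype=float)

    # 128 threads per CUDA block instead of the default 256
    wp.launch(saxpy, dim=n, inputs=[x, y, 2.0], block_dim=128)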
@@ -5001,7 +5187,12 @@ def launch(
  kernel = kernel.add_overload(fwd_types)

  # delay load modules, including new overload if needed
- module_exec = kernel.module.load(device)
+ try:
+ module_exec = kernel.module.load(device, block_dim)
+ except Exception:
+ kernel.adj.skip_build = True
+ raise
+
  if not module_exec:
  return

@@ -5057,7 +5248,14 @@ def launch(
  )

  runtime.core.cuda_launch_kernel(
- device.context, hooks.backward, bounds.size, max_blocks, kernel_params, stream.cuda_stream
+ device.context,
+ hooks.backward,
+ bounds.size,
+ max_blocks,
+ block_dim,
+ hooks.backward_smem_bytes,
+ kernel_params,
+ stream.cuda_stream,
  )

  else:
@@ -5080,7 +5278,14 @@ def launch(
  else:
  # launch
  runtime.core.cuda_launch_kernel(
- device.context, hooks.forward, bounds.size, max_blocks, kernel_params, stream.cuda_stream
+ device.context,
+ hooks.forward,
+ bounds.size,
+ max_blocks,
+ block_dim,
+ hooks.forward_smem_bytes,
+ kernel_params,
+ stream.cuda_stream,
  )

  try:
@@ -5094,13 +5299,65 @@ def launch(
  # record file, lineno, func as metadata
  frame = inspect.currentframe().f_back
  caller = {"file": frame.f_code.co_filename, "lineno": frame.f_lineno, "func": frame.f_code.co_name}
- runtime.tape.record_launch(kernel, dim, max_blocks, inputs, outputs, device, metadata={"caller": caller})
+ runtime.tape.record_launch(
+ kernel, dim, max_blocks, inputs, outputs, device, block_dim, metadata={"caller": caller}
+ )

  # detect illegal inter-kernel read/write access patterns if verification flag is set
  if warp.config.verify_autograd_array_access:
  runtime.tape._check_kernel_array_access(kernel, fwd_args)


+ def launch_tiled(*args, **kwargs):
+ """A helper method for launching a grid with an extra trailing dimension equal to the block size.
+
+ For example, to launch a 2D grid, where each element has 64 threads assigned you would use the following:
+
+ .. code-block:: python
+
+ wp.launch_tiled(kernel, [M, N], inputs=[...], block_dim=64)
+
+ Which is equivalent to the following:
+
+ .. code-block:: python
+
+ wp.launch(kernel, [M, N, 64], inputs=[...], block_dim=64)
+
+ Inside your kernel code you can retrieve the first two indices of the thread as usual, ignoring the implicit third dimension if desired:
+
+ .. code-block:: python
+
+ @wp.kernel
+ def compute()
+
+ i, j = wp.tid()
+
+ ...
+ """
+
+ # promote dim to a list in case it was passed as a scalar or tuple
+ if "dim" not in kwargs:
+ raise RuntimeError("Launch dimensions 'dim' argument should be passed via. keyword args for wp.launch_tiled()")
+
+ if "block_dim" not in kwargs:
+ raise RuntimeError(
+ "Launch block dimension 'block_dim' argument should be passed via. keyword args for wp.launch_tiled()"
+ )
+
+ dim = kwargs["dim"]
+ if not isinstance(dim, list):
+ dim = list(dim) if isinstance(dim, tuple) else [dim]
+
+ if len(dim) > 3:
+ raise RuntimeError("wp.launch_tiled() requires a grid with fewer than 4 dimensions")
+
+ # add trailing dimension
+ kwargs["dim"] = dim + [kwargs["block_dim"]]
+
+ # forward to original launch method
+ launch(*args, **kwargs)
+
+
  def synchronize():
  """Manually synchronize the calling CPU thread with any outstanding CUDA work on all devices

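To complement the docstring above, a fuller sketch that only relies on wp.tid() and the documented keyword-argument requirements (the kernel, sizes, and per-lane striding scheme are illustrative assumptions; the new tile examples listed above, e.g. warp/examples/tile/example_tile_matmul.py, show the tile primitives themselves):

    import warp as wp

    @wp.kernel
    def scale_rows(x: wp.array2d(dtype=float), s: float):
        # launch_tiled appends a trailing dimension of size block_dim,
        # so tid() yields (row, lane); each lane strides across the columns
        i, lane = wp.tid()
        for j in range(lane, x.shape[1], 64):
            x[i, j] = x[i, j] * s

    x = wp.ones((128, 1024), dtype=float)
    wp.launch_tiled(scale_rows, dim=[128], inputs=[x, 2.0], block_dim=64)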
@@ -5619,16 +5876,6 @@ def type_str(t):
  return "Any"
  elif t == Callable:
  return "Callable"
- elif t == Tuple[int]:
- return "Tuple[int]"
- elif t == Tuple[int, int]:
- return "Tuple[int, int]"
- elif t == Tuple[int, int, int]:
- return "Tuple[int, int, int]"
- elif t == Tuple[int, int, int, int]:
- return "Tuple[int, int, int, int]"
- elif t == Tuple[int, ...]:
- return "Tuple[int, ...]"
  elif isinstance(t, int):
  return str(t)
  elif isinstance(t, List):
@@ -5663,9 +5910,13 @@ def type_str(t):
  return f"Transformation[{type_str(t._wp_scalar_type_)}]"

  raise TypeError("Invalid vector or matrix dimensions")
- elif typing.get_origin(t) in (List, Mapping, Sequence, Union, Tuple):
- args_repr = ", ".join(type_str(x) for x in typing.get_args(t))
- return f"{t.__name__}[{args_repr}]"
+ elif warp.codegen.get_type_origin(t) in (list, tuple):
+ args_repr = ", ".join(type_str(x) for x in warp.codegen.get_type_args(t))
+ return f"{t._name}[{args_repr}]"
+ elif t is Ellipsis:
+ return "..."
+ elif warp.types.is_tile(t):
+ return "Tile"

  return t.__name__

@@ -5826,9 +6077,6 @@ def export_stubs(file): # pragma: no cover
  print('Cols = TypeVar("Cols", bound=int)', file=file)
  print('DType = TypeVar("DType")', file=file)

- print('Int = TypeVar("Int")', file=file)
- print('Float = TypeVar("Float")', file=file)
- print('Scalar = TypeVar("Scalar")', file=file)
  print("Vector = Generic[Length, Scalar]", file=file)
  print("Matrix = Generic[Rows, Cols, Scalar]", file=file)
  print("Quaternion = Generic[Float]", file=file)