warp-lang 1.8.1-py3-none-macosx_10_13_universal2.whl → 1.9.1-py3-none-macosx_10_13_universal2.whl

This diff covers publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between those versions as published.

Files changed (141)
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +1904 -114
  3. warp/bin/libwarp-clang.dylib +0 -0
  4. warp/bin/libwarp.dylib +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +331 -101
  7. warp/builtins.py +1244 -160
  8. warp/codegen.py +317 -206
  9. warp/config.py +1 -1
  10. warp/context.py +1465 -789
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/examples/interop/example_jax_kernel.py +2 -1
  18. warp/fabric.py +1 -1
  19. warp/fem/cache.py +27 -19
  20. warp/fem/domain.py +2 -2
  21. warp/fem/field/nodal_field.py +2 -2
  22. warp/fem/field/virtual.py +264 -166
  23. warp/fem/geometry/geometry.py +5 -5
  24. warp/fem/integrate.py +129 -51
  25. warp/fem/space/restriction.py +4 -0
  26. warp/fem/space/shape/tet_shape_function.py +3 -10
  27. warp/jax_experimental/custom_call.py +25 -2
  28. warp/jax_experimental/ffi.py +22 -1
  29. warp/jax_experimental/xla_ffi.py +16 -7
  30. warp/marching_cubes.py +708 -0
  31. warp/native/array.h +99 -4
  32. warp/native/builtin.h +86 -9
  33. warp/native/bvh.cpp +64 -28
  34. warp/native/bvh.cu +58 -58
  35. warp/native/bvh.h +2 -2
  36. warp/native/clang/clang.cpp +7 -7
  37. warp/native/coloring.cpp +8 -2
  38. warp/native/crt.cpp +2 -2
  39. warp/native/crt.h +3 -5
  40. warp/native/cuda_util.cpp +41 -10
  41. warp/native/cuda_util.h +10 -4
  42. warp/native/exports.h +1842 -1908
  43. warp/native/fabric.h +2 -1
  44. warp/native/hashgrid.cpp +37 -37
  45. warp/native/hashgrid.cu +2 -2
  46. warp/native/initializer_array.h +1 -1
  47. warp/native/intersect.h +2 -2
  48. warp/native/mat.h +1910 -116
  49. warp/native/mathdx.cpp +43 -43
  50. warp/native/mesh.cpp +24 -24
  51. warp/native/mesh.cu +26 -26
  52. warp/native/mesh.h +4 -2
  53. warp/native/nanovdb/GridHandle.h +179 -12
  54. warp/native/nanovdb/HostBuffer.h +8 -7
  55. warp/native/nanovdb/NanoVDB.h +517 -895
  56. warp/native/nanovdb/NodeManager.h +323 -0
  57. warp/native/nanovdb/PNanoVDB.h +2 -2
  58. warp/native/quat.h +331 -14
  59. warp/native/range.h +7 -1
  60. warp/native/reduce.cpp +10 -10
  61. warp/native/reduce.cu +13 -14
  62. warp/native/runlength_encode.cpp +2 -2
  63. warp/native/runlength_encode.cu +5 -5
  64. warp/native/scan.cpp +3 -3
  65. warp/native/scan.cu +4 -4
  66. warp/native/sort.cpp +10 -10
  67. warp/native/sort.cu +40 -31
  68. warp/native/sort.h +2 -0
  69. warp/native/sparse.cpp +8 -8
  70. warp/native/sparse.cu +13 -13
  71. warp/native/spatial.h +366 -17
  72. warp/native/temp_buffer.h +2 -2
  73. warp/native/tile.h +471 -82
  74. warp/native/vec.h +328 -14
  75. warp/native/volume.cpp +54 -54
  76. warp/native/volume.cu +1 -1
  77. warp/native/volume.h +2 -1
  78. warp/native/volume_builder.cu +30 -37
  79. warp/native/warp.cpp +150 -149
  80. warp/native/warp.cu +377 -216
  81. warp/native/warp.h +227 -226
  82. warp/optim/linear.py +736 -271
  83. warp/render/imgui_manager.py +289 -0
  84. warp/render/render_opengl.py +99 -18
  85. warp/render/render_usd.py +1 -0
  86. warp/sim/graph_coloring.py +2 -2
  87. warp/sparse.py +558 -175
  88. warp/tests/aux_test_module_aot.py +7 -0
  89. warp/tests/cuda/test_async.py +3 -3
  90. warp/tests/cuda/test_conditional_captures.py +101 -0
  91. warp/tests/geometry/test_hash_grid.py +38 -0
  92. warp/tests/geometry/test_marching_cubes.py +233 -12
  93. warp/tests/interop/test_jax.py +608 -28
  94. warp/tests/sim/test_coloring.py +6 -6
  95. warp/tests/test_array.py +58 -5
  96. warp/tests/test_codegen.py +4 -3
  97. warp/tests/test_context.py +8 -15
  98. warp/tests/test_enum.py +136 -0
  99. warp/tests/test_examples.py +2 -2
  100. warp/tests/test_fem.py +49 -6
  101. warp/tests/test_fixedarray.py +229 -0
  102. warp/tests/test_func.py +18 -15
  103. warp/tests/test_future_annotations.py +7 -5
  104. warp/tests/test_linear_solvers.py +30 -0
  105. warp/tests/test_map.py +15 -1
  106. warp/tests/test_mat.py +1518 -378
  107. warp/tests/test_mat_assign_copy.py +178 -0
  108. warp/tests/test_mat_constructors.py +574 -0
  109. warp/tests/test_module_aot.py +287 -0
  110. warp/tests/test_print.py +69 -0
  111. warp/tests/test_quat.py +140 -34
  112. warp/tests/test_quat_assign_copy.py +145 -0
  113. warp/tests/test_reload.py +2 -1
  114. warp/tests/test_sparse.py +71 -0
  115. warp/tests/test_spatial.py +140 -34
  116. warp/tests/test_spatial_assign_copy.py +160 -0
  117. warp/tests/test_struct.py +43 -3
  118. warp/tests/test_tuple.py +96 -0
  119. warp/tests/test_types.py +61 -20
  120. warp/tests/test_vec.py +179 -34
  121. warp/tests/test_vec_assign_copy.py +143 -0
  122. warp/tests/tile/test_tile.py +245 -18
  123. warp/tests/tile/test_tile_cholesky.py +605 -0
  124. warp/tests/tile/test_tile_load.py +169 -0
  125. warp/tests/tile/test_tile_mathdx.py +2 -558
  126. warp/tests/tile/test_tile_matmul.py +1 -1
  127. warp/tests/tile/test_tile_mlp.py +1 -1
  128. warp/tests/tile/test_tile_shared_memory.py +5 -5
  129. warp/tests/unittest_suites.py +6 -0
  130. warp/tests/walkthrough_debug.py +1 -1
  131. warp/thirdparty/unittest_parallel.py +108 -9
  132. warp/types.py +571 -267
  133. warp/utils.py +68 -86
  134. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/METADATA +29 -69
  135. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/RECORD +138 -128
  136. warp/native/marching.cpp +0 -19
  137. warp/native/marching.cu +0 -514
  138. warp/native/marching.h +0 -19
  139. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/WHEEL +0 -0
  140. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/licenses/LICENSE.md +0 -0
  141. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/top_level.txt +0 -0
warp/build_dll.py CHANGED
@@ -13,16 +13,19 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

+ from __future__ import annotations
+
  import os
  import platform
  import subprocess
  import sys
- from typing import List, Optional

  from warp.utils import ScopedTimer

  verbose_cmd = True # print command lines before executing them

+ MIN_CTK_VERSION = (12, 0)
+

  def machine_architecture() -> str:
  """Return a canonical machine architecture string.
@@ -120,7 +123,7 @@ def find_host_compiler():
  return run_cmd("which g++").decode()


- def get_cuda_toolkit_version(cuda_home):
+ def get_cuda_toolkit_version(cuda_home) -> tuple[int, int]:
  try:
  # the toolkit version can be obtained by running "nvcc --version"
  nvcc_path = os.path.join(cuda_home, "bin", "nvcc")
@@ -128,14 +131,16 @@ def get_cuda_toolkit_version(cuda_home):
  # search for release substring (e.g., "release 11.5")
  import re

- m = re.search(r"(?<=release )\d+\.\d+", nvcc_version_output)
+ m = re.search(r"release (\d+)\.(\d+)", nvcc_version_output)
  if m is not None:
- return tuple(int(x) for x in m.group(0).split("."))
+ major, minor = map(int, m.groups())
+ return (major, minor)
  else:
  raise Exception("Failed to parse NVCC output")

  except Exception as e:
- print(f"Failed to determine CUDA Toolkit version: {e}")
+ print(f"Warning: Failed to determine CUDA Toolkit version: {e}")
+ return MIN_CTK_VERSION


  def quote(path):
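
Net effect of these two hunks: `get_cuda_toolkit_version` now returns a `(major, minor)` tuple in every case, printing a warning and falling back to `MIN_CTK_VERSION` instead of returning `None` when parsing fails. A minimal standalone sketch of the parsing behavior; the function name `parse_ctk_version` and the sample string are illustrative, not from the package:

import re

MIN_CTK_VERSION = (12, 0)  # same fallback constant the module now defines


def parse_ctk_version(nvcc_version_output: str) -> tuple[int, int]:
    # Mirrors the new regex: capture major/minor from "release X.Y" in nvcc's output.
    m = re.search(r"release (\d+)\.(\d+)", nvcc_version_output)
    if m is not None:
        major, minor = map(int, m.groups())
        return (major, minor)
    # The real function also prints a warning before falling back.
    return MIN_CTK_VERSION


sample = "Cuda compilation tools, release 12.8, V12.8.61"  # hypothetical nvcc --version line
assert parse_ctk_version(sample) == (12, 8)
assert parse_ctk_version(sample) >= MIN_CTK_VERSION  # tuple comparison used for the minimum-version check
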
@@ -169,138 +174,363 @@ def add_llvm_bin_to_path(args):
  return True


- def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: Optional[List[str]] = None, mode=None):
- mode = args.mode if (mode is None) else mode
- cuda_home = args.cuda_path
- cuda_cmd = None
+ def _get_architectures_cu12(
+ ctk_version: tuple[int, int], arch: str, target_platform: str, quick_build: bool = False
+ ) -> tuple[list[str], list[str]]:
+ """Get architecture flags for CUDA 12.x."""
+ gencode_opts = []
+ clang_arch_flags = []

- # Add LLVM bin directory to PATH
- add_llvm_bin_to_path(args)
-
- if args.quick or cu_path is None:
- cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=0"
+ if quick_build:
+ gencode_opts = ["-gencode=arch=compute_52,code=compute_52", "-gencode=arch=compute_75,code=compute_75"]
+ clang_arch_flags = ["--cuda-gpu-arch=sm_52", "--cuda-gpu-arch=sm_75"]
  else:
- cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=1"
+ if arch == "aarch64" and target_platform == "linux" and ctk_version == (12, 9):
+ # Skip certain architectures for aarch64 with CUDA 12.9 due to CCCL bug
+ print(
+ "[INFO] Skipping sm_52, sm_60, sm_61, and sm_70 targets for ARM due to a CUDA Toolkit bug. "
+ "See https://nvidia.github.io/warp/installation.html#cuda-12-9-limitation-on-linux-arm-platforms "
+ "for details."
+ )
+ else:
+ gencode_opts.extend(
+ [
+ "-gencode=arch=compute_52,code=sm_52", # Maxwell
+ "-gencode=arch=compute_60,code=sm_60", # Pascal
+ "-gencode=arch=compute_61,code=sm_61",
+ "-gencode=arch=compute_70,code=sm_70", # Volta
+ ]
+ )
+ clang_arch_flags.extend(
+ [
+ "--cuda-gpu-arch=sm_52",
+ "--cuda-gpu-arch=sm_60",
+ "--cuda-gpu-arch=sm_61",
+ "--cuda-gpu-arch=sm_70",
+ ]
+ )

- if libs is None:
- libs = []
+ # Desktop architectures
+ gencode_opts.extend(
+ [
+ "-gencode=arch=compute_75,code=sm_75", # Turing
+ "-gencode=arch=compute_75,code=compute_75", # Turing (PTX)
+ "-gencode=arch=compute_80,code=sm_80", # Ampere
+ "-gencode=arch=compute_86,code=sm_86",
+ "-gencode=arch=compute_89,code=sm_89", # Ada
+ "-gencode=arch=compute_90,code=sm_90", # Hopper
+ ]
+ )
+ clang_arch_flags.extend(
+ [
+ "--cuda-gpu-arch=sm_75", # Turing
+ "--cuda-gpu-arch=sm_80", # Ampere
+ "--cuda-gpu-arch=sm_86",
+ "--cuda-gpu-arch=sm_89", # Ada
+ "--cuda-gpu-arch=sm_90", # Hopper
+ ]
+ )

- import pathlib
+ if ctk_version >= (12, 8):
+ gencode_opts.extend(["-gencode=arch=compute_100,code=sm_100", "-gencode=arch=compute_120,code=sm_120"])
+ clang_arch_flags.extend(["--cuda-gpu-arch=sm_100", "--cuda-gpu-arch=sm_120"])

- warp_home_path = pathlib.Path(__file__).parent
- warp_home = warp_home_path.resolve()
+ # Mobile architectures for aarch64 Linux
+ if arch == "aarch64" and target_platform == "linux":
+ gencode_opts.extend(
+ [
+ "-gencode=arch=compute_87,code=sm_87", # Orin
+ "-gencode=arch=compute_53,code=sm_53", # X1
+ "-gencode=arch=compute_62,code=sm_62", # X2
+ "-gencode=arch=compute_72,code=sm_72", # Xavier
+ ]
+ )
+ clang_arch_flags.extend(
+ [
+ "--cuda-gpu-arch=sm_87",
+ "--cuda-gpu-arch=sm_53",
+ "--cuda-gpu-arch=sm_62",
+ "--cuda-gpu-arch=sm_72",
+ ]
+ )

- if args.verbose:
- print(f"Building {dll_path}")
+ # Thor support in CUDA 12.8+
+ if ctk_version >= (12, 8):
+ gencode_opts.append("-gencode=arch=compute_101,code=sm_101") # Thor (CUDA 12 numbering)
+ clang_arch_flags.append("--cuda-gpu-arch=sm_101")
+
+ if ctk_version >= (12, 9):
+ gencode_opts.append("-gencode=arch=compute_121,code=sm_121")
+ clang_arch_flags.append("--cuda-gpu-arch=sm_121")
+
+ # PTX for future hardware (use highest available compute capability)
+ if ctk_version >= (12, 9):
+ gencode_opts.extend(["-gencode=arch=compute_121,code=compute_121"])
+ elif ctk_version >= (12, 8):
+ gencode_opts.extend(["-gencode=arch=compute_120,code=compute_120"])
+ else:
+ gencode_opts.append("-gencode=arch=compute_90,code=compute_90")

- native_dir = os.path.join(warp_home, "native")
+ return gencode_opts, clang_arch_flags

- if cu_path:
- # check CUDA Toolkit version
- min_ctk_version = (11, 5)
- ctk_version = get_cuda_toolkit_version(cuda_home) or min_ctk_version
- if ctk_version < min_ctk_version:
- raise Exception(
- f"CUDA Toolkit version {min_ctk_version[0]}.{min_ctk_version[1]}+ is required (found {ctk_version[0]}.{ctk_version[1]} in {cuda_home})"
+
+ def _get_architectures_cu13(
+ ctk_version: tuple[int, int], arch: str, target_platform: str, quick_build: bool = False
+ ) -> tuple[list[str], list[str]]:
+ """Get architecture flags for CUDA 13.x."""
+ gencode_opts = []
+ clang_arch_flags = []
+
+ if quick_build:
+ gencode_opts = ["-gencode=arch=compute_75,code=compute_75"]
+ clang_arch_flags = ["--cuda-gpu-arch=sm_75"]
+ else:
+ # Desktop architectures
+ gencode_opts.extend(
+ [
+ "-gencode=arch=compute_75,code=sm_75", # Turing
+ "-gencode=arch=compute_75,code=compute_75", # Turing (PTX)
+ "-gencode=arch=compute_80,code=sm_80", # Ampere
+ "-gencode=arch=compute_86,code=sm_86",
+ "-gencode=arch=compute_89,code=sm_89", # Ada
+ "-gencode=arch=compute_90,code=sm_90", # Hopper
+ "-gencode=arch=compute_100,code=sm_100", # Blackwell
+ "-gencode=arch=compute_120,code=sm_120", # Blackwell
+ ]
+ )
+ clang_arch_flags.extend(
+ [
+ "--cuda-gpu-arch=sm_75", # Turing
+ "--cuda-gpu-arch=sm_80", # Ampere
+ "--cuda-gpu-arch=sm_86",
+ "--cuda-gpu-arch=sm_89", # Ada
+ "--cuda-gpu-arch=sm_90", # Hopper
+ "--cuda-gpu-arch=sm_100", # Blackwell
+ "--cuda-gpu-arch=sm_120", # Blackwell
+ ]
+ )
+
+ # Mobile architectures for aarch64 Linux
+ if arch == "aarch64" and target_platform == "linux":
+ gencode_opts.extend(
+ [
+ "-gencode=arch=compute_87,code=sm_87", # Orin
+ "-gencode=arch=compute_110,code=sm_110", # Thor
+ "-gencode=arch=compute_121,code=sm_121", # Spark
+ ]
+ )
+ clang_arch_flags.extend(
+ [
+ "--cuda-gpu-arch=sm_87",
+ "--cuda-gpu-arch=sm_110",
+ "--cuda-gpu-arch=sm_121",
+ ]
  )

- if ctk_version[0] < 12 and args.libmathdx_path:
- print("MathDx support requires at least CUDA 12, skipping")
- args.libmathdx_path = None
+ # PTX for future hardware (use highest available compute capability)
+ gencode_opts.extend(["-gencode=arch=compute_121,code=compute_121"])

- # NVCC gencode options
- gencode_opts = []
+ return gencode_opts, clang_arch_flags

- # Clang architecture flags
- clang_arch_flags = []

- if args.quick:
- # minimum supported architectures (PTX)
- gencode_opts += ["-gencode=arch=compute_52,code=compute_52", "-gencode=arch=compute_75,code=compute_75"]
- clang_arch_flags += ["--cuda-gpu-arch=sm_52", "--cuda-gpu-arch=sm_75"]
+ def _get_architectures_cu12(
+ ctk_version: tuple[int, int], arch: str, target_platform: str, quick_build: bool = False
+ ) -> tuple[list[str], list[str]]:
+ """Get architecture flags for CUDA 12.x."""
+ gencode_opts = []
+ clang_arch_flags = []
+
+ if quick_build:
+ gencode_opts = ["-gencode=arch=compute_52,code=compute_52", "-gencode=arch=compute_75,code=compute_75"]
+ clang_arch_flags = ["--cuda-gpu-arch=sm_52", "--cuda-gpu-arch=sm_75"]
+ else:
+ if arch == "aarch64" and target_platform == "linux" and ctk_version == (12, 9):
+ # Skip certain architectures for aarch64 with CUDA 12.9 due to CCCL bug
+ print(
+ "[INFO] Skipping sm_52, sm_60, sm_61, and sm_70 targets for ARM due to a CUDA Toolkit bug. "
+ "See https://nvidia.github.io/warp/installation.html#cuda-12-9-limitation-on-linux-arm-platforms "
+ "for details."
+ )
  else:
- # generate code for all supported architectures
- gencode_opts += [
- # SASS for supported desktop/datacenter architectures
- "-gencode=arch=compute_52,code=sm_52", # Maxwell
- "-gencode=arch=compute_60,code=sm_60", # Pascal
- "-gencode=arch=compute_61,code=sm_61",
- "-gencode=arch=compute_70,code=sm_70", # Volta
+ gencode_opts.extend(
+ [
+ "-gencode=arch=compute_52,code=sm_52", # Maxwell
+ "-gencode=arch=compute_60,code=sm_60", # Pascal
+ "-gencode=arch=compute_61,code=sm_61",
+ "-gencode=arch=compute_70,code=sm_70", # Volta
+ ]
+ )
+ clang_arch_flags.extend(
+ [
+ "--cuda-gpu-arch=sm_52",
+ "--cuda-gpu-arch=sm_60",
+ "--cuda-gpu-arch=sm_61",
+ "--cuda-gpu-arch=sm_70",
+ ]
+ )
+
+ # Desktop architectures
+ gencode_opts.extend(
+ [
  "-gencode=arch=compute_75,code=sm_75", # Turing
  "-gencode=arch=compute_75,code=compute_75", # Turing (PTX)
  "-gencode=arch=compute_80,code=sm_80", # Ampere
  "-gencode=arch=compute_86,code=sm_86",
+ "-gencode=arch=compute_89,code=sm_89", # Ada
+ "-gencode=arch=compute_90,code=sm_90", # Hopper
  ]
-
- # TODO: Get this working with sm_52, sm_60, sm_61
- clang_arch_flags += [
- # SASS for supported desktop/datacenter architectures
- "--cuda-gpu-arch=sm_52",
- "--cuda-gpu-arch=sm_60",
- "--cuda-gpu-arch=sm_61",
- "--cuda-gpu-arch=sm_70", # Volta
+ )
+ clang_arch_flags.extend(
+ [
  "--cuda-gpu-arch=sm_75", # Turing
  "--cuda-gpu-arch=sm_80", # Ampere
  "--cuda-gpu-arch=sm_86",
+ "--cuda-gpu-arch=sm_89", # Ada
+ "--cuda-gpu-arch=sm_90", # Hopper
  ]
+ )
+
+ if ctk_version >= (12, 8):
+ gencode_opts.extend(["-gencode=arch=compute_100,code=sm_100", "-gencode=arch=compute_120,code=sm_120"])
+ clang_arch_flags.extend(["--cuda-gpu-arch=sm_100", "--cuda-gpu-arch=sm_120"])

- if arch == "aarch64" and sys.platform == "linux":
- gencode_opts += [
- # SASS for supported mobile architectures (e.g. Tegra/Jetson)
+ # Mobile architectures for aarch64 Linux
+ if arch == "aarch64" and target_platform == "linux":
+ gencode_opts.extend(
+ [
+ "-gencode=arch=compute_87,code=sm_87", # Orin
  "-gencode=arch=compute_53,code=sm_53", # X1
  "-gencode=arch=compute_62,code=sm_62", # X2
  "-gencode=arch=compute_72,code=sm_72", # Xavier
- "-gencode=arch=compute_87,code=sm_87", # Orin
  ]
-
- clang_arch_flags += [
- # SASS for supported mobile architectures
- "--cuda-gpu-arch=sm_53", # X1
- "--cuda-gpu-arch=sm_62", # X2
- "--cuda-gpu-arch=sm_72", # Xavier
- "--cuda-gpu-arch=sm_87", # Orin
+ )
+ clang_arch_flags.extend(
+ [
+ "--cuda-gpu-arch=sm_87",
+ "--cuda-gpu-arch=sm_53",
+ "--cuda-gpu-arch=sm_62",
+ "--cuda-gpu-arch=sm_72",
  ]
+ )

- if ctk_version >= (12, 8):
- gencode_opts += ["-gencode=arch=compute_101,code=sm_101"] # Thor (CUDA 12 numbering)
- clang_arch_flags += ["--cuda-gpu-arch=sm_101"]
-
+ # Thor support in CUDA 12.8+
  if ctk_version >= (12, 8):
- # Support for Blackwell is available with CUDA Toolkit 12.8+
- gencode_opts += [
- "-gencode=arch=compute_89,code=sm_89", # Ada
- "-gencode=arch=compute_90,code=sm_90", # Hopper
- "-gencode=arch=compute_100,code=sm_100", # Blackwell
- "-gencode=arch=compute_120,code=sm_120", # Blackwell
- "-gencode=arch=compute_120,code=compute_120", # PTX for future hardware
- ]
+ gencode_opts.append("-gencode=arch=compute_101,code=sm_101") # Thor (CUDA 12 numbering)
+ clang_arch_flags.append("--cuda-gpu-arch=sm_101")
+
+ if ctk_version >= (12, 9):
+ gencode_opts.append("-gencode=arch=compute_121,code=sm_121")
+ clang_arch_flags.append("--cuda-gpu-arch=sm_121")
+
+ # PTX for future hardware (use highest available compute capability)
+ if ctk_version >= (12, 9):
+ gencode_opts.extend(["-gencode=arch=compute_121,code=compute_121"])
+ elif ctk_version >= (12, 8):
+ gencode_opts.extend(["-gencode=arch=compute_120,code=compute_120"])
+ else:
+ gencode_opts.append("-gencode=arch=compute_90,code=compute_90")
+
+ return gencode_opts, clang_arch_flags

- clang_arch_flags += [
- "--cuda-gpu-arch=sm_89", # Ada
- "--cuda-gpu-arch=sm_90", # Hopper
- "--cuda-gpu-arch=sm_100", # Blackwell
- "--cuda-gpu-arch=sm_120", # Blackwell
- ]
- elif ctk_version >= (11, 8):
- # Support for Ada and Hopper is available with CUDA Toolkit 11.8+
- gencode_opts += [
- "-gencode=arch=compute_89,code=sm_89", # Ada
- "-gencode=arch=compute_90,code=sm_90", # Hopper
- "-gencode=arch=compute_90,code=compute_90", # PTX for future hardware
- ]

- clang_arch_flags += [
- "--cuda-gpu-arch=sm_89", # Ada
- "--cuda-gpu-arch=sm_90", # Hopper
+ def _get_architectures_cu13(
+ ctk_version: tuple[int, int], arch: str, target_platform: str, quick_build: bool = False
+ ) -> tuple[list[str], list[str]]:
+ """Get architecture flags for CUDA 13.x."""
+ gencode_opts = []
+ clang_arch_flags = []
+
+ if quick_build:
+ gencode_opts = ["-gencode=arch=compute_75,code=compute_75"]
+ clang_arch_flags = ["--cuda-gpu-arch=sm_75"]
+ else:
+ # Desktop architectures
+ gencode_opts.extend(
+ [
+ "-gencode=arch=compute_75,code=sm_75", # Turing
+ "-gencode=arch=compute_75,code=compute_75", # Turing (PTX)
+ "-gencode=arch=compute_80,code=sm_80", # Ampere
+ "-gencode=arch=compute_86,code=sm_86",
+ "-gencode=arch=compute_89,code=sm_89", # Ada
+ "-gencode=arch=compute_90,code=sm_90", # Hopper
+ "-gencode=arch=compute_100,code=sm_100", # Blackwell
+ "-gencode=arch=compute_120,code=sm_120", # Blackwell
+ ]
+ )
+ clang_arch_flags.extend(
+ [
+ "--cuda-gpu-arch=sm_75", # Turing
+ "--cuda-gpu-arch=sm_80", # Ampere
+ "--cuda-gpu-arch=sm_86",
+ "--cuda-gpu-arch=sm_89", # Ada
+ "--cuda-gpu-arch=sm_90", # Hopper
+ "--cuda-gpu-arch=sm_100", # Blackwell
+ "--cuda-gpu-arch=sm_120", # Blackwell
+ ]
+ )
+
+ # Mobile architectures for aarch64 Linux
+ if arch == "aarch64" and target_platform == "linux":
+ gencode_opts.extend(
+ [
+ "-gencode=arch=compute_87,code=sm_87", # Orin
+ "-gencode=arch=compute_110,code=sm_110", # Thor
+ "-gencode=arch=compute_121,code=sm_121", # Spark
  ]
- else:
- gencode_opts += [
- "-gencode=arch=compute_86,code=compute_86", # PTX for future hardware
+ )
+ clang_arch_flags.extend(
+ [
+ "--cuda-gpu-arch=sm_87",
+ "--cuda-gpu-arch=sm_110",
+ "--cuda-gpu-arch=sm_121",
  ]
+ )

- clang_arch_flags += [
- "--cuda-gpu-arch=sm_86", # PTX for future hardware
- ]
+ # PTX for future hardware (use highest available compute capability)
+ gencode_opts.extend(["-gencode=arch=compute_121,code=compute_121"])
+
+ return gencode_opts, clang_arch_flags
+
+
+ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str] | None = None, mode=None):
+ mode = args.mode if (mode is None) else mode
+ cuda_home = args.cuda_path
+ cuda_cmd = None
+
+ # Add LLVM bin directory to PATH
+ add_llvm_bin_to_path(args)
+
+ if args.quick or cu_path is None:
+ cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=0"
+ else:
+ cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=1"
+
+ if libs is None:
+ libs = []
+
+ import pathlib
+
+ warp_home_path = pathlib.Path(__file__).parent
+ warp_home = warp_home_path.resolve()
+
+ if args.verbose:
+ print(f"Building {dll_path}")
+
+ native_dir = os.path.join(warp_home, "native")
+
+ if cu_path:
+ # check CUDA Toolkit version
+ ctk_version = get_cuda_toolkit_version(cuda_home)
+ if ctk_version < MIN_CTK_VERSION:
+ raise Exception(
+ f"CUDA Toolkit version {MIN_CTK_VERSION[0]}.{MIN_CTK_VERSION[1]}+ is required (found {ctk_version[0]}.{ctk_version[1]} in {cuda_home})"
+ )
+
+ # Get architecture flags based on CUDA version
+ if ctk_version >= (13, 0):
+ gencode_opts, clang_arch_flags = _get_architectures_cu13(ctk_version, arch, sys.platform, args.quick)
+ else:
+ gencode_opts, clang_arch_flags = _get_architectures_cu12(ctk_version, arch, sys.platform, args.quick)

  nvcc_opts = [
  *gencode_opts,
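
The remainder of the hunk (beyond this excerpt) feeds the returned flags into `nvcc_opts`. The refactor splits architecture selection into per-toolkit helpers: the CUDA 13 table starts at sm_75, while the CUDA 12 table still covers Maxwell through Volta, which is why the dispatch keys on `ctk_version >= (13, 0)`. A hedged sketch of exercising that dispatch; `arch_flags_for` is an illustrative wrapper (not part of the package), the helpers are private, and the snippet assumes warp-lang 1.9.1 is installed:

import sys

from warp.build_dll import _get_architectures_cu12, _get_architectures_cu13


def arch_flags_for(ctk_version: tuple[int, int], arch: str, quick: bool = False):
    # Same dispatch as the new build_dll_for_arch: CUDA 13.x toolkits use the cu13 table.
    if ctk_version >= (13, 0):
        return _get_architectures_cu13(ctk_version, arch, sys.platform, quick)
    return _get_architectures_cu12(ctk_version, arch, sys.platform, quick)


gencode_opts, clang_arch_flags = arch_flags_for((12, 8), "x86_64")
print("-gencode=arch=compute_120,code=sm_120" in gencode_opts)  # True: Blackwell SASS with CTK 12.8+
print("--cuda-gpu-arch=sm_90" in clang_arch_flags)              # True: Hopper is in the default (non-quick) set
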