PyPI - warp-lang - Versions diffs - 1.8.1__py3-none-macosx_10_13_universal2.whl → 1.9.0__py3-none-macosx_10_13_universal2.whl - Mend

warp-lang 1.8.1__py3-none-macosx_10_13_universal2.whl → 1.9.0__py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (134)

warp/__init__.py +282 -103
warp/__init__.pyi +482 -110
warp/bin/libwarp-clang.dylib +0 -0
warp/bin/libwarp.dylib +0 -0
warp/build.py +93 -30
warp/build_dll.py +47 -67
warp/builtins.py +955 -137
warp/codegen.py +312 -206
warp/config.py +1 -1
warp/context.py +1249 -784
warp/examples/core/example_marching_cubes.py +1 -0
warp/examples/core/example_render_opengl.py +100 -3
warp/examples/fem/example_apic_fluid.py +98 -52
warp/examples/fem/example_convection_diffusion_dg.py +25 -4
warp/examples/fem/example_diffusion_mgpu.py +8 -3
warp/examples/fem/utils.py +68 -22
warp/fabric.py +1 -1
warp/fem/cache.py +27 -19
warp/fem/domain.py +2 -2
warp/fem/field/nodal_field.py +2 -2
warp/fem/field/virtual.py +264 -166
warp/fem/geometry/geometry.py +5 -5
warp/fem/integrate.py +129 -51
warp/fem/space/restriction.py +4 -0
warp/fem/space/shape/tet_shape_function.py +3 -10
warp/jax_experimental/custom_call.py +1 -1
warp/jax_experimental/ffi.py +2 -1
warp/marching_cubes.py +708 -0
warp/native/array.h +99 -4
warp/native/builtin.h +82 -5
warp/native/bvh.cpp +64 -28
warp/native/bvh.cu +58 -58
warp/native/bvh.h +2 -2
warp/native/clang/clang.cpp +7 -7
warp/native/coloring.cpp +8 -2
warp/native/crt.cpp +2 -2
warp/native/crt.h +3 -5
warp/native/cuda_util.cpp +41 -10
warp/native/cuda_util.h +10 -4
warp/native/exports.h +1842 -1908
warp/native/fabric.h +2 -1
warp/native/hashgrid.cpp +37 -37
warp/native/hashgrid.cu +2 -2
warp/native/initializer_array.h +1 -1
warp/native/intersect.h +2 -2
warp/native/mat.h +1910 -116
warp/native/mathdx.cpp +43 -43
warp/native/mesh.cpp +24 -24
warp/native/mesh.cu +26 -26
warp/native/mesh.h +4 -2
warp/native/nanovdb/GridHandle.h +179 -12
warp/native/nanovdb/HostBuffer.h +8 -7
warp/native/nanovdb/NanoVDB.h +517 -895
warp/native/nanovdb/NodeManager.h +323 -0
warp/native/nanovdb/PNanoVDB.h +2 -2
warp/native/quat.h +331 -14
warp/native/range.h +7 -1
warp/native/reduce.cpp +10 -10
warp/native/reduce.cu +13 -14
warp/native/runlength_encode.cpp +2 -2
warp/native/runlength_encode.cu +5 -5
warp/native/scan.cpp +3 -3
warp/native/scan.cu +4 -4
warp/native/sort.cpp +10 -10
warp/native/sort.cu +22 -22
warp/native/sparse.cpp +8 -8
warp/native/sparse.cu +13 -13
warp/native/spatial.h +366 -17
warp/native/temp_buffer.h +2 -2
warp/native/tile.h +283 -69
warp/native/vec.h +381 -14
warp/native/volume.cpp +54 -54
warp/native/volume.cu +1 -1
warp/native/volume.h +2 -1
warp/native/volume_builder.cu +30 -37
warp/native/warp.cpp +150 -149
warp/native/warp.cu +323 -192
warp/native/warp.h +227 -226
warp/optim/linear.py +736 -271
warp/render/imgui_manager.py +289 -0
warp/render/render_opengl.py +85 -6
warp/sim/graph_coloring.py +2 -2
warp/sparse.py +558 -175
warp/tests/aux_test_module_aot.py +7 -0
warp/tests/cuda/test_async.py +3 -3
warp/tests/cuda/test_conditional_captures.py +101 -0
warp/tests/geometry/test_marching_cubes.py +233 -12
warp/tests/sim/test_coloring.py +6 -6
warp/tests/test_array.py +56 -5
warp/tests/test_codegen.py +3 -2
warp/tests/test_context.py +8 -15
warp/tests/test_enum.py +136 -0
warp/tests/test_examples.py +2 -2
warp/tests/test_fem.py +45 -2
warp/tests/test_fixedarray.py +229 -0
warp/tests/test_func.py +18 -15
warp/tests/test_future_annotations.py +7 -5
warp/tests/test_linear_solvers.py +30 -0
warp/tests/test_map.py +1 -1
warp/tests/test_mat.py +1518 -378
warp/tests/test_mat_assign_copy.py +178 -0
warp/tests/test_mat_constructors.py +574 -0
warp/tests/test_module_aot.py +287 -0
warp/tests/test_print.py +69 -0
warp/tests/test_quat.py +140 -34
warp/tests/test_quat_assign_copy.py +145 -0
warp/tests/test_reload.py +2 -1
warp/tests/test_sparse.py +71 -0
warp/tests/test_spatial.py +140 -34
warp/tests/test_spatial_assign_copy.py +160 -0
warp/tests/test_struct.py +43 -3
warp/tests/test_types.py +0 -20
warp/tests/test_vec.py +179 -34
warp/tests/test_vec_assign_copy.py +143 -0
warp/tests/tile/test_tile.py +184 -18
warp/tests/tile/test_tile_cholesky.py +605 -0
warp/tests/tile/test_tile_load.py +169 -0
warp/tests/tile/test_tile_mathdx.py +2 -558
warp/tests/tile/test_tile_matmul.py +1 -1
warp/tests/tile/test_tile_mlp.py +1 -1
warp/tests/tile/test_tile_shared_memory.py +5 -5
warp/tests/unittest_suites.py +6 -0
warp/tests/walkthrough_debug.py +1 -1
warp/thirdparty/unittest_parallel.py +108 -9
warp/types.py +554 -264
warp/utils.py +68 -86
{warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
{warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/RECORD +131 -121
warp/native/marching.cpp +0 -19
warp/native/marching.cu +0 -514
warp/native/marching.h +0 -19
{warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
{warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
{warp_lang-1.8.1.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0

warp/build.py CHANGED

@@ -51,7 +51,7 @@ def build_cuda(
         output_path = output_path.encode("utf-8")
         if warp.config.llvm_cuda:
-            warp.context.runtime.llvm.compile_cuda(src, cu_path_bytes, inc_path, output_path, False)
+            warp.context.runtime.llvm.wp_compile_cuda(src, cu_path_bytes, inc_path, output_path, False)
         else:
             if ltoirs is None:
@@ -67,7 +67,7 @@ def build_cuda(
                 fatbins
             )
             arr_link_input_types = (ctypes.c_int * num_link)(*link_input_types)
-            err = warp.context.runtime.core.cuda_compile_program(
+            err = warp.context.runtime.core.wp_cuda_compile_program(
                 src,
                 program_name_bytes,
                 arch,
@@ -96,7 +96,7 @@ def load_cuda(input_path, device):
     if not device.is_cuda:
         raise RuntimeError("Not a CUDA device")
-    return warp.context.runtime.core.cuda_load_module(device.context, input_path.encode("utf-8"))
+    return warp.context.runtime.core.wp_cuda_load_module(device.context, input_path.encode("utf-8"))
 def build_cpu(obj_path, cpp_path, mode="release", verify_fp=False, fast_math=False, fuse_fp=True):
@@ -106,7 +106,7 @@ def build_cpu(obj_path, cpp_path, mode="release", verify_fp=False, fast_math=Fal
         inc_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "native").encode("utf-8")
         obj_path = obj_path.encode("utf-8")
-        err = warp.context.runtime.llvm.compile_cpp(
+        err = warp.context.runtime.llvm.wp_compile_cpp(
             src, cpp_path, inc_path, obj_path, mode == "debug", verify_fp, fuse_fp
         )
         if err != 0:
@@ -129,6 +129,15 @@ def init_kernel_cache(path=None):
     else:
         cache_root_dir = appdirs.user_cache_dir(appname="warp", appauthor="NVIDIA", version=warp.config.version)
+        if os.name == "nt" and os.path.isabs(cache_root_dir) and not cache_root_dir.startswith("\\\\?\\"):
+            # Add Windows long-path prefix, accounting for UNC shares.
+            if cache_root_dir.startswith("\\\\"):
+                # UNC path  \\server\share\…  →  \\?\UNC\server\share\…
+                cache_root_dir = "\\\\?\\UNC\\" + cache_root_dir.lstrip("\\")
+            else:
+                # Drive-letter path  C:\…  →  \\?\C:\…
+                cache_root_dir = "\\\\?\\" + cache_root_dir
     warp.config.kernel_cache_dir = cache_root_dir
     os.makedirs(warp.config.kernel_cache_dir, exist_ok=True)
@@ -246,7 +255,12 @@ def _build_lto_base(lto_symbol, compile_func, builder, extra_files=None):
             the cached file data.
     Returns:
-        Tuple containing lto_code_data followed by any extra data from extra_files
+        Tuple where the first element is a success flag (``bool``). The second
+        element is the LTO code as bytes (or ``None`` on failure).
+        If ``extra_files`` is provided, additional elements follow in the same
+        order as the keys in ``extra_files``:
+          - ``".meta"``: int (shared memory bytes).
+          - ``"_fatbin.lto"``: bytes (universal fatbin).
     """
     if extra_files is None:
         extra_files = {}
@@ -283,9 +297,9 @@ def _build_lto_base(lto_symbol, compile_func, builder, extra_files=None):
         if all_files_cached:
             if not extra_files:
-                return (lto_code_data,)
+                return (True, lto_code_data)
             else:
-                return (lto_code_data, *[extra_files[ext] for ext in extra_files.keys()])
+                return (True, lto_code_data, *[extra_files[ext] for ext in extra_files.keys()])
     # Create process-dependent temporary build directory
     build_dir = f"{lto_dir}_p{os.getpid()}"
@@ -303,21 +317,24 @@ def _build_lto_base(lto_symbol, compile_func, builder, extra_files=None):
         for path in temp_file_paths.values():
             if Path(path).exists():
                 Path(path).unlink()
-        raise RuntimeError(f"Failed to compile {lto_symbol}")
-    # Move outputs to cache
-    safe_rename(build_dir, lto_dir)
-    # If build_dir couldn't be moved by a rename, move the outputs one-by-one to lto_dir
-    if os.path.exists(lto_dir):
-        for ext, path in file_paths.items():
-            if not os.path.exists(path):
-                try:
-                    # copy output file to the destination lto dir
-                    os.rename(temp_file_paths[ext], path)
-                except (OSError, FileExistsError):
-                    # another process likely updated the lto dir first
-                    pass
+        outputs[".lto"] = None
+        for ext in extra_files.keys():
+            outputs[ext] = None
+    else:
+        # Move outputs to cache
+        safe_rename(build_dir, lto_dir)
+        # If build_dir couldn't be moved by a rename, move the outputs one-by-one to lto_dir
+        if os.path.exists(lto_dir):
+            for ext, path in file_paths.items():
+                if not os.path.exists(path):
+                    try:
+                        # copy output file to the destination lto dir
+                        os.rename(temp_file_paths[ext], path)
+                    except (OSError, FileExistsError):
+                        # another process likely updated the lto dir first
+                        pass
     # Clean up the temporary build directory
     if build_dir:
@@ -326,9 +343,9 @@ def _build_lto_base(lto_symbol, compile_func, builder, extra_files=None):
         shutil.rmtree(build_dir, ignore_errors=True)
     if not extra_files:
-        return (outputs[".lto"],)
+        return (result, outputs[".lto"])
     else:
-        return (outputs[".lto"], *[outputs[ext] for ext in extra_files.keys()])
+        return (result, outputs[".lto"], *[outputs[ext] for ext in extra_files.keys()])
 def build_lto_dot(M, N, K, adtype, bdtype, cdtype, alayout, blayout, clayout, arch, num_threads, builder):
@@ -372,7 +389,7 @@ def build_lto_dot(M, N, K, adtype, bdtype, cdtype, alayout, blayout, clayout, ar
     lto_symbol = f"dot_{M}_{N}_{K}_{arch}_{num_threads}_{a_arrangement}_{b_arrangement}_{c_arrangement}_{a_prec}_{b_prec}_{c_prec}_{element_type}"
     def compile_lto_dot(temp_paths):
-        result = warp.context.runtime.core.cuda_compile_dot(
+        result = warp.context.runtime.core.wp_cuda_compile_dot(
             temp_paths[".lto"].encode("utf-8"),
             lto_symbol.encode("utf-8"),
             0,
@@ -402,7 +419,13 @@ def build_lto_dot(M, N, K, adtype, bdtype, cdtype, alayout, blayout, clayout, ar
     if lto_symbol in builder.ltoirs:
         lto_code_data = builder.ltoirs[lto_symbol]
     else:
-        (lto_code_data,) = _build_lto_base(lto_symbol, compile_lto_dot, builder, {})
+        (result, lto_code_data) = _build_lto_base(lto_symbol, compile_lto_dot, builder, {})
+        if not result:
+            raise RuntimeError(
+                f"Failed to compile LTO '{lto_symbol}'. "
+                "Set the environment variable LIBMATHDX_LOG_LEVEL=5 and rerun for more details."
+            )
         # Update builder
         builder.ltoirs[lto_symbol] = lto_code_data
@@ -429,6 +452,7 @@ def build_lto_solver(
     num_threads,
     parameter_list,
     builder,
+    smem_estimate_bytes=None,
 ):
     arch = 120 if arch > 121 else arch
@@ -446,7 +470,7 @@ def build_lto_solver(
     def compile_lto_solver(temp_paths):
         # compile LTO
-        result = warp.context.runtime.core.cuda_compile_solver(
+        result = warp.context.runtime.core.wp_cuda_compile_solver(
             temp_paths["_fatbin.lto"].encode("utf-8"),
             temp_paths[".lto"].encode("utf-8"),
             lto_symbol.encode("utf-8"),
@@ -479,10 +503,43 @@ def build_lto_solver(
     if lto_symbol in builder.ltoirs:
         lto_code_data = builder.ltoirs[lto_symbol]
     else:
-        lto_code_data, universal_fatbin_code_data = _build_lto_base(
+        (result, lto_code_data, universal_fatbin_code_data) = _build_lto_base(
             lto_symbol, compile_lto_solver, builder, {"_fatbin.lto": get_cached_lto}
         )
+        if not result:
+            hint = ""
+            if smem_estimate_bytes:
+                max_smem_bytes = 232448
+                max_smem_is_estimate = True
+                for d in warp.get_cuda_devices():
+                    if d.arch == arch:
+                        # We can directly query the max shared memory for this device
+                        queried_bytes = warp.context.runtime.core.wp_cuda_get_max_shared_memory(d.context)
+                        if queried_bytes > 0:
+                            max_smem_bytes = queried_bytes
+                            max_smem_is_estimate = False
+                            break
+                if smem_estimate_bytes > max_smem_bytes:
+                    source = "estimated limit" if max_smem_is_estimate else "device-reported limit"
+                    hint = (
+                        f"Estimated shared memory requirement is {smem_estimate_bytes}B, "
+                        f"but the {source} is {max_smem_bytes}B. "
+                        "The tile size(s) may be too large for this device."
+                    )
+            if warp.context.runtime.toolkit_version < (12, 6):
+                raise RuntimeError(
+                    "cuSolverDx requires CUDA Toolkit 12.6.3 or later. This version of Warp was built against CUDA Toolkit "
+                    f"{warp.context.runtime.toolkit_version[0]}.{warp.context.runtime.toolkit_version[1]}. "
+                    "Upgrade your CUDA Toolkit and rebuild Warp, or install a Warp wheel built with CUDA >= 12.6.3."
+                )
+            else:
+                raise RuntimeError(
+                    f"Failed to compile LTO '{lto_symbol}'. {hint}"
+                    " Set the environment variable LIBMATHDX_LOG_LEVEL=5 and rerun for more details."
+                )
         # Update builder
         builder.ltoirs[lto_symbol] = lto_code_data
         builder.ltoirs_decl[lto_symbol] = f"void {lto_symbol}{parameter_list};"
@@ -499,7 +556,7 @@ def build_lto_fft(arch, size, ept, direction, dir, precision, builder):
     def compile_lto_fft(temp_paths):
         shared_memory_size = ctypes.c_int(0)
-        result = warp.context.runtime.core.cuda_compile_fft(
+        result = warp.context.runtime.core.wp_cuda_compile_fft(
             temp_paths[".lto"].encode("utf-8"),
             lto_symbol.encode("utf-8"),
             0,
@@ -535,10 +592,16 @@ def build_lto_fft(arch, size, ept, direction, dir, precision, builder):
         lto_code_data = builder.ltoirs[lto_symbol]
         shared_memory_bytes = builder.shared_memory_bytes[lto_symbol]
     else:
-        lto_code_data, shared_memory_bytes = _build_lto_base(
+        (result, lto_code_data, shared_memory_bytes) = _build_lto_base(
             lto_symbol, compile_lto_fft, builder, {".meta": lambda path: get_cached_lto_meta(path, lto_symbol)}
         )
+        if not result:
+            raise RuntimeError(
+                f"Failed to compile LTO '{lto_symbol}'."
+                "Set the environment variable LIBMATHDX_LOG_LEVEL=5 and rerun for more details."
+            )
         # Update builder
         builder.ltoirs[lto_symbol] = lto_code_data
         builder.shared_memory_bytes[lto_symbol] = shared_memory_bytes

warp/build_dll.py CHANGED

@@ -13,16 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 import os
 import platform
 import subprocess
 import sys
-from typing import List, Optional
 from warp.utils import ScopedTimer
 verbose_cmd = True  # print command lines before executing them
+MIN_CTK_VERSION = (12, 0)
 def machine_architecture() -> str:
     """Return a canonical machine architecture string.
@@ -120,7 +123,7 @@ def find_host_compiler():
         return run_cmd("which g++").decode()
-def get_cuda_toolkit_version(cuda_home):
+def get_cuda_toolkit_version(cuda_home) -> tuple[int, int]:
     try:
         # the toolkit version can be obtained by running "nvcc --version"
         nvcc_path = os.path.join(cuda_home, "bin", "nvcc")
@@ -128,14 +131,16 @@ def get_cuda_toolkit_version(cuda_home):
         # search for release substring (e.g., "release 11.5")
         import re
-        m = re.search(r"(?<=release )\d+\.\d+", nvcc_version_output)
+        m = re.search(r"release (\d+)\.(\d+)", nvcc_version_output)
         if m is not None:
-            return tuple(int(x) for x in m.group(0).split("."))
+            major, minor = map(int, m.groups())
+            return (major, minor)
         else:
             raise Exception("Failed to parse NVCC output")
     except Exception as e:
-        print(f"Failed to determine CUDA Toolkit version: {e}")
+        print(f"Warning: Failed to determine CUDA Toolkit version: {e}")
+        return MIN_CTK_VERSION
 def quote(path):
@@ -169,7 +174,7 @@ def add_llvm_bin_to_path(args):
     return True
-def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: Optional[List[str]] = None, mode=None):
+def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str] | None = None, mode=None):
     mode = args.mode if (mode is None) else mode
     cuda_home = args.cuda_path
     cuda_cmd = None
@@ -197,17 +202,12 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: Optional[
     if cu_path:
         # check CUDA Toolkit version
-        min_ctk_version = (11, 5)
-        ctk_version = get_cuda_toolkit_version(cuda_home) or min_ctk_version
-        if ctk_version < min_ctk_version:
+        ctk_version = get_cuda_toolkit_version(cuda_home)
+        if ctk_version < MIN_CTK_VERSION:
             raise Exception(
-                f"CUDA Toolkit version {min_ctk_version[0]}.{min_ctk_version[1]}+ is required (found {ctk_version[0]}.{ctk_version[1]} in {cuda_home})"
+                f"CUDA Toolkit version {MIN_CTK_VERSION[0]}.{MIN_CTK_VERSION[1]}+ is required (found {ctk_version[0]}.{ctk_version[1]} in {cuda_home})"
             )
-        if ctk_version[0] < 12 and args.libmathdx_path:
-            print("MathDx support requires at least CUDA 12, skipping")
-            args.libmathdx_path = None
         # NVCC gencode options
         gencode_opts = []
@@ -216,91 +216,71 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: Optional[
         if args.quick:
             # minimum supported architectures (PTX)
-            gencode_opts += ["-gencode=arch=compute_52,code=compute_52", "-gencode=arch=compute_75,code=compute_75"]
-            clang_arch_flags += ["--cuda-gpu-arch=sm_52", "--cuda-gpu-arch=sm_75"]
+            if ctk_version >= (13, 0):
+                gencode_opts += ["-gencode=arch=compute_75,code=compute_75"]
+                clang_arch_flags += ["--cuda-gpu-arch=sm_75"]
+            else:
+                gencode_opts += ["-gencode=arch=compute_52,code=compute_52", "-gencode=arch=compute_75,code=compute_75"]
+                clang_arch_flags += ["--cuda-gpu-arch=sm_52", "--cuda-gpu-arch=sm_75"]
         else:
             # generate code for all supported architectures
             gencode_opts += [
                 # SASS for supported desktop/datacenter architectures
-                "-gencode=arch=compute_52,code=sm_52",  # Maxwell
-                "-gencode=arch=compute_60,code=sm_60",  # Pascal
-                "-gencode=arch=compute_61,code=sm_61",
-                "-gencode=arch=compute_70,code=sm_70",  # Volta
                 "-gencode=arch=compute_75,code=sm_75",  # Turing
                 "-gencode=arch=compute_75,code=compute_75",  # Turing (PTX)
                 "-gencode=arch=compute_80,code=sm_80",  # Ampere
                 "-gencode=arch=compute_86,code=sm_86",
+                "-gencode=arch=compute_89,code=sm_89",  # Ada
+                "-gencode=arch=compute_90,code=sm_90",  # Hopper
             ]
-            # TODO: Get this working with sm_52, sm_60, sm_61
             clang_arch_flags += [
                 # SASS for supported desktop/datacenter architectures
-                "--cuda-gpu-arch=sm_52",
-                "--cuda-gpu-arch=sm_60",
-                "--cuda-gpu-arch=sm_61",
-                "--cuda-gpu-arch=sm_70",  # Volta
                 "--cuda-gpu-arch=sm_75",  # Turing
                 "--cuda-gpu-arch=sm_80",  # Ampere
                 "--cuda-gpu-arch=sm_86",
+                "--cuda-gpu-arch=sm_89",  # Ada
+                "--cuda-gpu-arch=sm_90",  # Hopper
             ]
             if arch == "aarch64" and sys.platform == "linux":
-                gencode_opts += [
-                    # SASS for supported mobile architectures (e.g. Tegra/Jetson)
-                    "-gencode=arch=compute_53,code=sm_53",  # X1
-                    "-gencode=arch=compute_62,code=sm_62",  # X2
-                    "-gencode=arch=compute_72,code=sm_72",  # Xavier
-                    "-gencode=arch=compute_87,code=sm_87",  # Orin
-                ]
-                clang_arch_flags += [
-                    # SASS for supported mobile architectures
-                    "--cuda-gpu-arch=sm_53",  # X1
-                    "--cuda-gpu-arch=sm_62",  # X2
-                    "--cuda-gpu-arch=sm_72",  # Xavier
-                    "--cuda-gpu-arch=sm_87",  # Orin
-                ]
-                if ctk_version >= (12, 8):
-                    gencode_opts += ["-gencode=arch=compute_101,code=sm_101"]  # Thor (CUDA 12 numbering)
-                    clang_arch_flags += ["--cuda-gpu-arch=sm_101"]
+                # SASS for supported mobile architectures (e.g. Tegra/Jetson)
+                gencode_opts += ["-gencode=arch=compute_87,code=sm_87"]  # Orin
+                clang_arch_flags += ["--cuda-gpu-arch=sm_87"]
+                if ctk_version >= (13, 0):
+                    gencode_opts += ["-gencode=arch=compute_110,code=sm_110"]  # Thor
+                    clang_arch_flags += ["--cuda-gpu-arch=sm_110"]
+                else:
+                    gencode_opts += [
+                        "-gencode=arch=compute_53,code=sm_53",  # X1
+                        "-gencode=arch=compute_62,code=sm_62",  # X2
+                        "-gencode=arch=compute_72,code=sm_72",  # Xavier
+                    ]
+                    clang_arch_flags += [
+                        "--cuda-gpu-arch=sm_53",
+                        "--cuda-gpu-arch=sm_62",
+                        "--cuda-gpu-arch=sm_72",
+                    ]
+                    if ctk_version >= (12, 8):
+                        gencode_opts += ["-gencode=arch=compute_101,code=sm_101"]  # Thor (CUDA 12 numbering)
+                        clang_arch_flags += ["--cuda-gpu-arch=sm_101"]
             if ctk_version >= (12, 8):
                 # Support for Blackwell is available with CUDA Toolkit 12.8+
                 gencode_opts += [
-                    "-gencode=arch=compute_89,code=sm_89",  # Ada
-                    "-gencode=arch=compute_90,code=sm_90",  # Hopper
                     "-gencode=arch=compute_100,code=sm_100",  # Blackwell
                     "-gencode=arch=compute_120,code=sm_120",  # Blackwell
                     "-gencode=arch=compute_120,code=compute_120",  # PTX for future hardware
                 ]
                 clang_arch_flags += [
-                    "--cuda-gpu-arch=sm_89",  # Ada
-                    "--cuda-gpu-arch=sm_90",  # Hopper
                     "--cuda-gpu-arch=sm_100",  # Blackwell
                     "--cuda-gpu-arch=sm_120",  # Blackwell
                 ]
-            elif ctk_version >= (11, 8):
-                # Support for Ada and Hopper is available with CUDA Toolkit 11.8+
-                gencode_opts += [
-                    "-gencode=arch=compute_89,code=sm_89",  # Ada
-                    "-gencode=arch=compute_90,code=sm_90",  # Hopper
-                    "-gencode=arch=compute_90,code=compute_90",  # PTX for future hardware
-                ]
-                clang_arch_flags += [
-                    "--cuda-gpu-arch=sm_89",  # Ada
-                    "--cuda-gpu-arch=sm_90",  # Hopper
-                ]
             else:
-                gencode_opts += [
-                    "-gencode=arch=compute_86,code=compute_86",  # PTX for future hardware
-                ]
-                clang_arch_flags += [
-                    "--cuda-gpu-arch=sm_86",  # PTX for future hardware
-                ]
+                gencode_opts += ["-gencode=arch=compute_90,code=compute_90"]  # PTX for future hardware
         nvcc_opts = [
             *gencode_opts,