returnn 1.20251027.232712__py3-none-any.whl → 1.20260119.15400__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- returnn/PKG-INFO +2 -2
- returnn/__old_mod_loader__.py +26 -2
- returnn/_setup_info_generated.py +2 -2
- returnn/datasets/lm.py +130 -42
- returnn/datasets/meta.py +93 -43
- returnn/datasets/postprocessing.py +597 -108
- returnn/datasets/util/vocabulary.py +90 -0
- returnn/frontend/__init__.py +1 -0
- returnn/frontend/_backend.py +41 -0
- returnn/frontend/_native/__init__.py +22 -0
- returnn/frontend/_numpy_backend.py +7 -0
- returnn/frontend/_utils.py +1 -1
- returnn/frontend/array_.py +48 -2
- returnn/frontend/assert_.py +35 -0
- returnn/frontend/attention.py +54 -20
- returnn/frontend/conv.py +273 -54
- returnn/frontend/device.py +14 -1
- returnn/frontend/encoder/conformer.py +20 -0
- returnn/frontend/encoder/transformer.py +2 -0
- returnn/frontend/loss.py +222 -3
- returnn/frontend/math_.py +54 -14
- returnn/native_op.cpp +182 -172
- returnn/native_op.py +36 -31
- returnn/sprint/cache.py +12 -13
- returnn/tensor/_dim_extra.py +7 -7
- returnn/tensor/_tensor_extra.py +10 -10
- returnn/tensor/utils.py +8 -5
- returnn/tf/frontend_layers/_backend.py +7 -3
- returnn/tf/layers/basic.py +27 -40
- returnn/tf/native_op.py +27 -63
- returnn/tf/network.py +1 -1
- returnn/tf/util/basic.py +22 -197
- returnn/torch/engine.py +157 -6
- returnn/torch/frontend/_backend.py +280 -29
- returnn/torch/frontend/bridge.py +61 -0
- returnn/torch/frontend/compile_helper.py +106 -0
- returnn/torch/util/array_.py +30 -0
- returnn/torch/util/assert_.py +122 -0
- returnn/torch/util/exception_helper.py +7 -1
- returnn/torch/util/native_op.py +885 -0
- returnn/torch/util/native_op_code_compiler.py +308 -0
- returnn/util/basic.py +6 -7
- returnn/util/better_exchook.py +4 -0
- returnn/util/cuda_env.py +332 -0
- returnn/util/debug.py +12 -2
- returnn/util/file_cache.py +15 -1
- returnn/util/fsa.py +17 -13
- returnn/util/native_code_compiler.py +104 -47
- returnn/util/task_system.py +1 -1
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/METADATA +2 -2
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/RECORD +54 -48
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/WHEEL +1 -1
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/LICENSE +0 -0
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/top_level.txt +0 -0
returnn/torch/util/native_op_code_compiler.py
ADDED

@@ -0,0 +1,308 @@
+"""
+Helper to compile Torch ops on-the-fly, similar to Theano / :class:`returnn.tf.util.basic.OpCodeCompiler`,
+similar to :mod:`torch.utils.cpp_extension`.
+
+See :class:`OpCodeCompiler`.
+"""
+
+from __future__ import annotations
+from typing import Union, Optional, Sequence, Dict, List
+import os
+import sysconfig
+
+import torch
+from torch.utils import cpp_extension
+
+from returnn.util.basic import NativeCodeCompiler
+from returnn.util.cuda_env import CudaEnv as _CudaEnvBase, get_best_nvcc_path_for_cuda_version
+
+
+class OpCodeCompiler(NativeCodeCompiler):
+    """
+    Helper class to compile Torch ops on-the-fly, similar to Theano,
+    and similar to :class:`returnn.tf.util.basic.OpCodeCompiler`.
+
+    Note that PyTorch already has its own code for this,
+    see :mod:`torch.utils.cpp_extension`, :func:`torch.utils.cpp_extension.load_inline`, etc.
+    However, there are some shortcomings there that we try to do better:
+
+    * The way we find CUDA/nvcc is more robust.
+    * The way we find the C/C++ compiler is more robust.
+    * The automatic selection of options for nvcc is more robust.
+      E.g. the compute version is not higher than what the selected CUDA supports.
+
+    https://docs.pytorch.org/tutorials/advanced/cpp_custom_ops.html
+    """
+
+    CacheDirName = "returnn_torch_cache/ops"
+
+    def __init__(
+        self,
+        base_name: str,
+        *,
+        code: str,
+        use_cuda_if_available: bool = True,
+        cuda_auto_min_compute_capability: bool = True,
+        include_paths: Sequence[str] = (),
+        ld_flags: Sequence[str] = (),
+        c_macro_defines: Optional[Dict[str, Union[str, int, None]]] = None,
+        is_python_module: bool = False,
+        **kwargs,
+    ):
+        self._cuda_env = None
+        if use_cuda_if_available and torch.cuda.is_available():
+            self._cuda_env = CudaEnv.get_instance()
+            # Currently we assume that if we provide CUDA code (thus set use_cuda_if_available=True),
+            # that if there is a GPU available (as TF reports it),
+            # we also expect that we find CUDA.
+            # Otherwise you would end up with ops compiled for CPU only although they support CUDA
+            # and the user expects them to run on GPU.
+            assert self._with_cuda(), "OpCodeCompiler: use_cuda_if_available=True but no CUDA found"
+
+        self._nvcc_opts = []
+        if self._with_cuda() and cuda_auto_min_compute_capability:
+            # Get CUDA compute capability of the current GPU device.
+            min_compute_capability = _get_available_gpu_cuda_min_compute_capability()
+            if min_compute_capability:
+                min_compute_capability = min(min_compute_capability, self._cuda_env.get_max_compute_capability())
+                self._nvcc_opts += ["-arch", "compute_%i" % int(min_compute_capability * 10)]
+
+        if self._with_cuda():
+            self._nvcc_opts += cpp_extension.COMMON_NVCC_FLAGS
+
+        # Example call from torch.utils.cpp_extension:
+        # /usr/local/cuda-11.0/bin/nvcc
+        # --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d
+        # -DTORCH_EXTENSION_NAME=async_assert_ext
+        # -DTORCH_API_INCLUDE_EXTENSION_H
+        # -isystem /home/az/py-venv/py3.12-torch2.9/lib/python3.12/site-packages/torch/include
+        # -isystem /home/az/py-venv/py3.12-torch2.9/lib/python3.12/site-packages/torch/include/torch/csrc/api/include
+        # -isystem /usr/local/cuda-11.0/include -isystem /usr/include/python3.12
+        # -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__
+        # -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__
+        # --expt-relaxed-constexpr
+        # -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86
+        # --compiler-options '-fPIC' -std=c++17
+        # -c /home/az/.cache/torch_extensions/py312_cu128/async_assert_ext/cuda.cu -o cuda.cuda.o
+
+        torch_path = os.path.dirname(torch.__file__)
+        torch_include = torch_path + "/include"
+        assert os.path.isdir(torch_include)
+
+        python_include = sysconfig.get_path("include", scheme="posix_prefix")
+
+        include_paths = list(include_paths) + [torch_include, torch_include + "/torch/csrc/api/include", python_include]
+
+        c_macro_defines = {} if c_macro_defines is None else c_macro_defines.copy()
+        c_macro_defines.setdefault("TORCH_EXTENSION_NAME", base_name)
+        c_macro_defines.setdefault("TORCH_API_INCLUDE_EXTENSION_H", "")
+        # We have some assert in our kernels that we want to disable.
+        c_macro_defines.setdefault("NDEBUG", 1)
+
+        ld_flags = list(ld_flags)
+        ld_flags.append("--no-as-needed")
+        ld_flags.append(f"-L{cpp_extension.TORCH_LIB_PATH}")
+        ld_flags.append("-lc10")
+        if self._with_cuda():
+            ld_flags.append("-lc10_cuda")
+        ld_flags.append("-ltorch_cpu")
+        if self._with_cuda():
+            ld_flags.append("-ltorch_cuda")
+        ld_flags.append("-ltorch")
+        ld_flags.append("-ltorch_python")
+
+        if self._with_cuda():
+            ld_flags.append(self._cuda_env.get_ld_flag_for_linking_cudart())
+            # maybe add CUDNN?
+
+        # noinspection PyUnresolvedReferences,PyProtectedMember
+        use_cxx11_abi = torch._C._GLIBCXX_USE_CXX11_ABI
+
+        super().__init__(
+            base_name=base_name,
+            code=code,
+            include_paths=include_paths,
+            c_macro_defines=c_macro_defines,
+            ld_flags=ld_flags,
+            use_cxx11_abi=use_cxx11_abi,
+            **kwargs,
+        )
+        self.is_python_module = is_python_module
+        self._mod = None

+    def __repr__(self):
+        return "<%s %r CUDA %s in %r>" % (self.__class__.__name__, self.base_name, self._with_cuda(), self._mod_path)
+
+    _relevant_info_keys = NativeCodeCompiler._relevant_info_keys + (
+        "torch_version",
+        "with_cuda",
+        "cuda_path",
+        "nvcc_opts",
+    )
+
+    def _make_info_dict(self):
+        from returnn.util.basic import describe_torch_version
+
+        d = super()._make_info_dict()
+        d.update(
+            {
+                "torch_version": describe_torch_version(),
+                "with_cuda": self._with_cuda(),
+                "cuda_path": self._cuda_env.cuda_path if self._with_cuda() else None,
+                "nvcc_opts": (
+                    (tuple(self._cuda_env.get_compiler_opts()) + tuple(self._nvcc_opts)) if self._with_cuda() else None
+                ),
+            }
+        )
+        return d
+
+    @classmethod
+    def cuda_available(cls):
+        """
+        :return: whether CUDA is available. if True, and you initiate with use_cuda_if_available=True,
+            then _with_cuda() should also be True.
+        :rtype: bool
+        """
+        if not torch.cuda.is_available():
+            return False
+        cuda_env = CudaEnv.get_instance()
+        return cuda_env.is_available()
+
+    def _with_cuda(self):
+        return bool(self._cuda_env and self._cuda_env.is_available())
+
+    cpp_version = 17
+
+    def _get_compiler_bin(self):
+        if self._with_cuda():
+            return self._cuda_env.get_compiler_bin()
+        return super()._get_compiler_bin()
+
+    def _transform_compiler_opts(self, opts: List[str]) -> List[str]:
+        if self._with_cuda():
+            nvcc_opts = self._cuda_env.get_compiler_opts()
+            for opt in opts:
+                nvcc_opts += ["-Xcompiler", opt]
+            nvcc_opts += self._nvcc_opts
+            return nvcc_opts
+        return super()._transform_compiler_opts(opts)
+
+    def _transform_ld_flags(self, opts: Sequence[str]) -> Sequence[str]:
+        if self._with_cuda():
+            res = []
+            for opt in opts:
+                if opt.startswith("-L") or opt.startswith("-l"):
+                    res.append(opt)
+                else:
+                    res += ["-Xlinker", opt]
+            return res
+        return super()._transform_ld_flags(opts)
+
+    def load_module(self):
+        """
+        :return: module
+        """
+        if self._mod:
+            return self._mod
+        self._maybe_compile()
+
+        if self.is_python_module:
+            # Load as a Python module.
+            # E.g. PYBIND11_MODULE or so was used in the code.
+            import importlib.util
+
+            # https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path
+            spec = importlib.util.spec_from_file_location(self.base_name, self._so_filename)
+            assert spec is not None
+            module = importlib.util.module_from_spec(spec)
+            assert isinstance(spec.loader, importlib.abc.Loader)
+            spec.loader.exec_module(module)
+
+        else:
+            # Load as a Torch extension.
+            # TORCH_LIBRARY / TORCH_LIBRARY_IMPL was used in the code.
+            torch.ops.load_library(self._so_filename)
+            module = getattr(torch.ops, self.base_name)
+
+        self._mod = module
+        return module
+
+
+class CudaEnv(_CudaEnvBase):
+    """specialized CudaEnv for PyTorch"""
+
+    # If cudart is loaded (e.g. via Torch), we really want to use that one.
+    _runtime_libcudart_path_must_be_valid = True
+
+    def __init__(self):
+        super().__init__()
+
+        from returnn.util.basic import find_libcudart_from_runtime
+
+        self._runtime_libcudart = find_libcudart_from_runtime()
+        self._compiler_bin = None
+        if self.cuda_path:
+            if os.path.exists(f"{self.cuda_path}/bin/nvcc"):
+                self._compiler_bin = f"{self.cuda_path}/bin/nvcc"
+            else:
+                self._compiler_bin = get_best_nvcc_path_for_cuda_version(self.get_cuda_version())
+
+    @classmethod
+    def _check_valid_cuda_path(cls, p: str) -> bool:
+        """
+        :param p: path to CUDA, e.g. "/usr/local/cuda-8.0"
+        :return: whether this is a valid CUDA path, i.e. we find all what we need
+        """
+        if cls.verbose_find_cuda:
+            print("check valid CUDA path: %s" % p)
+        # Don't check nvcc here yet.
+        # The pip package might not have it, but otherwise provides lib + headers
+        # that we want to use, as this is likely the same that PyTorch uses.
+        if not os.path.exists("%s/include/cuda.h" % p):
+            return False
+        if p.endswith("/site-packages/nvidia/cuda_runtime"):
+            # special case for the nvidia CUDA pip package
+            if not any(name.startswith("libcudart.") for name in os.listdir(p + "/lib")):
+                return False
+        else:
+            if not os.path.exists("%s/%s/libcudart.so" % (p, cls._get_lib_dir_name(p))):
+                return False
+        return True
+
+    def get_lib_dir_path(self) -> str:
+        """
+        :return: path
+        """
+        if self._runtime_libcudart:
+            return os.path.dirname(self._runtime_libcudart)
+        return super().get_lib_dir_path()
+
+    def get_ld_flag_for_linking_cudart(self) -> str:
+        """ld flag"""
+        if self._runtime_libcudart:
+            return f"-l:{os.path.basename(self._runtime_libcudart)}"
+        return "-lcudart"
+
+    def get_compiler_bin(self) -> str:
+        """
+        :return: path
+        """
+        return self._compiler_bin
+
+
+def _get_available_gpu_cuda_min_compute_capability() -> Optional[float]:
+    """
+    Uses :func:`get_available_gpu_devices`.
+
+    :return: e.g. 3.0, or 5.0, etc, or None
+    """
+    count = torch.cuda.device_count()
+    cap = None
+    for i in range(count):
+        props = torch.cuda.get_device_properties(i)
+        dev_cap = float(f"{props.major}.{props.minor}")
+        if cap is None:
+            cap = dev_cap
+        else:
+            cap = min(cap, dev_cap)
+    return cap
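For orientation, a minimal usage sketch of the new OpCodeCompiler (the extension name demo_add_one and the kernel are hypothetical, not taken from the package; with TORCH_LIBRARY in the code and a matching base_name, load_module() resolves the ops via torch.ops, as the class does above):

    # Sketch only: compile a tiny CPU-only custom op and load it through torch.ops.
    from returnn.torch.util.native_op_code_compiler import OpCodeCompiler

    _code = """
    #include <torch/extension.h>
    static torch::Tensor add_one(torch::Tensor x) { return x + 1; }
    TORCH_LIBRARY(demo_add_one, m) { m.def("add_one", add_one); }
    """

    compiler = OpCodeCompiler(base_name="demo_add_one", code=_code, use_cuda_if_available=False)
    ops = compiler.load_module()  # compiles if needed, then returns torch.ops.demo_add_one

With is_python_module=True, the code would instead define a PYBIND11_MODULE, and load_module() imports the built .so as a regular Python module.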
returnn/util/basic.py
CHANGED

@@ -365,12 +365,9 @@ def get_checkpoint_filepattern(filepath):
     :return: CheckpointLoader compatible filepattern
     :rtype: str
     """
-    if filepath.endswith(".meta"):
-        return filepath[: -len(".meta")]
-    elif filepath.endswith(".index"):
-        return filepath[: -len(".index")]
-    elif filepath.endswith(".pt"):
-        return filepath[: -len(".pt")]
+    for ext in [".meta", ".index", ".pt"]:
+        if filepath.endswith(ext):
+            return filepath[: -len(ext)]
     return filepath


@@ -3819,6 +3816,8 @@ def should_write_to_disk(config):
         return False
     if config.is_true("dry_run"):
         return False
+    if config.is_true("torch_profile"):
+        return False
     return True


@@ -4505,7 +4504,7 @@ _find_libcudart_from_runtime_cached = None
 def find_libcudart_from_runtime():
     """
     Looks through all libs via :func:`collect_proc_maps_exec_files`,
-    and searches for
+    and searches for libcudart.
     Currently only works on Linux (because collect_proc_maps_exec_files).

     :return: list of libs (their path)
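The get_checkpoint_filepattern rewrite above is behavior-preserving; a quick sketch of the consolidated extension stripping (filenames illustrative):

    get_checkpoint_filepattern("model.080.pt")     # -> "model.080"
    get_checkpoint_filepattern("model.080.index")  # -> "model.080"
    get_checkpoint_filepattern("model.080")        # -> "model.080" (no known extension; returned unchanged)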
returnn/util/better_exchook.py
CHANGED

@@ -1093,6 +1093,7 @@ def format_tb(
     with_color=None,
     with_vars=None,
     clear_frames=True,
+    colorize=None,
 ):
     """
     Formats a traceback into a list of strings, each corresponding to one frame.

@@ -1110,11 +1111,14 @@ def format_tb(
         That will potentially fix some mem leaks regarding locals, so it can be important.
         Also see https://github.com/python/cpython/issues/113939.
         However, any further access to frame locals will not work (e.g., if you want to use a debugger afterward).
+    :param colorize: for compat with Python >=3.13, currently ignored
     :return: list of strings, each corresponding to one frame in the traceback.
         Each string contains the file name, line number, function name, source code line, maybe relevant variables,
         etc., and a final newline.
     :rtype: list[str]
     """
+    if colorize is not None and with_color is None:
+        with_color = colorize
     color = Color(enable=with_color)
     output = _OutputLinesCollector(color=color)
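Per the hunk above, the new colorize parameter mirrors the keyword that Python >=3.13 traceback formatting passes along; when with_color is not given explicitly, it is forwarded to with_color. Illustrative call (tb being some traceback object):

    format_tb(tb, colorize=True)  # now behaves like format_tb(tb, with_color=True)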
returnn/util/cuda_env.py
ADDED

@@ -0,0 +1,332 @@
+"""
+CUDA environment detection and information.
+"""
+
+from __future__ import annotations
+from typing import Dict, Tuple, List
+import os
+import re
+
+
+class CudaEnv:
+    """
+    Information about the Nvidia CUDA environment, and library.
+    Also path to ``nvcc``, the CUDA compiler.
+    """
+
+    _instance_per_cls: Dict[type, CudaEnv] = {}
+    verbose_find_cuda = False
+
+    def __init__(self):
+        from returnn.util.basic import to_bool
+
+        if to_bool(os.environ.get("DISABLE_CUDA", "0")):
+            self.cuda_path = None
+            if self.verbose_find_cuda:
+                print("CUDA disabled via env DISABLE_CUDA.")
+        elif os.environ.get("CUDA_VISIBLE_DEVICES", None) in ["", "-1"]:
+            self.cuda_path = None
+            if self.verbose_find_cuda:
+                print(f"CUDA disabled via env CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']!r}.")
+        else:
+            self.cuda_path = self._find_cuda_path()
+            if self.verbose_find_cuda:
+                print("CUDA path:", self.cuda_path)
+
+        self._max_compute_capability = None
+        self._cuda_version = None
+
+    @classmethod
+    def _find_nvcc_in_path(cls):
+        """
+        :return: yields full path to nvcc
+        :rtype: list[str]
+        """
+        for p in os.environ["PATH"].split(":"):
+            pp = "%s/nvcc" % p
+            if os.path.exists(pp):
+                yield pp
+
+    @classmethod
+    def _find_lib_in_ld_path(cls):
+        """
+        :return: yields full path to libcudart.so
+        :rtype: list[str]
+        """
+        from returnn.util.basic import get_ld_paths
+
+        for p in get_ld_paths():
+            pp = "%s/libcudart.so" % p
+            if os.path.exists(pp):
+                yield pp
+
+    @classmethod
+    def _get_lib_dir_name(cls, base_path):
+        """
+        :return: dir name in base path
+        :rtype: str
+        """
+        from returnn.util.basic import is_64bit_platform, get_ld_paths
+
+        for ld_path in get_ld_paths():
+            # We also want to allow "lib/x86_64-linux-gnu" for "/usr".
+            # However, this logic should not be triggered for incorrect cases.
+            # E.g. base_path="/usr" would be the prefix for most LD paths.
+            if ld_path.startswith(base_path + "/lib") and os.path.exists("%s/libcudart.so" % ld_path):
+                return ld_path[len(base_path) + 1 :]
+        if is_64bit_platform():
+            return "lib64"
+        return "lib"
+
+    _runtime_libcudart_path_must_be_valid: bool = False
+
+    @classmethod
+    def _cuda_path_candidate_via_proc_map_libcudart(cls):
+        from returnn.util.basic import find_libcudart_from_runtime
+
+        fn = find_libcudart_from_runtime()
+        if cls.verbose_find_cuda:
+            print("libcudart.so found from /proc/maps:", fn)
+        if not fn:
+            return None
+        # fn is e.g. '/usr/local/cuda-8.0/targets/x86_64-linux/lib/libcudart.so.8.0.61',
+        # or maybe '/usr/local/cuda-8.0/lib64/libcudart.so'
+        # or maybe ".../site-packages/nvidia/cuda_runtime/lib/libcudart.so.12"
+        # or ".../site-packages/nvidia/cu13/lib/libcudart.so.13"
+        p = os.path.dirname(os.path.dirname(fn))
+        while not cls._check_valid_cuda_path(p):
+            p = os.path.dirname(p)
+            if p in ["", "/"]:
+                if cls.verbose_find_cuda:
+                    print(f"Loaded lib {fn} does not seem to be in valid CUDA path.")
+                assert not cls._runtime_libcudart_path_must_be_valid
+                return None
+        assert cls._check_valid_cuda_path(p)
+        return p
+
+    @classmethod
+    def _cuda_path_candidates(cls):
+        p = cls._cuda_path_candidate_via_proc_map_libcudart()
+        if p:
+            yield p
+        if os.environ.get("CUDA_HOME"):
+            yield os.environ.get("CUDA_HOME")
+        if os.environ.get("CUDA_PATH"):
+            yield os.environ.get("CUDA_PATH")
+        for p in cls._find_nvcc_in_path():
+            # Expect p == "/usr/local/cuda-8.0/bin/nvcc" or so.
+            postfix = "/bin/nvcc"
+            if cls.verbose_find_cuda:
+                print("found cuda nvcc (wanted postfix: %r): %s" % (postfix, p))
+            if not p.endswith(postfix):
+                continue
+            yield p[: -len(postfix)] or "/"
+        for p in cls._find_lib_in_ld_path():
+            # Expect p == "/usr/local/cuda-8.0/lib64/libcudart.so" or so.
+            d = "/".join(p.split("/")[:-2]) or "/"  # Get "/usr/local/cuda-8.0".
+            if cls.verbose_find_cuda:
+                print("found cuda lib: %s (path %s)" % (p, d))
+            yield d
+        # Check common installation location.
+        for path in get_cuda_path_candidates_from_common_install_locations():
+            yield path
+
+    @classmethod
+    def _check_valid_cuda_path(cls, p):
+        """
+        :param str p: path to CUDA, e.g. "/usr/local/cuda-8.0"
+        :return: whether this is a valid CUDA path, i.e. we find all what we need
+        :rtype: bool
+        """
+        if cls.verbose_find_cuda:
+            print("check valid CUDA path: %s" % p)
+        if not os.path.exists("%s/bin/nvcc" % p):
+            return False
+        if not os.path.exists("%s/include/cuda.h" % p):
+            return False
+        if not os.path.exists("%s/%s/libcudart.so" % (p, cls._get_lib_dir_name(p))):
+            return False
+        return True
+
+    @classmethod
+    def _find_cuda_path(cls):
+        """
+        :return: base CUDA path if we find one, otherwise None
+        :rtype: str|None
+        """
+        for p in cls._cuda_path_candidates():
+            if cls._check_valid_cuda_path(p):
+                return p
+        return None
+
+    def is_available(self):
+        """
+        :rtype: bool
+        """
+        return bool(self.cuda_path)
+
+    def get_cuda_version(self) -> Tuple[int, int]:
+        """
+        Get CUDA version as (major, minor).
+        """
+        if self._cuda_version:
+            return self._cuda_version
+        assert self.cuda_path
+        # Parse CUDA_VERSION from cuda.h.
+        cuda_h_path = f"{self.cuda_path}/include/cuda.h"
+        self._cuda_version = _parse_cuda_version_from_cuda_h(cuda_h_path)
+        return self._cuda_version
+
+    def get_max_compute_capability(self):
+        """
+        :return: the highest compute capability supported by nvcc, or float("inf") if not known
+        :rtype: float
+        """
+        if self._max_compute_capability is None:
+            cuda_occupancy_path = "%s/include/cuda_occupancy.h" % self.cuda_path
+            if os.path.exists(cuda_occupancy_path):
+                major, minor = None, 0
+                for line in open(cuda_occupancy_path).read().splitlines():
+                    m = re.match("^#define\\s+__CUDA_OCC_(MAJOR|MINOR)__\\s+([0-9]+)$", line)
+                    if m:
+                        s, v = m.groups()
+                        v = int(v)
+                        if s == "MAJOR":
+                            major = v
+                        else:
+                            minor = v
+                if major:
+                    self._max_compute_capability = float(major) + float(minor) * 0.1
+            if self._max_compute_capability is None:
+                self._max_compute_capability = float("inf")
+        return self._max_compute_capability
+
+    @staticmethod
+    def get_cc_bin() -> str:
+        """
+        :return: path
+        """
+        from .native_code_compiler import get_cc_bin
+
+        return get_cc_bin()
+
+    def get_compiler_opts(self):
+        """
+        :rtype: list[str]
+        """
+        return [
+            "-ccbin",
+            self.get_cc_bin(),
+            "-I",
+            "%s/targets/x86_64-linux/include" % self.cuda_path,
+            "-I",
+            "%s/include" % self.cuda_path,
+            "-L",
+            self.get_lib_dir_path(),
+            "-x",
+            "cu",
+            "-v",
+        ]
+
+    def get_lib_dir_path(self) -> str:
+        """library path"""
+        return "%s/%s" % (self.cuda_path, self._get_lib_dir_name(self.cuda_path))
+
+    def get_compiler_bin(self):
+        """
+        :return: path
+        :rtype: str
+        """
+        assert self.cuda_path
+        return "%s/bin/nvcc" % self.cuda_path
+
+    @classmethod
+    def get_instance(cls) -> CudaEnv:
+        """
+        :return: instance for this class
+        """
+        if cls._instance_per_cls.get(cls) is not None:
+            return cls._instance_per_cls[cls]
+        cls._instance_per_cls[cls] = cls()
+        return cls._instance_per_cls[cls]
+
+
+def get_cuda_path_candidates_from_common_install_locations() -> List[str]:
+    """
+    :return: list of possible CUDA installation paths from common locations
+    """
+    cuda_paths = []
+
+    if os.path.exists("/usr/local"):
+        for name in sorted(os.listdir("/usr/local")):
+            if name.startswith("cuda-") or name == "cuda":
+                p = f"/usr/local/{name}"
+                if _check_valid_cuda_path_with_nvcc(p):
+                    version = _parse_cuda_version_from_cuda_h(f"{p}/include/cuda.h")
+                    cuda_paths.append((version, p))
+
+    # (stable) sort by version, highest version first
+    cuda_paths.sort(key=lambda x: x[0], reverse=True)
+    return [p for (_, p) in cuda_paths]
+
+
+def get_best_nvcc_path_for_cuda_version(cuda_version: Tuple[int, int]) -> str:
+    """
+    :return: path to nvcc
+    :rtype: str
+    """
+    cuda_paths = []
+
+    # noinspection PyProtectedMember
+    for p in CudaEnv._cuda_path_candidates():
+        if _check_valid_cuda_path_with_nvcc(p):
+            version = _parse_cuda_version_from_cuda_h(f"{p}/include/cuda.h")
+            if version == cuda_version:
+                # if we found a matching one, directly return it
+                return f"{p}/bin/nvcc"
+            cuda_paths.append((version, p))
+
+    if not cuda_paths:
+        raise RuntimeError(f"No valid CUDA installation found for version {cuda_version}.")
+
+    only_higher_versions = [(version, p) for (version, p) in cuda_paths if version >= cuda_version]
+    if only_higher_versions:
+        only_higher_versions.sort(key=lambda x: x[0])
+        # return the lowest higher version
+        if only_higher_versions[0][0] != cuda_version[0]:  # major version differs
+            print(
+                f"Warning: No exact match for CUDA version {cuda_version}, "
+                f"using version {only_higher_versions[0]} instead."
+            )
+        return f"{only_higher_versions[0][1]}/bin/nvcc"
+
+    cuda_paths.sort(key=lambda x: x[0])
+    # return the highest lower version
+    print(f"Warning: No exact match for CUDA version {cuda_version}, using lower version {cuda_paths[-1][0]} instead.")
+    return f"{cuda_paths[-1][1]}/bin/nvcc"
+
+
+def _check_valid_cuda_path_with_nvcc(p: str) -> bool:
+    """
+    :param str p: path to CUDA, e.g. "/usr/local/cuda-8.0"
+    :return: whether this is a valid CUDA path, i.e. we find all what we need
+    :rtype: bool
+    """
+    if not os.path.exists("%s/bin/nvcc" % p):
+        return False
+    if not os.path.exists("%s/include/cuda.h" % p):
+        return False
+    return True
+
+
+def _parse_cuda_version_from_cuda_h(cuda_h_path: str) -> Tuple[int, int]:
+    assert os.path.exists(cuda_h_path)
+    for line in open(cuda_h_path).read().splitlines():
+        # Like: #define CUDA_VERSION 12080
+        m = re.match(r"^#define\s+CUDA_VERSION\s+([0-9]+)$", line)
+        if m:
+            version_num = int(m.group(1))
+            major = version_num // 1000
+            minor = (version_num % 1000) // 10
+            return major, minor
+    raise RuntimeError(f"Could not determine CUDA version from {cuda_h_path}.")
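Taken together, the new module is queried roughly like this (a sketch based on the API shown above; the concrete path and version values are illustrative):

    from returnn.util.cuda_env import CudaEnv

    env = CudaEnv.get_instance()  # one cached instance per class
    if env.is_available():
        print(env.cuda_path)           # e.g. "/usr/local/cuda-12.8"
        print(env.get_cuda_version())  # "#define CUDA_VERSION 12080" parses to (12080 // 1000, (12080 % 1000) // 10) == (12, 8)
        print(env.get_max_compute_capability())  # from cuda_occupancy.h, else float("inf")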