PyPI - gpu-container - Versions diffs - 0.1.0__py3-none-any.whl - Mend

gpu-container 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

gpu_container/__init__.py +9 -0
gpu_container/__main__.py +60 -0
gpu_container/errors.py +72 -0
gpu_container/planner/__init__.py +17 -0
gpu_container/planner/activation.py +225 -0
gpu_container/planner/calibration.py +224 -0
gpu_container/planner/calibration_seed.json +44 -0
gpu_container/planner/cli.py +101 -0
gpu_container/planner/concentration_cli.py +120 -0
gpu_container/planner/placement.py +198 -0
gpu_container/planner/receipt.py +155 -0
gpu_container/planner/receipt_cli.py +143 -0
gpu_container/profiler/__init__.py +24 -0
gpu_container/profiler/baseline.py +122 -0
gpu_container/profiler/cli.py +151 -0
gpu_container/profiler/cuda_bench.py +306 -0
gpu_container/profiler/hardware.py +304 -0
gpu_container/profiler/model.py +178 -0
gpu_container/profiler/nvme_bench.py +158 -0
gpu_container/profiler/schema.py +245 -0
gpu_container/watchdog.py +563 -0
gpu_container-0.1.0.dist-info/METADATA +100 -0
gpu_container-0.1.0.dist-info/RECORD +26 -0
gpu_container-0.1.0.dist-info/WHEEL +4 -0
gpu_container-0.1.0.dist-info/entry_points.txt +7 -0
gpu_container-0.1.0.dist-info/licenses/LICENSE +21 -0

gpu_container/profiler/cuda_bench.py ADDED Viewed

@@ -0,0 +1,306 @@
+"""CUDA bandwidth + pinned-memory benchmarks via ctypes against libcudart.
+No PyTorch, no CuPy, no nvcc: we `dlopen` the CUDA Runtime library that ships in the
+`nvidia/cuda:*-runtime` base image and call it directly. cudaMemcpy and cudaHostAlloc
+are copy-engine / driver operations — they do NOT launch a compiled device kernel — so
+this works on sm_120 (Blackwell / RTX 5090) without a kernel image targeting it.
+Methodology is the docker-knowledge wave-2 `hw-measurement` spec (run INSIDE the container,
+the only honest vantage):
+  - PCIe: PINNED (page-locked) host buffer, large transfer (>=64 MB, we use 256 MB),
+    at least one untimed warmup pass, median of N copies timed by cudaEvent. H2D and D2H measured
+    SEPARATELY (asymmetry is real). Report achieved GB/s — NEVER the 64 GB/s theoretical.
+  - Pinnable ceiling: WSL2/WDDM collapses cudaHostAlloc to ~300-500 MB inside Docker-on-WSL2
+    (vs GBs native). MEASURE it with an escalating alloc probe — do not assume.
+Every entry point degrades to an honest error dict (never raises) so the profiler can record
+`None` + provenance rather than crash or guess.
+"""
+from __future__ import annotations
+import ctypes
+import statistics
+from ctypes.util import find_library
+from typing import Optional
+# --- CUDA Runtime constants ------------------------------------------------------------
+# Status codes compared against the int returned by every runtime call.
+_cudaSuccess = 0
+_cudaErrorMemoryAllocation = 2  # not referenced in the visible part of this module
+# Direction selectors passed as the last argument of cudaMemcpy.
+_cudaMemcpyHostToDevice = 1
+_cudaMemcpyDeviceToHost = 2
+# Flags argument passed to cudaHostAlloc.
+_cudaHostAllocDefault = 0
+# Bytes per MiB; sizes are handled in MiB and converted at the call site.
+_MIB = 1024 * 1024
+# Sonames to try, newest first. The runtime image ships libcudart.so.12; we stay
+# version-agnostic so a 12.x or 13.x base both work.
+_CUDART_NAMES = [
+    "libcudart.so.13", "libcudart.so.12", "libcudart.so",
+    "cudart64_13.dll", "cudart64_12.dll",
+]
+# Process-wide cache written by _load_cudart(): the loaded library on success, or the
+# reason it could not be loaded. Both stay None until the first load attempt.
+_cudart: Optional[ctypes.CDLL] = None
+_load_error: Optional[str] = None
+def _load_cudart() -> Optional[ctypes.CDLL]:
+    """Load libcudart once and pin the ctypes prototypes. Returns None if unavailable."""
+    global _cudart, _load_error
+    if _cudart is not None or _load_error is not None:
+        return _cudart
+    lib = None
+    for name in _CUDART_NAMES:
+        try:
+            lib = ctypes.CDLL(name)
+            break
+        except OSError:
+            continue
+    if lib is None:
+        found = find_library("cudart")
+        if found:
+            try:
+                lib = ctypes.CDLL(found)
+            except OSError:
+                lib = None
+    if lib is None:
+        _load_error = "libcudart not found (expected in an nvidia/cuda:*-runtime image)"
+        return None
+    cvp, cvpp = ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p)
+    ci, cipp = ctypes.c_int, ctypes.POINTER(ctypes.c_int)
+    cf, cfp = ctypes.c_float, ctypes.POINTER(ctypes.c_float)
+    csz = ctypes.c_size_t
+    sigs = {
+        "cudaGetDeviceCount": [cipp],
+        "cudaSetDevice": [ci],
+        "cudaMalloc": [cvpp, csz],
+        "cudaFree": [cvp],
+        "cudaHostAlloc": [cvpp, csz, ctypes.c_uint],
+        "cudaFreeHost": [cvp],
+        "cudaMemcpy": [cvp, cvp, csz, ci],
+        "cudaEventCreate": [cvpp],
+        "cudaEventRecord": [cvp, cvp],
+        "cudaEventSynchronize": [cvp],
+        "cudaEventElapsedTime": [cfp, cvp, cvp],
+        "cudaEventDestroy": [cvp],
+        "cudaDeviceSynchronize": [],
+        "cudaGetLastError": [],
+        "cudaRuntimeGetVersion": [cipp],
+    }
+    for fn, argtypes in sigs.items():
+        f = getattr(lib, fn)
+        f.argtypes = argtypes
+        f.restype = ctypes.c_int
+    lib.cudaGetErrorString.argtypes = [ci]
+    lib.cudaGetErrorString.restype = ctypes.c_char_p
+    _cudart = lib
+    return _cudart
+def _errstr(lib: ctypes.CDLL, rc: int) -> str:
+    try:
+        s = lib.cudaGetErrorString(rc)
+        return s.decode() if s else f"cuda error {rc}"
+    except Exception:
+        return f"cuda error {rc}"
+def available() -> bool:
+    """True if libcudart loaded and at least one CUDA device is visible."""
+    lib = _load_cudart()
+    if lib is None:
+        return False
+    cnt = ctypes.c_int(0)
+    rc = lib.cudaGetDeviceCount(ctypes.byref(cnt))
+    return rc == _cudaSuccess and cnt.value > 0
+def runtime_version() -> Optional[str]:
+    lib = _load_cudart()
+    if lib is None:
+        return None
+    v = ctypes.c_int(0)
+    if lib.cudaRuntimeGetVersion(ctypes.byref(v)) != _cudaSuccess:
+        return None
+    # encoded as 1000*major + 10*minor
+    return f"{v.value // 1000}.{(v.value % 1000) // 10}"
+def load_error() -> Optional[str]:
+    """Return the recorded reason libcudart is unavailable, or None if none is recorded."""
+    # Trigger (or reuse) the cached load attempt so the reason is populated before reading it.
+    _load_cudart()
+    return _load_error
+# --- PCIe bandwidth --------------------------------------------------------------------
+def _time_copies(lib, dst, src, nbytes, kind, iters) -> Optional[list]:
+    """Return per-iteration milliseconds for cudaMemcpy, timed by cudaEvent. None on failure.
+    Args:
+        lib: the loaded CUDA Runtime library (as returned by `_load_cudart`).
+        dst, src: destination and source pointers handed straight to cudaMemcpy.
+        nbytes: number of bytes copied per iteration.
+        kind: direction selector passed as cudaMemcpy's last argument.
+        iters: number of timed copies; 0 yields an empty list.
+    Once both events exist they are destroyed on every exit path.
+    """
+    start, stop = ctypes.c_void_p(), ctypes.c_void_p()
+    if lib.cudaEventCreate(ctypes.byref(start)) != _cudaSuccess:
+        return None
+    if lib.cudaEventCreate(ctypes.byref(stop)) != _cudaSuccess:
+        # Second create failed: release the first event so it is not leaked.
+        lib.cudaEventDestroy(start)
+        return None
+    times = []
+    try:
+        for _ in range(iters):
+            # Bracket the copy with two event records (None is passed as the second argument).
+            lib.cudaEventRecord(start, None)
+            rc = lib.cudaMemcpy(dst, src, nbytes, kind)
+            lib.cudaEventRecord(stop, None)
+            # Wait on `stop` before asking for the elapsed time between the two events.
+            lib.cudaEventSynchronize(stop)
+            if rc != _cudaSuccess:
+                # A single failed copy discards the whole series.
+                return None
+            ms = ctypes.c_float(0.0)
+            if lib.cudaEventElapsedTime(ctypes.byref(ms), start, stop) != _cudaSuccess:
+                return None
+            times.append(ms.value)
+        return times
+    finally:
+        lib.cudaEventDestroy(start)
+        lib.cudaEventDestroy(stop)
+def measure_pcie(buffer_mib: int = 256, iters: int = 11, warmup: int = 3) -> dict:
+    """Measure achieved pinned H2D and D2H PCIe bandwidth (GB/s, decimal 1e9 convention).
+    Returns a dict with `h2d_gbps`, `d2h_gbps`, and provenance; on any failure returns
+    `{"error": ...}` with whatever was obtained left as None — the caller records None,
+    never a guess.
+    """
+    out: dict = {
+        "h2d_gbps": None, "d2h_gbps": None, "buffer_mib": None,
+        "iters": iters, "warmup": warmup, "buffer": "pinned",
+        "convention": "GB/s = bytes / seconds / 1e9 (decimal, matches nvbandwidth)",
+    }
+    lib = _load_cudart()
+    if lib is None:
+        out["error"] = _load_error
+        return out
+    if not available():
+        out["error"] = "no CUDA device visible"
+        return out
+    lib.cudaSetDevice(0)
+    # Allocate the PINNED host buffer first, shrinking on failure (WSL2 caps this low).
+    h = ctypes.c_void_p()
+    nbytes = 0
+    for mib in (buffer_mib, 128, 64):
+        rc = lib.cudaHostAlloc(ctypes.byref(h), mib * _MIB, _cudaHostAllocDefault)
+        if rc == _cudaSuccess:
+            nbytes = mib * _MIB
+            out["buffer_mib"] = mib
+            break
+        lib.cudaGetLastError()  # clear the (non-sticky) alloc error before retry
+    if nbytes == 0:
+        out["error"] = "cudaHostAlloc failed even at 64 MiB (pinned-memory ceiling too low)"
+        return out
+    d = ctypes.c_void_p()
+    rc = lib.cudaMalloc(ctypes.byref(d), nbytes)
+    if rc != _cudaSuccess:
+        lib.cudaFreeHost(h)
+        out["error"] = f"cudaMalloc({nbytes}) failed: {_errstr(lib, rc)}"
+        return out
+    try:
+        # Warmup (untimed) per direction, then sync, to leave the cold/launch regime.
+        for _ in range(max(1, warmup)):
+            lib.cudaMemcpy(d, h, nbytes, _cudaMemcpyHostToDevice)
+            lib.cudaMemcpy(h, d, nbytes, _cudaMemcpyDeviceToHost)
+        lib.cudaDeviceSynchronize()
+        h2d = _time_copies(lib, d, h, nbytes, _cudaMemcpyHostToDevice, iters)
+        d2h = _time_copies(lib, h, d, nbytes, _cudaMemcpyDeviceToHost, iters)
+        def gbps(times):
+            if not times:
+                return None
+            med = statistics.median(times)
+            return round(nbytes / (med / 1000.0) / 1e9, 2) if med > 0 else None
+        out["h2d_gbps"] = gbps(h2d)
+        out["d2h_gbps"] = gbps(d2h)
+        if h2d:
+            out["h2d_median_ms"] = round(statistics.median(h2d), 4)
+            out["h2d_min_ms"] = round(min(h2d), 4)
+        if d2h:
+            out["d2h_median_ms"] = round(statistics.median(d2h), 4)
+        if out["h2d_gbps"] is None and out["d2h_gbps"] is None:
+            out["error"] = "all timed copies failed"
+    finally:
+        lib.cudaFree(d)
+        lib.cudaFreeHost(h)
+    return out
+# --- Pinnable host-RAM ceiling ---------------------------------------------------------
+def _can_pin(lib, mib: int) -> bool:
+    p = ctypes.c_void_p()
+    rc = lib.cudaHostAlloc(ctypes.byref(p), mib * _MIB, _cudaHostAllocDefault)
+    if rc == _cudaSuccess:
+        lib.cudaFreeHost(p)
+        return True
+    lib.cudaGetLastError()  # clear non-sticky alloc error
+    return False
+def measure_pinnable_ceiling(
+    start_mib: int = 128, max_mib: int = 16384, resolution_mib: int = 32
+) -> dict:
+    """Find the largest single cudaHostAlloc that succeeds (escalate by doubling, then bisect).
+    Historically WSL2 collapsed this to a few hundred MB; newer drivers can lift it to many
+    GB (MEASURE, don't assume — that's the point). `max_mib` is a SAFETY cap (the caller
+    sizes it to a fraction of RAM so the probe never tries to pin the whole VM). `capped=True`
+    means the cap itself allocated without failing, so the ceiling is a LOWER BOUND (≥ value).
+    """
+    out: dict = {
+        "ceiling_mib": None, "ceiling_gib": None, "capped": None,
+        "method": f"escalating cudaHostAlloc probe (start={start_mib} MiB, "
+                  f"safety cap {max_mib} MiB, bisect to {resolution_mib} MiB)",
+    }
+    lib = _load_cudart()
+    if lib is None:
+        out["error"] = _load_error
+        return out
+    if not available():
+        out["error"] = "no CUDA device visible"
+        return out
+    lib.cudaSetDevice(0)
+    # Doubling ladder from start up to (and including) the safety cap.
+    ladder, mib = [], max(1, start_mib)
+    while mib < max_mib:
+        ladder.append(mib)
+        mib *= 2
+    ladder.append(max_mib)
+    last_ok, first_fail = 0, None
+    for size in ladder:
+        if _can_pin(lib, size):
+            last_ok = size
+        else:
+            first_fail = size
+            break
+    if first_fail is None:
+        # Reached the safety cap with no failure -> ceiling is a lower bound.
+        out["ceiling_mib"] = last_ok
+        out["ceiling_gib"] = round(last_ok / 1024, 3)
+        out["capped"] = True
+        return out
+    lo, hi = last_ok, first_fail
+    while hi - lo > resolution_mib:
+        mid = (lo + hi) // 2
+        if mid > 0 and _can_pin(lib, mid):
+            lo = mid
+        else:
+            hi = mid
+    out["ceiling_mib"] = lo
+    out["ceiling_gib"] = round(lo / 1024, 3)
+    out["capped"] = False
+    return out

gpu_container/profiler/hardware.py ADDED Viewed

@@ -0,0 +1,304 @@
+"""Hardware profiler — detect and measure the rig from inside the container.
+REAL: GPU identity/VRAM/driver/compute-cap via pynvml (preferred — NVML directly, v2 so
+driver-`reserved` VRAM is not miscounted as `used`) with an nvidia-smi text fallback;
+platform (os / WSL2 / container / nvidia-runtime) detection; system RAM.
+MEASURED (docker-knowledge wave-2 `hw-measurement`, via `cuda_bench` + `nvme_bench`):
+PCIe H2D/D2H (pinned cudaMemcpy timed by cudaEvent), NVMe sequential + random-QD1 (fio
+direct-io on a validated mount), and the WSL2 pinnable-RAM ceiling (cudaHostAlloc probe).
+Design rule (wave-1): a measurement we have NOT taken is `None`, never a guessed number —
+honest refusal downstream depends on honest inputs.
+"""
+from __future__ import annotations
+import os
+import platform
+import subprocess
+from typing import List, Optional
+from . import cuda_bench, nvme_bench
+from .schema import BandwidthInfo, GpuInfo, HardwareProfile, MemoryInfo, PlatformInfo
+# Fields requested from `nvidia-smi --query-gpu`. Order matters: detect_gpu() zips these
+# names against the returned CSV cells, and requires at least the first four to be present.
+_SMI_FIELDS = [
+    "name", "driver_version", "memory.total", "memory.free",
+    "compute_cap", "pcie.link.gen.max", "pcie.link.width.max",
+]
+def _nvidia_smi_query() -> Optional[List[str]]:
+    try:
+        out = subprocess.run(
+            ["nvidia-smi", f"--query-gpu={','.join(_SMI_FIELDS)}",
+             "--format=csv,noheader,nounits"],
+            capture_output=True, text=True, timeout=15,
+        )
+    except (FileNotFoundError, subprocess.SubprocessError):
+        return None
+    if out.returncode != 0 or not out.stdout.strip():
+        return None
+    # first GPU only (single-GPU product)
+    return [c.strip() for c in out.stdout.strip().splitlines()[0].split(",")]
+def _as_int(s: Optional[str]) -> Optional[int]:
+    try:
+        return int(float(s))
+    except (TypeError, ValueError):
+        return None
+def _clean(s: Optional[str]) -> Optional[str]:
+    if not s or s in ("[Not Supported]", "[N/A]", "N/A"):
+        return None
+    return s
+def _refine_vram_pynvml(gpu: GpuInfo) -> GpuInfo:
+    """Override VRAM total/free with NVML values read directly (pynvml), preferring v2.
+    v2 (`nvmlMemory_v2`) reports driver-`reserved` separately; v1 folds it into `used`, so
+    v1 `free` under-reports. If pynvml is absent or NVML init fails (can happen in some
+    Docker-on-WSL2 vintages), we silently keep the nvidia-smi values.
+    """
+    try:
+        import pynvml  # optional [gpu] dependency
+    except Exception:
+        return gpu
+    try:
+        pynvml.nvmlInit()
+    except Exception:
+        return gpu
+    try:
+        h = pynvml.nvmlDeviceGetHandleByIndex(0)
+        mem, src = None, None
+        try:
+            mem = pynvml.nvmlDeviceGetMemoryInfo(h, version=pynvml.nvmlMemory_v2)
+            src = "pynvml-v2"
+        except Exception:
+            try:
+                mem = pynvml.nvmlDeviceGetMemoryInfo(h)
+                src = "pynvml-v1"
+            except Exception:
+                mem = None
+        if mem is not None:
+            gpu.vram_total_mib = int(mem.total // (1024 * 1024))
+            gpu.vram_free_mib = int(mem.free // (1024 * 1024))
+            reserved = getattr(mem, "reserved", None)
+            gpu.vram_reserved_mib = int(reserved // (1024 * 1024)) if reserved else None
+            gpu.vram_source = src
+        # compute capability, if smi left it unknown
+        if gpu.compute_capability is None:
+            try:
+                major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(h)
+                gpu.compute_capability = f"{major}.{minor}"
+            except Exception:
+                pass
+    finally:
+        try:
+            pynvml.nvmlShutdown()
+        except Exception:
+            pass
+    return gpu
+def detect_gpu() -> GpuInfo:
+    vals = _nvidia_smi_query()
+    if not vals or len(vals) < 4:
+        gpu = GpuInfo(name="unknown (nvidia-smi unavailable)")
+    else:
+        g = dict(zip(_SMI_FIELDS, vals + [None] * (len(_SMI_FIELDS) - len(vals))))
+        gpu = GpuInfo(
+            name=_clean(g["name"]) or "unknown",
+            vram_total_mib=_as_int(g["memory.total"]),
+            vram_free_mib=_as_int(g["memory.free"]),
+            driver_version=_clean(g["driver_version"]),
+            compute_capability=_clean(g["compute_cap"]),
+            # NVML pcie.link.* are advisory under WSL2 (often N/A / downclocked) — capture but
+            # the effective link is DERIVED from measured bandwidth, not from these fields.
+            pcie_gen=_as_int(g["pcie.link.gen.max"]),
+            pcie_width=_as_int(g["pcie.link.width.max"]),
+            vram_source="nvidia-smi" if _as_int(g["memory.total"]) is not None else None,
+        )
+    gpu.cuda_version = cuda_bench.runtime_version()
+    return _refine_vram_pynvml(gpu)
+def _cgroup_container_token() -> Optional[str]:
+    """Return the container engine hinted by /proc/1/cgroup, or None."""
+    try:
+        with open("/proc/1/cgroup", "r", encoding="utf-8", errors="ignore") as f:
+            blob = f.read().lower()
+    except OSError:
+        return None
+    for tok in ("docker", "containerd", "kubepods", "libpod", "podman"):
+        if tok in blob:
+            return "docker" if tok in ("docker", "containerd") else tok
+    return None
+def _is_wsl2() -> bool:
+    # "microsoft" in the kernel version => WSL2 kernel. NOTE: a container ON the WSL2 backend
+    # inherits this too, so wsl2=True means "running on the WSL2 kernel" regardless of
+    # containerization; combine with in_container to tell the two apart.
+    for path in ("/proc/version", "/proc/sys/kernel/osrelease"):
+        try:
+            with open(path, "r", encoding="utf-8", errors="ignore") as f:
+                if "microsoft" in f.read().lower():
+                    return True
+        except OSError:
+            continue
+    return False
+def detect_platform() -> PlatformInfo:
+    osname = platform.system().lower()  # "windows" | "linux" | "darwin"
+    dockerenv = os.path.exists("/.dockerenv") or os.path.exists("/run/.containerenv")
+    cgroup_tok = _cgroup_container_token()
+    in_container = dockerenv or cgroup_tok is not None
+    runtime: Optional[str] = "docker" if dockerenv else cgroup_tok
+    wsl2 = _is_wsl2()
+    # GPU passthrough device differs by platform: /dev/nvidia* (native-Linux container, via
+    # the NVIDIA Container Toolkit prestart hook) vs /dev/dxg (WSL2 — the GPU rides the
+    # WDDM/DirectX path and NO /dev/nvidia* node exists, so checking only that gives a false
+    # negative even though the GPU is fully usable; verified on this rig).
+    nvidia_runtime: Optional[bool] = None
+    if in_container:
+        nvidia_runtime = any(os.path.exists(p) for p in
+                             ("/dev/nvidia0", "/dev/dxg", "/proc/driver/nvidia/version"))
+    # UVM oversubscription is unavailable on windows/wsl2 (docker-knowledge container-runtime).
+    uvm = False if (osname == "windows" or wsl2) else None
+    return PlatformInfo(
+        os=osname,
+        in_container=in_container,
+        wsl2=wsl2,
+        container_runtime=runtime,
+        nvidia_runtime=nvidia_runtime,
+        uvm_oversubscription=uvm,
+    )
+def detect_memory() -> MemoryInfo:
+    try:
+        import psutil  # optional dependency
+        vm = psutil.virtual_memory()
+        return MemoryInfo(
+            ram_total_gib=round(vm.total / 1024**3, 2),
+            ram_available_gib=round(vm.available / 1024**3, 2),
+        )
+    except Exception:
+        pass
+    # Linux fallback without psutil
+    try:
+        with open("/proc/meminfo", "r", encoding="utf-8") as f:
+            kb = {ln.split(":")[0]: ln.split()[1] for ln in f if ":" in ln}
+        total = int(kb.get("MemTotal", 0)) / 1024**2
+        avail = int(kb.get("MemAvailable", 0)) / 1024**2
+        return MemoryInfo(ram_total_gib=round(total, 2) or None,
+                          ram_available_gib=round(avail, 2) or None)
+    except OSError:
+        return MemoryInfo()
+def measure_bandwidth(bench_dir: Optional[str] = None) -> BandwidthInfo:
+    """Run the PCIe (cuda_bench) and NVMe (nvme_bench) measurements -> BandwidthInfo.
+    Each axis fills in independently; an axis that cannot be measured stays `None` and its
+    reason is recorded in `details`/`method`. Numbers are achieved/measured, never spec-sheet.
+    """
+    pcie = cuda_bench.measure_pcie()
+    nvme = nvme_bench.measure_nvme(bench_dir=bench_dir)
+    bw = BandwidthInfo(
+        pcie_h2d_gbps=pcie.get("h2d_gbps"),
+        pcie_d2h_gbps=pcie.get("d2h_gbps"),
+        nvme_seq_read_gbps=nvme.get("seq_read_gbps"),
+        nvme_rand_qd1_read_iops=nvme.get("rand_qd1_iops"),
+        nvme_rand_qd1_read_mbps=nvme.get("rand_qd1_mbps"),
+    )
+    methods: List[str] = []
+    if bw.pcie_h2d_gbps is not None:
+        methods.append("pcie:cudaMemcpy-pinned-cudaEvent")
+    else:
+        methods.append(f"pcie:none ({pcie.get('error', 'unknown')})")
+    if bw.nvme_seq_read_gbps is not None or bw.nvme_rand_qd1_read_iops is not None:
+        methods.append("nvme:fio-direct-libaio")
+    else:
+        methods.append(f"nvme:none ({nvme.get('error', 'unknown')})")
+    bw.method = "; ".join(methods)
+    # Sanity flag: an achieved H2D far below Gen5 expectation points at an x8 link, a
+    # downclocked link, or WSL2 perturbation — flag, don't silently trust (wave-2).
+    if bw.pcie_h2d_gbps is not None and bw.pcie_h2d_gbps < 30:
+        pcie["sanity"] = "H2D below Gen5 expectation (~50 GB/s); check link width / WSL2 perturbation"
+    bw.details = {"pcie": pcie, "nvme": nvme}
+    return bw
+def measure_cpu_mem_bw(array_mib: int = 256, iters: int = 7) -> dict:
+    """CPU RAM read+write bandwidth (GB/s) via a large out-of-cache numpy copy — the input the
+    MoE CPU-offload throughput model keys off (CPU computes its experts at RAM bandwidth). Honest
+    None if numpy is unavailable; the planner then flags a labelled default."""
+    out = {"gbps": None, "method": None}
+    try:
+        import statistics
+        import time
+        import numpy as np
+    except Exception:
+        out["method"] = "not-measured: numpy unavailable"
+        return out
+    try:
+        n = (array_mib * 1024 * 1024) // 8  # float64 elements; 256 MiB >> L3 to defeat cache
+        a = np.empty(n, dtype=np.float64)
+        b = np.ones(n, dtype=np.float64)
+        np.copyto(a, b)  # warmup
+        times = []
+        for _ in range(iters):
+            t0 = time.perf_counter()
+            np.copyto(a, b)  # read b + write a == 2 * n * 8 bytes
+            times.append(time.perf_counter() - t0)
+        med = statistics.median(times)
+        out["gbps"] = round((2 * n * 8) / med / 1e9, 1) if med > 0 else None
+        out["method"] = f"numpy copy (read+write), {array_mib} MiB, median of {iters}"
+    except Exception as e:
+        out["method"] = f"not-measured: {e}"
+    return out
+def _probe_pinnable(mem: MemoryInfo) -> None:
+    """Fill the pinnable-RAM ceiling on `mem` via a cudaHostAlloc probe (in place).
+    The probe is capped at ~75% of available RAM (absolute max 24 GiB): pinned memory is
+    page-locked and physically resident, so an unbounded probe could destabilize the host.
+    A `capped` result therefore means "ceiling is at least this" — a safe lower bound.
+    """
+    avail = mem.ram_available_gib or mem.ram_total_gib or 16.0
+    safe_max_mib = max(512, min(int(avail * 1024 * 0.75), 24576))
+    pin = cuda_bench.measure_pinnable_ceiling(max_mib=safe_max_mib)
+    if pin.get("ceiling_gib") is not None:
+        mem.pinnable_ceiling_gib = pin["ceiling_gib"]
+        mem.pinnable_capped = pin.get("capped")
+        mem.pinnable_method = pin.get("method")
+    else:
+        mem.pinnable_method = f"not-measured: {pin.get('error', 'unknown')}"
+def profile_hardware(created: str, run_benches: bool = True,
+                     bench_dir: Optional[str] = None) -> HardwareProfile:
+    gpu = detect_gpu()
+    plat = detect_platform()
+    mem = detect_memory()
+    if run_benches:
+        bw = measure_bandwidth(bench_dir=bench_dir)
+        _probe_pinnable(mem)
+        cb = measure_cpu_mem_bw()
+        mem.cpu_mem_bw_gbps = cb["gbps"]
+        mem.cpu_mem_bw_method = cb["method"]
+    else:
+        bw = BandwidthInfo(method="not-measured (--no-bench): identity detection only")
+    return HardwareProfile(gpu=gpu, platform=plat, bandwidth=bw, memory=mem)