gpu-container 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,306 @@
1
+ """CUDA bandwidth + pinned-memory benchmarks via ctypes against libcudart.
2
+
3
+ No PyTorch, no CuPy, no nvcc: we `dlopen` the CUDA Runtime library that ships in the
4
+ `nvidia/cuda:*-runtime` base image and call it directly. cudaMemcpy and cudaHostAlloc
5
+ are copy-engine / driver operations — they do NOT launch a compiled device kernel — so
6
+ this works on sm_120 (Blackwell / RTX 5090) without a kernel image targeting it.
7
+
8
+ Methodology is the docker-knowledge wave-2 `hw-measurement` spec (run INSIDE the container,
9
+ the only honest vantage):
10
+ - PCIe: PINNED (page-locked) host buffer, large transfer (>=64 MB, we use 256 MB),
11
+ at least one untimed warmup (default 3), median of N copies timed by cudaEvent. H2D and D2H measured
12
+ SEPARATELY (asymmetry is real). Report achieved GB/s — NEVER the 64 GB/s theoretical.
13
+ - Pinnable ceiling: WSL2/WDDM collapses cudaHostAlloc to ~300-500 MB inside Docker-on-WSL2
14
+ (vs GBs native). MEASURE it with an escalating alloc probe — do not assume.
15
+
16
+ Every entry point degrades to an honest error dict (never raises) so the profiler can record
17
+ `None` + provenance rather than crash or guess.
18
+ """
19
+ from __future__ import annotations
20
+
21
+ import ctypes
22
+ import statistics
23
+ from ctypes.util import find_library
24
+ from typing import Optional
25
+
26
# --- CUDA Runtime constants ------------------------------------------------------------
# Numeric values compared against libcudart return codes or passed as its enum arguments.
_cudaSuccess = 0
_cudaErrorMemoryAllocation = 2
_cudaMemcpyHostToDevice = 1
_cudaMemcpyDeviceToHost = 2
_cudaHostAllocDefault = 0

# Bytes per MiB; buffer sizes in this module are expressed in MiB and scaled by this.
_MIB = 1024 * 1024

# Sonames to try, newest first. The runtime image ships libcudart.so.12; we stay
# version-agnostic so a 12.x or 13.x base both work.
_CUDART_NAMES = [
    "libcudart.so.13", "libcudart.so.12", "libcudart.so",
    "cudart64_13.dll", "cudart64_12.dll",
]

# Process-wide cache written by _load_cudart(): the loaded library handle on success,
# or a human-readable reason when loading failed.
_cudart: Optional[ctypes.CDLL] = None
_load_error: Optional[str] = None
44
+
45
+
46
def _load_cudart() -> Optional[ctypes.CDLL]:
    """Load libcudart once and pin the ctypes prototypes. Returns None if unavailable.

    The outcome is cached in the module globals ``_cudart`` / ``_load_error``, so the
    library is probed at most once per process. Each name in ``_CUDART_NAMES`` is tried
    in order, then ``ctypes.util.find_library("cudart")`` as a last resort.

    A library that loads but does not export one of the required entry points is
    treated as unavailable: the missing symbol is recorded in ``_load_error`` and None
    is returned, rather than letting ``AttributeError`` escape to the caller.
    """
    global _cudart, _load_error
    if _cudart is not None or _load_error is not None:
        return _cudart

    lib = None
    for name in _CUDART_NAMES:
        try:
            lib = ctypes.CDLL(name)
            break
        except OSError:
            continue
    if lib is None:
        found = find_library("cudart")
        if found:
            try:
                lib = ctypes.CDLL(found)
            except OSError:
                lib = None
    if lib is None:
        _load_error = "libcudart not found (expected in an nvidia/cuda:*-runtime image)"
        return None

    cvp, cvpp = ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p)
    ci, cipp = ctypes.c_int, ctypes.POINTER(ctypes.c_int)
    cfp = ctypes.POINTER(ctypes.c_float)
    csz = ctypes.c_size_t

    # Argument prototypes; every one of these functions returns a cudaError_t (int).
    sigs = {
        "cudaGetDeviceCount": [cipp],
        "cudaSetDevice": [ci],
        "cudaMalloc": [cvpp, csz],
        "cudaFree": [cvp],
        "cudaHostAlloc": [cvpp, csz, ctypes.c_uint],
        "cudaFreeHost": [cvp],
        "cudaMemcpy": [cvp, cvp, csz, ci],
        "cudaEventCreate": [cvpp],
        "cudaEventRecord": [cvp, cvp],
        "cudaEventSynchronize": [cvp],
        "cudaEventElapsedTime": [cfp, cvp, cvp],
        "cudaEventDestroy": [cvp],
        "cudaDeviceSynchronize": [],
        "cudaGetLastError": [],
        "cudaRuntimeGetVersion": [cipp],
    }
    try:
        for fn, argtypes in sigs.items():
            f = getattr(lib, fn)  # AttributeError if the symbol is not exported
            f.argtypes = argtypes
            f.restype = ctypes.c_int
        lib.cudaGetErrorString.argtypes = [ci]
        lib.cudaGetErrorString.restype = ctypes.c_char_p
    except AttributeError as exc:
        # Honor the module contract: degrade to a recorded error, never raise.
        _load_error = f"libcudart loaded but is missing a required symbol: {exc}"
        return None

    _cudart = lib
    return _cudart
101
+
102
+
103
+ def _errstr(lib: ctypes.CDLL, rc: int) -> str:
104
+ try:
105
+ s = lib.cudaGetErrorString(rc)
106
+ return s.decode() if s else f"cuda error {rc}"
107
+ except Exception:
108
+ return f"cuda error {rc}"
109
+
110
+
111
def available() -> bool:
    """Report whether libcudart loaded and cudaGetDeviceCount sees at least one device."""
    cudart = _load_cudart()
    if cudart is not None:
        n_devices = ctypes.c_int(0)
        status = cudart.cudaGetDeviceCount(ctypes.byref(n_devices))
        if status == _cudaSuccess:
            return n_devices.value > 0
    return False
119
+
120
+
121
def runtime_version() -> Optional[str]:
    """Return the CUDA runtime version as ``"major.minor"``, or None if it cannot be read."""
    cudart = _load_cudart()
    if cudart is None:
        return None
    encoded = ctypes.c_int(0)
    status = cudart.cudaRuntimeGetVersion(ctypes.byref(encoded))
    if status != _cudaSuccess:
        return None
    # The runtime encodes the version as 1000*major + 10*minor.
    major, remainder = divmod(encoded.value, 1000)
    return f"{major}.{remainder // 10}"
130
+
131
+
132
def load_error() -> Optional[str]:
    """Return the recorded reason libcudart could not be loaded, or None if none was recorded."""
    _load_cudart()  # return value unused: called only so a load attempt has been made
    reason = _load_error
    return reason
135
+
136
+
137
+ # --- PCIe bandwidth --------------------------------------------------------------------
138
def _time_copies(lib, dst, src, nbytes, kind, iters) -> Optional[list]:
    """Time `iters` cudaMemcpy calls with a cudaEvent pair.

    Returns the list of per-copy elapsed times in milliseconds, or None as soon as
    event creation, a copy, or the elapsed-time query reports a failure. Both events
    are destroyed before returning in every case where both were created.
    """
    ev_begin = ctypes.c_void_p()
    ev_end = ctypes.c_void_p()
    if lib.cudaEventCreate(ctypes.byref(ev_begin)) != _cudaSuccess:
        return None
    if lib.cudaEventCreate(ctypes.byref(ev_end)) != _cudaSuccess:
        # Only the first event exists at this point; release it before bailing out.
        lib.cudaEventDestroy(ev_begin)
        return None

    elapsed_ms = []
    try:
        for _iteration in range(iters):
            lib.cudaEventRecord(ev_begin, None)
            copy_status = lib.cudaMemcpy(dst, src, nbytes, kind)
            lib.cudaEventRecord(ev_end, None)
            lib.cudaEventSynchronize(ev_end)
            if copy_status != _cudaSuccess:
                return None
            sample = ctypes.c_float(0.0)
            query_status = lib.cudaEventElapsedTime(ctypes.byref(sample), ev_begin, ev_end)
            if query_status != _cudaSuccess:
                return None
            elapsed_ms.append(sample.value)
        return elapsed_ms
    finally:
        lib.cudaEventDestroy(ev_begin)
        lib.cudaEventDestroy(ev_end)
163
+
164
+
165
def measure_pcie(buffer_mib: int = 256, iters: int = 11, warmup: int = 3) -> dict:
    """Measure achieved pinned H2D and D2H PCIe bandwidth (GB/s, decimal 1e9 convention).

    Args:
        buffer_mib: Preferred pinned-buffer size in MiB. If that allocation fails, the
            standard fallback sizes 128 and 64 MiB are tried, but only those SMALLER
            than the requested size (retrying a larger size after a failure is useless).
        iters: Number of timed copies per direction; the median is reported.
        warmup: Number of untimed H2D+D2H warmup rounds (at least one is always run).

    Returns a dict with `h2d_gbps`, `d2h_gbps`, and provenance; on any failure returns
    `{"error": ...}` with whatever was obtained left as None — the caller records None,
    never a guess.
    """
    out: dict = {
        "h2d_gbps": None, "d2h_gbps": None, "buffer_mib": None,
        "iters": iters, "warmup": warmup, "buffer": "pinned",
        "convention": "GB/s = bytes / seconds / 1e9 (decimal, matches nvbandwidth)",
    }
    lib = _load_cudart()
    if lib is None:
        out["error"] = _load_error
        return out
    if not available():
        out["error"] = "no CUDA device visible"
        return out
    if buffer_mib < 1:
        out["error"] = f"buffer_mib must be >= 1 (got {buffer_mib})"
        return out

    rc = lib.cudaSetDevice(0)
    if rc != _cudaSuccess:
        # Report the real cause instead of a misleading allocation failure later on.
        out["error"] = f"cudaSetDevice(0) failed: {_errstr(lib, rc)}"
        return out

    # Allocate the PINNED host buffer first, shrinking on failure (WSL2 caps this low).
    candidates = [buffer_mib] + [s for s in (128, 64) if s < buffer_mib]
    h = ctypes.c_void_p()
    nbytes = 0
    for mib in candidates:
        rc = lib.cudaHostAlloc(ctypes.byref(h), mib * _MIB, _cudaHostAllocDefault)
        if rc == _cudaSuccess:
            nbytes = mib * _MIB
            out["buffer_mib"] = mib
            break
        lib.cudaGetLastError()  # clear the (non-sticky) alloc error before retry
    if nbytes == 0:
        out["error"] = (
            f"cudaHostAlloc failed even at {candidates[-1]} MiB "
            "(pinned-memory ceiling too low)"
        )
        return out

    d = ctypes.c_void_p()
    rc = lib.cudaMalloc(ctypes.byref(d), nbytes)
    if rc != _cudaSuccess:
        lib.cudaFreeHost(h)
        out["error"] = f"cudaMalloc({nbytes}) failed: {_errstr(lib, rc)}"
        return out

    try:
        # Warmup (untimed) per direction, then sync, to leave the cold/launch regime.
        for _ in range(max(1, warmup)):
            lib.cudaMemcpy(d, h, nbytes, _cudaMemcpyHostToDevice)
            lib.cudaMemcpy(h, d, nbytes, _cudaMemcpyDeviceToHost)
        lib.cudaDeviceSynchronize()

        h2d = _time_copies(lib, d, h, nbytes, _cudaMemcpyHostToDevice, iters)
        d2h = _time_copies(lib, h, d, nbytes, _cudaMemcpyDeviceToHost, iters)

        def gbps(times):
            # Median milliseconds -> decimal GB/s; None when nothing usable was timed.
            if not times:
                return None
            med = statistics.median(times)
            return round(nbytes / (med / 1000.0) / 1e9, 2) if med > 0 else None

        out["h2d_gbps"] = gbps(h2d)
        out["d2h_gbps"] = gbps(d2h)
        if h2d:
            out["h2d_median_ms"] = round(statistics.median(h2d), 4)
            out["h2d_min_ms"] = round(min(h2d), 4)
        if d2h:
            out["d2h_median_ms"] = round(statistics.median(d2h), 4)
        if out["h2d_gbps"] is None and out["d2h_gbps"] is None:
            out["error"] = "all timed copies failed"
    finally:
        # Always release both buffers, even if timing bailed out early.
        lib.cudaFree(d)
        lib.cudaFreeHost(h)
    return out
237
+
238
+
239
+ # --- Pinnable host-RAM ceiling ---------------------------------------------------------
240
def _can_pin(lib, mib: int) -> bool:
    """Try a single cudaHostAlloc of `mib` MiB; free it immediately and report success."""
    probe = ctypes.c_void_p()
    status = lib.cudaHostAlloc(ctypes.byref(probe), mib * _MIB, _cudaHostAllocDefault)
    if status != _cudaSuccess:
        lib.cudaGetLastError()  # clear non-sticky alloc error
        return False
    lib.cudaFreeHost(probe)
    return True
248
+
249
+
250
def measure_pinnable_ceiling(
    start_mib: int = 128, max_mib: int = 16384, resolution_mib: int = 32
) -> dict:
    """Find the largest single cudaHostAlloc that succeeds (escalate by doubling, then bisect).

    Historically WSL2 collapsed this to a few hundred MB; newer drivers can lift it to many
    GB (MEASURE, don't assume — that's the point). `max_mib` is a SAFETY cap (the caller
    sizes it to a fraction of RAM so the probe never tries to pin the whole VM). `capped=True`
    means the cap itself allocated without failing, so the ceiling is a LOWER BOUND (≥ value).

    `resolution_mib` is the bisection stop width; values below 1 are treated as 1 so the
    bisection always terminates (with a width of 0 the midpoint would equal `lo` forever).

    Returns a dict with `ceiling_mib`, `ceiling_gib`, `capped` and `method`; on failure
    the first three stay None and `error` carries the reason.
    """
    out: dict = {
        "ceiling_mib": None, "ceiling_gib": None, "capped": None,
        "method": f"escalating cudaHostAlloc probe (start={start_mib} MiB, "
                  f"safety cap {max_mib} MiB, bisect to {resolution_mib} MiB)",
    }
    lib = _load_cudart()
    if lib is None:
        out["error"] = _load_error
        return out
    if not available():
        out["error"] = "no CUDA device visible"
        return out
    lib.cudaSetDevice(0)

    # Doubling ladder from start up to (and including) the safety cap.
    ladder, mib = [], max(1, start_mib)
    while mib < max_mib:
        ladder.append(mib)
        mib *= 2
    ladder.append(max_mib)

    last_ok, first_fail = 0, None
    for size in ladder:
        if _can_pin(lib, size):
            last_ok = size
        else:
            first_fail = size
            break

    if first_fail is None:
        # Reached the safety cap with no failure -> ceiling is a lower bound.
        out["ceiling_mib"] = last_ok
        out["ceiling_gib"] = round(last_ok / 1024, 3)
        out["capped"] = True
        return out

    # Bisect between the last success and the first failure. The step is clamped to
    # >= 1 MiB: once hi - lo == 1 the midpoint equals lo and no progress is possible.
    step = max(1, resolution_mib)
    lo, hi = last_ok, first_fail
    while hi - lo > step:
        mid = (lo + hi) // 2
        if mid > 0 and _can_pin(lib, mid):
            lo = mid
        else:
            hi = mid
    out["ceiling_mib"] = lo
    out["ceiling_gib"] = round(lo / 1024, 3)
    out["capped"] = False
    return out
@@ -0,0 +1,304 @@
1
+ """Hardware profiler — detect and measure the rig from inside the container.
2
+
3
+ REAL: GPU identity/VRAM/driver/compute-cap via pynvml (preferred — NVML directly, v2 so
4
+ driver-`reserved` VRAM is not miscounted as `used`) with an nvidia-smi text fallback;
5
+ platform (os / WSL2 / container / nvidia-runtime) detection; system RAM.
6
+
7
+ MEASURED (docker-knowledge wave-2 `hw-measurement`, via `cuda_bench` + `nvme_bench`):
8
+ PCIe H2D/D2H (pinned cudaMemcpy timed by cudaEvent), NVMe sequential + random-QD1 (fio
9
+ direct-io on a validated mount), and the WSL2 pinnable-RAM ceiling (cudaHostAlloc probe).
10
+
11
+ Design rule (wave-1): a measurement we have NOT taken is `None`, never a guessed number —
12
+ honest refusal downstream depends on honest inputs.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import os
17
+ import platform
18
+ import subprocess
19
+ from typing import List, Optional
20
+
21
+ from . import cuda_bench, nvme_bench
22
+ from .schema import BandwidthInfo, GpuInfo, HardwareProfile, MemoryInfo, PlatformInfo
23
+
24
# Field names passed to `nvidia-smi --query-gpu=...`; the CSV row that comes back is
# zipped against this list, so the order here is the column order.
_SMI_FIELDS = [
    "name", "driver_version", "memory.total", "memory.free",
    "compute_cap", "pcie.link.gen.max", "pcie.link.width.max",
]
28
+
29
+
30
def _nvidia_smi_query() -> Optional[List[str]]:
    """Query nvidia-smi for `_SMI_FIELDS` and return the first GPU's values, stripped.

    Returns None when nvidia-smi cannot be started (missing, not executable, or any
    other OS-level launch failure), times out, exits non-zero, or prints nothing.
    """
    try:
        out = subprocess.run(
            ["nvidia-smi", f"--query-gpu={','.join(_SMI_FIELDS)}",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=15,
        )
    except (OSError, subprocess.SubprocessError):
        # OSError covers FileNotFoundError as well as PermissionError and friends.
        return None
    if out.returncode != 0 or not out.stdout.strip():
        return None
    # first GPU only (single-GPU product)
    return [c.strip() for c in out.stdout.strip().splitlines()[0].split(",")]
43
+
44
+
45
+ def _as_int(s: Optional[str]) -> Optional[int]:
46
+ try:
47
+ return int(float(s))
48
+ except (TypeError, ValueError):
49
+ return None
50
+
51
+
52
+ def _clean(s: Optional[str]) -> Optional[str]:
53
+ if not s or s in ("[Not Supported]", "[N/A]", "N/A"):
54
+ return None
55
+ return s
56
+
57
+
58
+ def _refine_vram_pynvml(gpu: GpuInfo) -> GpuInfo:
59
+ """Override VRAM total/free with NVML values read directly (pynvml), preferring v2.
60
+
61
+ v2 (`nvmlMemory_v2`) reports driver-`reserved` separately; v1 folds it into `used`, so
62
+ v1 `free` under-reports. If pynvml is absent or NVML init fails (can happen in some
63
+ Docker-on-WSL2 vintages), we silently keep the nvidia-smi values.
64
+ """
65
+ try:
66
+ import pynvml # optional [gpu] dependency
67
+ except Exception:
68
+ return gpu
69
+ try:
70
+ pynvml.nvmlInit()
71
+ except Exception:
72
+ return gpu
73
+ try:
74
+ h = pynvml.nvmlDeviceGetHandleByIndex(0)
75
+ mem, src = None, None
76
+ try:
77
+ mem = pynvml.nvmlDeviceGetMemoryInfo(h, version=pynvml.nvmlMemory_v2)
78
+ src = "pynvml-v2"
79
+ except Exception:
80
+ try:
81
+ mem = pynvml.nvmlDeviceGetMemoryInfo(h)
82
+ src = "pynvml-v1"
83
+ except Exception:
84
+ mem = None
85
+ if mem is not None:
86
+ gpu.vram_total_mib = int(mem.total // (1024 * 1024))
87
+ gpu.vram_free_mib = int(mem.free // (1024 * 1024))
88
+ reserved = getattr(mem, "reserved", None)
89
+ gpu.vram_reserved_mib = int(reserved // (1024 * 1024)) if reserved else None
90
+ gpu.vram_source = src
91
+ # compute capability, if smi left it unknown
92
+ if gpu.compute_capability is None:
93
+ try:
94
+ major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(h)
95
+ gpu.compute_capability = f"{major}.{minor}"
96
+ except Exception:
97
+ pass
98
+ finally:
99
+ try:
100
+ pynvml.nvmlShutdown()
101
+ except Exception:
102
+ pass
103
+ return gpu
104
+
105
+
106
def detect_gpu() -> GpuInfo:
    """Build a GpuInfo from nvidia-smi, add the CUDA runtime version, then refine via pynvml.

    When nvidia-smi yields fewer than four fields (or nothing), a placeholder GpuInfo
    carrying only a name is used instead.
    """
    row = _nvidia_smi_query()
    if row and len(row) >= 4:
        # Pad a short row with None so every expected field has a key.
        shortfall = len(_SMI_FIELDS) - len(row)
        by_field = dict(zip(_SMI_FIELDS, row + [None] * shortfall))
        total_mib = _as_int(by_field["memory.total"])
        gpu = GpuInfo(
            name=_clean(by_field["name"]) or "unknown",
            vram_total_mib=total_mib,
            vram_free_mib=_as_int(by_field["memory.free"]),
            driver_version=_clean(by_field["driver_version"]),
            compute_capability=_clean(by_field["compute_cap"]),
            # NVML pcie.link.* are advisory under WSL2 (often N/A / downclocked) — capture
            # but the effective link is DERIVED from measured bandwidth, not these fields.
            pcie_gen=_as_int(by_field["pcie.link.gen.max"]),
            pcie_width=_as_int(by_field["pcie.link.width.max"]),
            vram_source="nvidia-smi" if total_mib is not None else None,
        )
    else:
        gpu = GpuInfo(name="unknown (nvidia-smi unavailable)")
    gpu.cuda_version = cuda_bench.runtime_version()
    return _refine_vram_pynvml(gpu)
126
+
127
+
128
+ def _cgroup_container_token() -> Optional[str]:
129
+ """Return the container engine hinted by /proc/1/cgroup, or None."""
130
+ try:
131
+ with open("/proc/1/cgroup", "r", encoding="utf-8", errors="ignore") as f:
132
+ blob = f.read().lower()
133
+ except OSError:
134
+ return None
135
+ for tok in ("docker", "containerd", "kubepods", "libpod", "podman"):
136
+ if tok in blob:
137
+ return "docker" if tok in ("docker", "containerd") else tok
138
+ return None
139
+
140
+
141
+ def _is_wsl2() -> bool:
142
+ # "microsoft" in the kernel version => WSL2 kernel. NOTE: a container ON the WSL2 backend
143
+ # inherits this too, so wsl2=True means "running on the WSL2 kernel" regardless of
144
+ # containerization; combine with in_container to tell the two apart.
145
+ for path in ("/proc/version", "/proc/sys/kernel/osrelease"):
146
+ try:
147
+ with open(path, "r", encoding="utf-8", errors="ignore") as f:
148
+ if "microsoft" in f.read().lower():
149
+ return True
150
+ except OSError:
151
+ continue
152
+ return False
153
+
154
+
155
def detect_platform() -> PlatformInfo:
    """Detect OS, containerization, WSL2 and GPU-passthrough facts for the current process.

    Container runtime resolution:
      1. `/.dockerenv` present            -> "docker"
      2. a token found in /proc/1/cgroup  -> that token
      3. `/run/.containerenv` present     -> "podman" (that marker file is written by
         Podman, not Docker, so it must not be reported as "docker")
      4. otherwise                        -> None

    `nvidia_runtime` is only evaluated inside a container (None outside one).
    """
    osname = platform.system().lower()  # "windows" | "linux" | "darwin"
    has_dockerenv = os.path.exists("/.dockerenv")
    has_containerenv = os.path.exists("/run/.containerenv")
    cgroup_tok = _cgroup_container_token()
    in_container = has_dockerenv or has_containerenv or cgroup_tok is not None

    runtime: Optional[str]
    if has_dockerenv:
        runtime = "docker"
    elif cgroup_tok is not None:
        runtime = cgroup_tok
    elif has_containerenv:
        runtime = "podman"
    else:
        runtime = None
    wsl2 = _is_wsl2()

    # GPU passthrough device differs by platform: /dev/nvidia* (native-Linux container, via
    # the NVIDIA Container Toolkit prestart hook) vs /dev/dxg (WSL2 — the GPU rides the
    # WDDM/DirectX path and NO /dev/nvidia* node exists, so checking only that gives a false
    # negative even though the GPU is fully usable; verified on this rig).
    nvidia_runtime: Optional[bool] = None
    if in_container:
        nvidia_runtime = any(os.path.exists(p) for p in
                             ("/dev/nvidia0", "/dev/dxg", "/proc/driver/nvidia/version"))

    # UVM oversubscription is unavailable on windows/wsl2 (docker-knowledge container-runtime).
    uvm = False if (osname == "windows" or wsl2) else None
    return PlatformInfo(
        os=osname,
        in_container=in_container,
        wsl2=wsl2,
        container_runtime=runtime,
        nvidia_runtime=nvidia_runtime,
        uvm_oversubscription=uvm,
    )
182
+
183
+
184
def detect_memory() -> MemoryInfo:
    """Report total and available system RAM in GiB.

    Uses psutil when importable; otherwise parses /proc/meminfo. If neither source can
    be read or parsed, an empty MemoryInfo() is returned. A zero or missing value in the
    /proc/meminfo path is reported as None rather than 0.
    """
    try:
        import psutil  # optional dependency
        vm = psutil.virtual_memory()
        return MemoryInfo(
            ram_total_gib=round(vm.total / 1024**3, 2),
            ram_available_gib=round(vm.available / 1024**3, 2),
        )
    except Exception:
        pass
    # Linux fallback without psutil
    try:
        kb = {}
        with open("/proc/meminfo", "r", encoding="utf-8") as f:
            for ln in f:
                key, sep, rest = ln.partition(":")
                fields = rest.split()
                # Skip lines without "key: value" shape instead of raising IndexError.
                if sep and fields:
                    kb[key] = fields[0]
        total = int(kb.get("MemTotal", 0)) / 1024**2
        avail = int(kb.get("MemAvailable", 0)) / 1024**2
        return MemoryInfo(ram_total_gib=round(total, 2) or None,
                          ram_available_gib=round(avail, 2) or None)
    except (OSError, ValueError):
        # ValueError: a non-numeric value (or undecodable bytes) in /proc/meminfo.
        return MemoryInfo()
204
+
205
+
206
def measure_bandwidth(bench_dir: Optional[str] = None) -> BandwidthInfo:
    """Run the PCIe (cuda_bench) and NVMe (nvme_bench) measurements -> BandwidthInfo.

    The two axes are filled independently. An axis with no usable number stays `None`
    and its reason is written into `method`; the raw result dicts go into `details`.
    """
    pcie_result = cuda_bench.measure_pcie()
    nvme_result = nvme_bench.measure_nvme(bench_dir=bench_dir)

    bw = BandwidthInfo(
        pcie_h2d_gbps=pcie_result.get("h2d_gbps"),
        pcie_d2h_gbps=pcie_result.get("d2h_gbps"),
        nvme_seq_read_gbps=nvme_result.get("seq_read_gbps"),
        nvme_rand_qd1_read_iops=nvme_result.get("rand_qd1_iops"),
        nvme_rand_qd1_read_mbps=nvme_result.get("rand_qd1_mbps"),
    )

    pcie_measured = bw.pcie_h2d_gbps is not None
    nvme_measured = not (
        bw.nvme_seq_read_gbps is None and bw.nvme_rand_qd1_read_iops is None
    )

    if pcie_measured:
        pcie_label = "pcie:cudaMemcpy-pinned-cudaEvent"
    else:
        pcie_label = f"pcie:none ({pcie_result.get('error', 'unknown')})"
    if nvme_measured:
        nvme_label = "nvme:fio-direct-libaio"
    else:
        nvme_label = f"nvme:none ({nvme_result.get('error', 'unknown')})"
    bw.method = "; ".join((pcie_label, nvme_label))

    # Sanity flag: an achieved H2D far below Gen5 expectation points at an x8 link, a
    # downclocked link, or WSL2 perturbation — flag, don't silently trust (wave-2).
    if pcie_measured and bw.pcie_h2d_gbps < 30:
        pcie_result["sanity"] = "H2D below Gen5 expectation (~50 GB/s); check link width / WSL2 perturbation"
    bw.details = {"pcie": pcie_result, "nvme": nvme_result}
    return bw
240
+
241
+
242
def measure_cpu_mem_bw(array_mib: int = 256, iters: int = 7) -> dict:
    """Estimate CPU RAM read+write bandwidth (GB/s) from a large numpy array copy.

    One warmup copy is followed by `iters` timed copies; the median time is converted
    to decimal GB/s, counting both the read of the source and the write of the target.
    Returns ``{"gbps": ..., "method": ...}``; `gbps` stays None (with the reason in
    `method`) when numpy cannot be imported or the measurement raises.
    """
    result = {"gbps": None, "method": None}
    try:
        import statistics
        import time

        import numpy as np
    except Exception:
        result["method"] = "not-measured: numpy unavailable"
        return result

    try:
        count = (array_mib * 1024 * 1024) // 8  # float64 elements; 256 MiB >> L3 to defeat cache
        target = np.empty(count, dtype=np.float64)
        source = np.ones(count, dtype=np.float64)
        np.copyto(target, source)  # warmup

        def _timed_copy() -> float:
            began = time.perf_counter()
            np.copyto(target, source)  # read source + write target == 2 * count * 8 bytes
            return time.perf_counter() - began

        samples = [_timed_copy() for _ in range(iters)]
        median_s = statistics.median(samples)
        if median_s > 0:
            result["gbps"] = round((2 * count * 8) / median_s / 1e9, 1)
        else:
            result["gbps"] = None
        result["method"] = f"numpy copy (read+write), {array_mib} MiB, median of {iters}"
    except Exception as exc:
        result["method"] = f"not-measured: {exc}"
    return result
271
+
272
+
273
def _probe_pinnable(mem: MemoryInfo) -> None:
    """Run the cudaHostAlloc ceiling probe and store its outcome on `mem` (in place).

    The probe's safety cap is 75% of available RAM (falling back to total RAM, then to
    16 GiB when neither is known), clamped to the range 512 MiB .. 24 GiB. On success the
    ceiling, capped flag and method are copied onto `mem`; otherwise only
    `pinnable_method` is set, carrying the failure reason.
    """
    basis_gib = mem.ram_available_gib or mem.ram_total_gib or 16.0
    three_quarters_mib = int(basis_gib * 1024 * 0.75)
    safe_max_mib = max(512, min(three_quarters_mib, 24576))

    probe = cuda_bench.measure_pinnable_ceiling(max_mib=safe_max_mib)
    if probe.get("ceiling_gib") is None:
        mem.pinnable_method = f"not-measured: {probe.get('error', 'unknown')}"
        return
    mem.pinnable_ceiling_gib = probe["ceiling_gib"]
    mem.pinnable_capped = probe.get("capped")
    mem.pinnable_method = probe.get("method")
289
+
290
+
291
def profile_hardware(created: str, run_benches: bool = True,
                     bench_dir: Optional[str] = None) -> HardwareProfile:
    """Detect GPU, platform and memory, optionally run the benchmarks, and bundle the result.

    With `run_benches=False` only identity detection is done and the bandwidth record
    just carries a "not-measured" method string.
    """
    # NOTE(review): `created` is accepted but not used anywhere in this function —
    # confirm whether HardwareProfile is meant to receive it.
    gpu = detect_gpu()
    plat = detect_platform()
    mem = detect_memory()
    if not run_benches:
        bw = BandwidthInfo(method="not-measured (--no-bench): identity detection only")
    else:
        bw = measure_bandwidth(bench_dir=bench_dir)
        _probe_pinnable(mem)
        cpu_bw = measure_cpu_mem_bw()
        mem.cpu_mem_bw_gbps = cpu_bw["gbps"]
        mem.cpu_mem_bw_method = cpu_bw["method"]
    return HardwareProfile(gpu=gpu, platform=plat, bandwidth=bw, memory=mem)