gpu-container 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpu_container/__init__.py +9 -0
- gpu_container/__main__.py +60 -0
- gpu_container/errors.py +72 -0
- gpu_container/planner/__init__.py +17 -0
- gpu_container/planner/activation.py +225 -0
- gpu_container/planner/calibration.py +224 -0
- gpu_container/planner/calibration_seed.json +44 -0
- gpu_container/planner/cli.py +101 -0
- gpu_container/planner/concentration_cli.py +120 -0
- gpu_container/planner/placement.py +198 -0
- gpu_container/planner/receipt.py +155 -0
- gpu_container/planner/receipt_cli.py +143 -0
- gpu_container/profiler/__init__.py +24 -0
- gpu_container/profiler/baseline.py +122 -0
- gpu_container/profiler/cli.py +151 -0
- gpu_container/profiler/cuda_bench.py +306 -0
- gpu_container/profiler/hardware.py +304 -0
- gpu_container/profiler/model.py +178 -0
- gpu_container/profiler/nvme_bench.py +158 -0
- gpu_container/profiler/schema.py +245 -0
- gpu_container/watchdog.py +563 -0
- gpu_container-0.1.0.dist-info/METADATA +100 -0
- gpu_container-0.1.0.dist-info/RECORD +26 -0
- gpu_container-0.1.0.dist-info/WHEEL +4 -0
- gpu_container-0.1.0.dist-info/entry_points.txt +7 -0
- gpu_container-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""Model profiler — analyze a model's architecture + memory growth before loading.
|
|
2
|
+
|
|
3
|
+
What's REAL here today (closed-form, deterministic — docker-knowledge throughput-prediction
|
|
4
|
+
finding "Memory is exact"):
|
|
5
|
+
- KV-cache bytes/token and at a given context (linear in context).
|
|
6
|
+
- dense vs MoE detection, expert structure, from a HuggingFace config dict.
|
|
7
|
+
|
|
8
|
+
What's STUBBED (needs the file on disk / a download — a later Phase-1 step):
|
|
9
|
+
- exact total_params and per-layer/per-expert byte accounting from safetensors/GGUF headers (until then, `total_params` is the closed-form estimate computed from the config).
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
from .schema import ExpertInfo, ModelProfile
|
|
16
|
+
|
|
17
|
+
# dtype name -> bytes per element
|
|
18
|
+
_DTYPE_BYTES = {
    # 4-byte floats
    "float32": 4.0,
    "float": 4.0,
    "fp32": 4.0,
    # 2-byte floats
    "float16": 2.0,
    "fp16": 2.0,
    "half": 2.0,
    "bfloat16": 2.0,
    "bf16": 2.0,
    # 1-byte floats
    "float8": 1.0,
    "fp8": 1.0,
    "e4m3": 1.0,
    "e5m2": 1.0,
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def kv_bytes_per_token(n_layers: int, n_kv_heads: int, head_dim: int, dtype_bytes: float = 2.0) -> int:
    """Return the KV-cache footprint of a single token, in bytes.

    The cache stores one K and one V vector per layer and per KV head, so

        bytes/token = 2 * n_layers * n_kv_heads * head_dim * dtype_bytes

    Per NVIDIA "Mastering LLM Techniques: Inference Optimization" (docker-knowledge
    throughput-prediction). GQA/MQA lower the figure because n_kv_heads < n_attention_heads.
    The product is truncated to an int.
    """
    elements_per_token = 2 * n_layers * n_kv_heads * head_dim  # K and V entries for one token
    return int(elements_per_token * dtype_bytes)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _dtype_bytes_from(cfg: dict) -> float:
    """Return bytes per element for the dtype declared in a HF-style config dict.

    Reads `torch_dtype`, then `dtype`, and assumes "bfloat16" when neither is set.
    The value is stringified, trimmed and lower-cased, and any dotted namespace prefix is
    dropped, so the plain name ("float32") and a stringified dtype object ("torch.float32")
    hit the same `_DTYPE_BYTES` entry. Names missing from the table fall back to 2.0.
    """
    declared = cfg.get("torch_dtype") or cfg.get("dtype") or "bfloat16"
    # Keep only the part after the last dot: "torch.float32" -> "float32".
    key = str(declared).strip().lower().rsplit(".", 1)[-1]
    return _DTYPE_BYTES.get(key, 2.0)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Effective GGUF bits-per-weight (incl. quant metadata overhead) — llama.cpp quant tables.
|
|
43
|
+
# Used to turn a closed-form param count into a realistic on-device byte footprint.
|
|
44
|
+
_BPW = {
    # 2- and 3-bit K-quants
    "q2_k": 3.35,
    "q3_k_s": 3.50,
    "q3_k_m": 3.91,
    "q3_k_l": 4.27,
    # 4-bit
    "q4_0": 4.55,
    "q4_1": 4.78,
    "q4_k_s": 4.57,
    "q4_k_m": 4.83,
    # 5-bit
    "q5_0": 5.54,
    "q5_1": 6.00,
    "q5_k_s": 5.52,
    "q5_k_m": 5.67,
    # 6- and 8-bit
    "q6_k": 6.56,
    "q8_0": 8.50,
    # i-quants
    "iq2_xxs": 2.06,
    "iq3_xxs": 3.06,
    "iq4_xs": 4.25,
    "iq4_nl": 4.50,
    # float formats
    "mxfp4": 4.25,
    "fp8": 8.0,
    "f16": 16.0,
    "bf16": 16.0,
    "f32": 32.0,
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def bytes_per_weight(quant: Optional[str], dtype_bytes: float = 2.0) -> float:
    """Translate a quant tag into bytes per weight (e.g. 'gguf-q4_k_m' -> 0.60).

    The tag is lower-cased, stripped of any "gguf-" text, and has "-" turned into "_"
    before the `_BPW` lookup. A missing/empty tag, or one not in the table, yields
    `dtype_bytes` unchanged.
    """
    if not quant:
        return dtype_bytes
    tag = quant.lower().replace("gguf-", "").replace("-", "_").strip()
    bits = _BPW.get(tag)
    # _BPW stores bits per weight; divide by 8 for bytes.
    return dtype_bytes if bits is None else bits / 8.0
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# Quants that quantize ONLY the routed experts, leaving attention/router/embeddings/head near f16
|
|
64
|
+
# (notably MXFP4 / gpt-oss). For these the always-resident non-expert weights are far heavier per
|
|
65
|
+
# parameter than the headline quant implies, so budgeting the GPU floor at the expert bpw would
|
|
66
|
+
# UNDER-count VRAM (the optimistic, OOM-prone direction). The split keeps None-not-guess honest.
|
|
67
|
+
# Entries use the normalized tag form (lower-case, "gguf-" removed, "-" replaced by "_") that
# non_expert_bytes_per_weight() builds before its membership test.
_EXPERT_ONLY_QUANTS = {"mxfp4"}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def non_expert_bytes_per_weight(quant: Optional[str], dtype_bytes: float = 2.0) -> float:
    """Bytes per weight for the tensors that are ALWAYS resident (everything but routed experts).

    Whole-model quants (Q4_K_M, Q8_0, ...) defer to `bytes_per_weight`, so the figure equals
    the expert one. Tags listed in `_EXPERT_ONLY_QUANTS` (MXFP4) leave the non-expert tensors
    near f16, so the result is at least 2.0 — a deliberately conservative upper bound on the
    GPU floor (over-budgeting VRAM is the SAFE side for a must-offload plan).
    """
    tag = quant.lower().replace("gguf-", "").replace("-", "_").strip() if quant else None
    if tag in _EXPERT_ONLY_QUANTS:
        headline_bytes = _BPW.get(tag, 8.0) / 8.0
        return max(2.0, headline_bytes)
    return bytes_per_weight(quant, dtype_bytes)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def estimate_param_split(cfg: dict, num_experts: Optional[int]) -> Optional[dict]:
    """Closed-form parameter split from a HF config — no safetensors needed.

    Args:
        cfg: HuggingFace-style config dict.
        num_experts: routed experts per MoE layer; None or 0 means a dense model.

    Returns:
        None when `hidden_size` or `num_hidden_layers` is missing/zero, otherwise a dict of ints:
        - `expert_total` / `expert_each` — routed-expert params (`--n-cpu-moe` can move these to CPU),
        - `non_expert` — attention + router + embeddings + head + shared experts + dense FFN layers
          (ALWAYS on the GPU),
        - `total` — `non_expert + expert_total`,
        - `n_moe_layers` — layers that carry routed experts.

    Approximations (documented, receipt-confirmed): SwiGLU expert/FFN = 3·H·I; biases/norms
    omitted (sub-1%); a real GGUF may keep embeddings/output at higher precision than the
    headline quant.

    Shared experts: the count comes from `n_shared_experts` / `num_shared_experts`. A config that
    declares `shared_expert_intermediate_size` WITHOUT a count is treated as having one shared
    expert of that width; ignoring it would under-count the always-resident weights.
    NOTE(review): this mirrors how Qwen-MoE-style configs appear to describe their shared expert —
    confirm against the target model family.
    """
    hidden = cfg.get("hidden_size")
    n_layers = cfg.get("num_hidden_layers")
    if not (hidden and n_layers):
        return None

    n_heads = cfg.get("num_attention_heads") or 0
    n_kv = cfg.get("num_key_value_heads") or n_heads
    head_dim = cfg.get("head_dim") or (hidden // n_heads if n_heads else 0)
    vocab = cfg.get("vocab_size") or 0
    inter = cfg.get("intermediate_size") or 0
    moe_inter = cfg.get("moe_intermediate_size") or inter
    tied = bool(cfg.get("tie_word_embeddings", False))
    n_dense = cfg.get("first_k_dense_replace") or cfg.get("num_dense_layers") or 0

    n_shared = cfg.get("n_shared_experts") or cfg.get("num_shared_experts") or 0
    shared_width = cfg.get("shared_expert_intermediate_size")
    if shared_width and not n_shared:
        # A declared shared-expert width implies at least one shared expert.
        n_shared = 1
    shared_inter = shared_width or moe_inter

    n_moe_layers = max(0, n_layers - n_dense) if num_experts else 0
    dense_count = n_layers - n_moe_layers  # = n_dense for MoE models, = n_layers for dense models

    expert_each = 3 * hidden * moe_inter if (num_experts and moe_inter) else 0
    expert_total = (num_experts or 0) * expert_each * n_moe_layers

    # q,o projections use all attention heads; k,v use the (possibly fewer) KV heads.
    attn_total = (2 * hidden * n_heads * head_dim + 2 * hidden * n_kv * head_dim) * n_layers
    router_total = (num_experts or 0) * hidden * n_moe_layers
    shared_total = (n_shared * 3 * hidden * shared_inter) * n_moe_layers if n_shared else 0
    dense_ffn_total = (3 * hidden * inter) * dense_count
    embed = vocab * hidden
    head = 0 if tied else vocab * hidden  # tied embeddings reuse the input matrix
    non_expert = attn_total + router_total + shared_total + dense_ffn_total + embed + head

    return {
        "expert_total": int(expert_total),
        "expert_each": int(expert_each),
        "non_expert": int(non_expert),
        "total": int(non_expert + expert_total),
        "n_moe_layers": int(n_moe_layers),
    }
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def analyze_config(config: dict, name: Optional[str] = None, quant: Optional[str] = None) -> ModelProfile:
    """Build a ModelProfile from a HuggingFace-style config.json dict.

    Recognizes the common MoE keys across Mixtral / Qwen-MoE / DeepSeek-V2/V3 / OLMoE.
    Fields it cannot determine are left None (never guessed).

    Args:
        config: parsed config.json contents.
        name: display name; falls back to `_name_or_path`, then `model_type`, then "unknown".
        quant: quant tag stored on the profile as-is.

    Returns:
        A ModelProfile whose parameter counts come from `estimate_param_split` and whose
        KV-cache figure comes from `kv_bytes_per_token` (only when layers, KV heads and head
        dim are all known).
    """
    name = name or config.get("_name_or_path") or config.get("model_type") or "unknown"
    n_layers = config.get("num_hidden_layers")
    n_attn_heads = config.get("num_attention_heads")
    n_kv_heads = config.get("num_key_value_heads") or n_attn_heads
    hidden = config.get("hidden_size")
    head_dim = config.get("head_dim")
    if head_dim is None and hidden and n_attn_heads:
        head_dim = hidden // n_attn_heads
    dtype_bytes = _dtype_bytes_from(config)

    # MoE detection — the key name varies by family. A count of 0 is normalized to None so the
    # architecture label agrees with estimate_param_split(), which treats 0 experts as dense.
    num_experts = (config.get("num_local_experts") or config.get("num_experts")
                   or config.get("n_routed_experts")) or None
    experts_per_token = (config.get("num_experts_per_tok") or config.get("num_experts_per_token")
                         or config.get("moe_topk"))
    is_moe = num_experts is not None

    # Accept the same two shared-expert count keys as estimate_param_split(); an explicit 0
    # under the first key is preserved.
    shared = config.get("n_shared_experts")
    if shared is None:
        shared = config.get("num_shared_experts")

    split = estimate_param_split(config, num_experts) or {}
    expert = ExpertInfo(
        is_moe=is_moe,
        num_experts=num_experts,
        experts_per_token=experts_per_token,
        shared_params=shared,
        expert_params_each=split.get("expert_each") or None,  # 0 means "not determined"
    )

    kvbpt = None
    if n_layers and n_kv_heads and head_dim:
        kvbpt = kv_bytes_per_token(n_layers, n_kv_heads, head_dim, dtype_bytes)

    return ModelProfile(
        name=name,
        architecture="moe" if is_moe else ("dense" if n_layers else "unknown"),
        total_params=split.get("total"),
        n_layers=n_layers,
        n_kv_heads=n_kv_heads,
        head_dim=head_dim,
        dtype_bytes=dtype_bytes,
        quant=quant,
        expert=expert,
        kv_bytes_per_token=kvbpt,
        expert_params_total=split.get("expert_total"),
        non_expert_params=split.get("non_expert"),
        n_moe_layers=split.get("n_moe_layers"),
    )
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""NVMe benchmark via fio — sequential AND random-QD1, on the path that actually matters.
|
|
2
|
+
|
|
3
|
+
docker-knowledge wave-2 `hw-measurement` spec:
|
|
4
|
+
- Two passes: SEQUENTIAL (`--rw=read --bs=256k --iodepth=64`) and RANDOM-QD1
|
|
5
|
+
(`--rw=randread --bs=4k --iodepth=1`). QD1 4k is the latency-bound figure the
|
|
6
|
+
cold-expert / KV-spill streaming path actually hits; the sequential headline overstates
|
|
7
|
+
offload throughput by ~10x, so the planner keys streaming math off the QD1 number.
|
|
8
|
+
- `--direct=1 --ioengine=libaio` is mandatory to bypass the OS page cache (otherwise we
|
|
9
|
+
measure RAM, not the SSD). If O_DIRECT is unsupported we REFUSE — never a silent
|
|
10
|
+
buffered fallback that reports a dishonest number.
|
|
11
|
+
- Target a bind-mounted / named volume on the ext4 vdisk. The container's overlay2 layer
|
|
12
|
+
breaks O_DIRECT and mismeasures; a `/mnt/<letter>` drvfs/9p path is ~5-10x slower than
|
|
13
|
+
real ext4. We detect the mount type and refuse the wrong filesystem rather than lie.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import json
|
|
18
|
+
import os
|
|
19
|
+
import shutil
|
|
20
|
+
import subprocess
|
|
21
|
+
from typing import Optional, Tuple
|
|
22
|
+
|
|
23
|
+
# Filesystems we must NOT measure on (they produce dishonest numbers).
|
|
24
|
+
# Entries are lower-case: measure_nvme() lower-cases the reported fstype before the membership test.
_BAD_FS = {"overlay", "overlayfs", "9p", "drvfs", "v9fs", "fuse.drvfs"}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _mount_for(path: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Return (mountpoint, fstype, device) for the filesystem backing `path`.

    Reads /proc/mounts and picks the longest mountpoint that is a prefix of the symlink-resolved
    `path`. When several entries share that mountpoint, the LAST one listed wins, because a
    later mount shadows an earlier one at the same location.

    Returns (None, None, None) where /proc/mounts is unavailable (e.g. a Windows host) or no
    entry covers the path.
    """

    def _unescape(field: str) -> str:
        # /proc/mounts octal-escapes space, tab, newline and backslash inside a field.
        # The backslash escape is decoded last so it cannot create a new escape sequence.
        return (field.replace("\\040", " ").replace("\\011", "\t")
                .replace("\\012", "\n").replace("\\134", "\\"))

    try:
        with open("/proc/mounts", "r", encoding="utf-8", errors="ignore") as f:
            entries = []
            for line in f:
                parts = line.split()
                if len(parts) >= 3:
                    # stored as (mountpoint, fstype, device)
                    entries.append((_unescape(parts[1]), parts[2], _unescape(parts[0])))
    except OSError:
        return (None, None, None)

    # Resolve symlinks so a link pointing into another filesystem is attributed to that filesystem.
    target = os.path.realpath(path)
    best = (None, None, None)
    best_len = -1
    for mp, fstype, dev in entries:
        covers = mp == "/" or target == mp or target.startswith(mp.rstrip("/") + "/")
        if covers and len(mp) >= best_len:  # '>=': a later entry for the same mountpoint wins
            best, best_len = (mp, fstype, dev), len(mp)
    return best
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def resolve_bench_dir(explicit: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]:
    """Choose the benchmark directory; returns (dir, None) or (None, reason).

    Precedence: the `explicit` argument, then the GPU_CONTAINER_BENCH_DIR environment
    variable, then `/bench` if that directory exists (the conventional bind-mount target,
    see Dockerfile).
    """
    chosen = explicit if explicit else os.environ.get("GPU_CONTAINER_BENCH_DIR")
    if not chosen and os.path.isdir("/bench"):
        chosen = "/bench"
    if chosen:
        return (chosen, None)
    reason = ("no bench dir: pass --bench-dir or mount an ext4 volume at /bench "
              "(-v <host-nvme-path>:/bench)")
    return (None, reason)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _run_fio(fio: str, testfile: str, rw: str, bs: str, qd: int,
             size_gib: int, runtime_s: int, ramp_s: int) -> dict:
    """Run one direct-I/O fio read pass and return its headline numbers.

    Args:
        fio: path of the fio executable.
        testfile: file fio reads from (passed as --filename).
        rw, bs, qd: fio --rw, --bs and --iodepth values.
        size_gib, runtime_s, ramp_s: fio --size (GiB), --runtime and --ramp_time values.

    Returns:
        On success `{"bw_bytes", "iops", "lat_us_mean"}` taken from the first job's read
        section (latency converted from ns to µs). On any failure a dict with an "error"
        key (plus truncated "stderr" where available). Never raises for launch, timeout or
        parse problems.
    """
    cmd = [
        fio, "--name=gpc", f"--filename={testfile}",
        "--direct=1", "--ioengine=libaio",
        f"--rw={rw}", f"--bs={bs}", f"--iodepth={qd}", "--numjobs=1",
        f"--size={size_gib}G", "--time_based", f"--runtime={runtime_s}",
        f"--ramp_time={ramp_s}", "--group_reporting", "--output-format=json",
    ]
    try:
        p = subprocess.run(cmd, capture_output=True, text=True, timeout=runtime_s + ramp_s + 180)
    except (OSError, subprocess.SubprocessError) as e:
        # OSError covers a missing binary as well as a non-executable one (PermissionError);
        # SubprocessError covers the timeout.
        return {"error": f"fio invocation failed: {e}"}

    blob = (p.stdout or "") + "\n" + (p.stderr or "")
    if "O_DIRECT" in blob or "does not support" in blob or "Operation not supported" in blob:
        return {"error": "O_DIRECT unsupported on this path (overlay fs?) — refusing buffered "
                         "fallback; mount an ext4 volume", "stderr": (p.stderr or "")[:400]}
    if p.returncode != 0:
        return {"error": f"fio exit {p.returncode}", "stderr": (p.stderr or "")[:400]}

    try:
        read = json.loads(p.stdout)["jobs"][0]["read"]
        if not isinstance(read, dict):
            raise TypeError(f"'read' section is {type(read).__name__}, expected an object")
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # TypeError: the payload parsed but did not have the expected object/list shape.
        return {"error": f"could not parse fio json: {e}"}

    lat = read.get("lat_ns") or read.get("clat_ns") or {}
    mean_ns = lat.get("mean")
    return {
        "bw_bytes": read.get("bw_bytes"),  # bytes/sec
        "iops": read.get("iops"),
        "lat_us_mean": round(mean_ns / 1000.0, 2) if mean_ns is not None else None,
    }
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def measure_nvme(bench_dir: Optional[str] = None, size_gib: int = 4,
                 runtime_s: int = 8, ramp_s: int = 2) -> dict:
    """Run the seq + random-QD1 fio passes on a validated mount. Honest dict, never raises.

    Args:
        bench_dir: directory to benchmark; resolved through `resolve_bench_dir` when omitted.
        size_gib: fio test-file size in GiB.
        runtime_s: measured duration of each pass, in seconds.
        ramp_s: warm-up time before measurement, in seconds.

    Returns:
        A dict with the measurement keys (`seq_read_gbps`, `rand_qd1_iops`, `rand_qd1_mbps`,
        `rand_qd1_lat_us`), provenance keys (`fs_type`, `mount`, `bench_dir`, `direct`,
        `ioengine`, `size_gib`) and, when anything was refused or failed, an `error` string.
        Values that were not measured stay None.
    """
    out: dict = {
        "seq_read_gbps": None, "rand_qd1_iops": None, "rand_qd1_mbps": None,
        "rand_qd1_lat_us": None, "fs_type": None, "mount": None, "bench_dir": None,
        "direct": True, "ioengine": "libaio", "size_gib": size_gib,
    }
    fio = shutil.which("fio")
    if not fio:
        out["error"] = "fio not found (install fio in the container)"
        return out

    target, reason = resolve_bench_dir(bench_dir)
    if target is None:
        out["error"] = reason
        return out
    out["bench_dir"] = target

    try:
        os.makedirs(target, exist_ok=True)
    except OSError as e:
        out["error"] = f"bench dir not writable: {e}"
        return out

    mp, fstype, _dev = _mount_for(target)
    out["mount"], out["fs_type"] = mp, fstype
    if fstype and fstype.lower() in _BAD_FS:
        out["error"] = (f"refusing to benchmark fs '{fstype}' at {mp}: overlay/drvfs/9p "
                        f"mismeasure NVMe — mount an ext4 volume at {target}")
        return out

    # Fallback for `/mnt/<letter>` drive-style paths whose fstype was not in _BAD_FS. Only the
    # single-letter form is refused, so a real volume mounted at e.g. /mnt/nvme stays usable.
    parts = target.split("/")
    is_drive_mount = (len(parts) >= 3 and parts[0] == "" and parts[1] == "mnt"
                      and len(parts[2]) == 1 and parts[2].isalpha())
    if is_drive_mount and fstype is not None:
        out["error"] = f"refusing /mnt drvfs path {target} (~5-10x slower than ext4)"
        return out

    testfile = os.path.join(target, ".gpu_container_fio_test.bin")
    try:
        seq = _run_fio(fio, testfile, "read", "256k", 64, size_gib, runtime_s, ramp_s)
        qd1 = _run_fio(fio, testfile, "randread", "4k", 1, size_gib, runtime_s, ramp_s)
    finally:
        # Best-effort cleanup: a leftover test file must not turn a measurement into a crash.
        try:
            if os.path.exists(testfile):
                os.remove(testfile)
        except OSError:
            pass

    errors = []
    if "error" in seq:
        errors.append("seq: " + seq["error"])
    elif seq.get("bw_bytes"):
        out["seq_read_gbps"] = round(seq["bw_bytes"] / 1e9, 3)
    if "error" in qd1:
        errors.append("qd1: " + qd1["error"])
    else:
        if qd1.get("iops") is not None:
            out["rand_qd1_iops"] = round(qd1["iops"], 1)
        if qd1.get("bw_bytes"):
            out["rand_qd1_mbps"] = round(qd1["bw_bytes"] / 1e6, 2)
        out["rand_qd1_lat_us"] = qd1.get("lat_us_mean")
    if errors:
        out["error"] = " | ".join(errors)
    return out
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""Profile schema — the contract every downstream component (planner, receipt) reads.
|
|
2
|
+
|
|
3
|
+
A `Profile` = `HardwareProfile` + optional `ModelProfile` + provenance, fully
|
|
4
|
+
JSON-serializable. Measurement fields are `Optional` and default to `None`: a value
|
|
5
|
+
of `None` means "not measured / unknown", and the planner MUST NOT treat it as zero
|
|
6
|
+
or assume a spec-sheet number (docker-knowledge wave-1, lane `hw-measurement`: honest
|
|
7
|
+
refusal depends on honest inputs; consumer cards are PCIe-bound and NVMe random-QD1 is
|
|
8
|
+
far below sequential, so guessing here silently corrupts every downstream plan).
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
from dataclasses import asdict, dataclass, field, fields, is_dataclass
|
|
14
|
+
from typing import Any, List, Optional
|
|
15
|
+
|
|
16
|
+
# Version string of the profile schema defined in this module.
SCHEMA_VERSION = "0.1.0"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class GpuInfo:
    """Identity, VRAM and PCIe link facts for one GPU; anything not determined stays None."""

    name: str
    vram_total_mib: Optional[int] = None
    vram_free_mib: Optional[int] = None
    # Driver-reserved VRAM; reported separately by pynvml v2 only (v1 folds it into 'used').
    vram_reserved_mib: Optional[int] = None
    driver_version: Optional[str] = None
    cuda_version: Optional[str] = None
    # Dotted form, e.g. "12.0" for sm_120 (desktop Blackwell).
    compute_capability: Optional[str] = None
    pcie_gen: Optional[int] = None
    # Lane count, e.g. 16.
    pcie_width: Optional[int] = None
    # Provenance of the VRAM figures: "pynvml-v2" | "pynvml-v1" | "nvidia-smi".
    vram_source: Optional[str] = None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class PlatformInfo:
    """Operating system and container-runtime facts for the host being profiled."""

    # "windows" | "linux"
    os: str
    in_container: bool = False
    wsl2: bool = False
    # "docker" | None
    container_runtime: Optional[str] = None
    # Whether the NVIDIA Container Toolkit is wired in.
    nvidia_runtime: Optional[bool] = None
    # The load-bearing positioning (docker-knowledge lane `container-runtime`):
    # CUDA UVM oversubscription is unavailable on windows/wsl2 -> explicit placement only.
    uvm_oversubscription: Optional[bool] = None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class BandwidthInfo:
    """Measured, never spec-sheet. None until benchmarked (hardened in docker-knowledge wave-2)."""

    # Achieved pinned host-to-device rate (~50-55 on Gen5; never the 64 theoretical).
    pcie_h2d_gbps: Optional[float] = None
    # Device-to-host, measured separately — asymmetry is real.
    pcie_d2h_gbps: Optional[float] = None
    # Sequential NVMe read: an optimistic ceiling only.
    nvme_seq_read_gbps: Optional[float] = None
    # Random QD1 IOPS: the figure a sequential assumption gets wrong.
    nvme_rand_qd1_read_iops: Optional[float] = None
    # Random QD1 throughput: what cold-expert / KV-spill streaming math keys off.
    nvme_rand_qd1_read_mbps: Optional[float] = None
    # How it was measured (provenance summary).
    method: Optional[str] = None
    # Structured provenance: buffer sizes, fs, samples, flags.
    details: Optional[dict] = None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class MemoryInfo:
    """Host RAM capacity, pinnable-memory ceiling and CPU memory bandwidth; None = not measured."""

    ram_total_gib: Optional[float] = None
    ram_available_gib: Optional[float] = None
    # WSL2 limits this (docker-knowledge container-runtime).
    pinnable_ceiling_gib: Optional[float] = None
    # How the ceiling was probed (provenance).
    pinnable_method: Optional[str] = None
    # True => the probe hit its max, so the ceiling is only a lower bound.
    pinnable_capped: Optional[bool] = None
    # Measured CPU RAM bandwidth — the MoE-offload throughput input.
    cpu_mem_bw_gbps: Optional[float] = None
    # How the bandwidth was measured (provenance).
    cpu_mem_bw_method: Optional[str] = None
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class HardwareProfile:
    """Hardware half of a profile: GPU and platform facts plus bandwidth/memory measurements."""

    gpu: GpuInfo
    platform: PlatformInfo
    # Both measurement groups default to a fresh instance whose fields are all None.
    bandwidth: BandwidthInfo = field(default_factory=BandwidthInfo)
    memory: MemoryInfo = field(default_factory=MemoryInfo)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
class ExpertInfo:
    """Mixture-of-experts structure of a model; every count is None when unknown."""

    is_moe: bool = False
    num_experts: Optional[int] = None
    # top-k: experts consulted per token
    experts_per_token: Optional[int] = None
    shared_params: Optional[int] = None
    expert_params_each: Optional[int] = None
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@dataclass
class ModelProfile:
    """Architecture and memory-growth facts about one model; undetermined fields stay None."""

    name: str
    # "dense" | "moe" | "unknown"
    architecture: str = "unknown"
    total_params: Optional[int] = None
    n_layers: Optional[int] = None
    n_kv_heads: Optional[int] = None
    head_dim: Optional[int] = None
    # fp16/bf16 default
    dtype_bytes: float = 2.0
    # "gguf-q4_k_m" | "gptq" | "awq" | "fp8" | None
    quant: Optional[str] = None
    expert: ExpertInfo = field(default_factory=ExpertInfo)
    # Closed-form; see model.kv_bytes_per_token().
    kv_bytes_per_token: Optional[int] = None
    # Closed-form param split (model.analyze_config) — the planner's placement math keys off these.
    # `expert_params_total` is the part `--n-cpu-moe` can move to CPU; `non_expert_params` (attention,
    # router, embeddings, head, shared experts, dense layers) ALWAYS stays on the GPU.
    expert_params_total: Optional[int] = None
    non_expert_params: Optional[int] = None
    # Layers that actually carry routed experts.
    n_moe_layers: Optional[int] = None

    def kv_bytes_at(self, context_tokens: int, batch: int = 1) -> Optional[int]:
        """KV-cache bytes for `batch` sequences of `context_tokens` tokens each.

        Linear in context (docker-knowledge throughput-prediction). Returns None when the
        per-token figure is unknown.
        """
        per_token = self.kv_bytes_per_token
        return None if per_token is None else per_token * context_tokens * batch
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@dataclass
class Profile:
    """Top-level profile document: schema version, creation date, hardware, optional model, notes."""

    schema_version: str
    # ISO date, passed in (workflows/runners have no clock).
    created: str
    hardware: HardwareProfile
    model: Optional[ModelProfile] = None
    notes: List[str] = field(default_factory=list)

    # --- serialization -------------------------------------------------------
    def to_dict(self) -> dict:
        """Return the profile, nested dataclasses included, as plain dicts/lists."""
        return asdict(self)

    def to_json(self, indent: int = 2) -> str:
        """Serialize to a JSON string, keeping non-ASCII characters unescaped."""
        payload = self.to_dict()
        return json.dumps(payload, ensure_ascii=False, indent=indent)

    @classmethod
    def from_dict(cls, d: dict) -> "Profile":
        """Rebuild a Profile from the mapping produced by `to_dict`."""
        return _build(cls, d)

    @classmethod
    def from_json(cls, s: str) -> "Profile":
        """Rebuild a Profile from the string produced by `to_json`."""
        decoded = json.loads(s)
        return cls.from_dict(decoded)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@dataclass
class PlacementPlan:
    """The planner's output: how to place an MoE model across VRAM/RAM for a runtime, with a
    predicted memory map + throughput and an honest ship/refuse verdict (>1 tok/s floor)."""

    fits: bool
    # "ship" | "refuse"
    verdict: str
    runtime: str = "llama.cpp"
    # The --n-cpu-moe value (MoE layers whose experts -> CPU RAM).
    n_cpu_moe: Optional[int] = None
    n_moe_layers: Optional[int] = None
    # The exact flag string to launch with.
    llama_flags: Optional[str] = None
    vram_budget_mib: Optional[float] = None
    vram_used_mib: Optional[float] = None
    # CPU-resident expert bytes (regular host RAM).
    ram_used_mib: Optional[float] = None
    # The CALIBRATED forecast (== ceiling when uncalibrated).
    predicted_decode_tok_s: Optional[float] = None
    # The roofline upper bound (real decode is a fraction of it).
    ceiling_decode_tok_s: Optional[float] = None
    # Calibrated band low (None when uncalibrated).
    predicted_band_low_tok_s: Optional[float] = None
    # Calibrated band high.
    predicted_band_high_tok_s: Optional[float] = None
    # "in-VRAM (±10%)" | "cpu-offload estimate — confirmed by receipt"
    throughput_basis: Optional[str] = None
    # How the forecast was derived (calibrated vs raw ceiling).
    calibration_basis: Optional[str] = None
    # Receipts informing the forecast (None/0 = uncalibrated).
    calibration_n_samples: Optional[int] = None
    floor_tok_s: float = 1.0
    # Plan summary / contrastive refusal frame.
    message: Optional[str] = None
    # cpu_mem_bw_gbps, vram_bw_gbps, bpw, ctx, overhead_mib
    assumptions: Optional[dict] = None

    def to_dict(self) -> dict:
        """Return the plan as a plain dict."""
        return asdict(self)

    def to_json(self, indent: int = 2) -> str:
        """Serialize to a JSON string, keeping non-ASCII characters unescaped."""
        payload = self.to_dict()
        return json.dumps(payload, ensure_ascii=False, indent=indent)

    @classmethod
    def from_dict(cls, d: dict) -> "PlacementPlan":
        """Rebuild a plan from the mapping produced by `to_dict`."""
        return _build(cls, d)

    @classmethod
    def from_json(cls, s: str) -> "PlacementPlan":
        """Rebuild a plan from the string produced by `to_json`."""
        decoded = json.loads(s)
        return cls.from_dict(decoded)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
@dataclass
class Receipt:
    """The measured proof — a DIFFERENT mechanism (measurement) verifying the planner's forecast.

    Serializes with `to_dict` / `to_json` and loads back with `from_dict` / `from_json`, the
    same round-trip surface that `Profile` and `PlacementPlan` expose.
    """

    runtime: str = "llama.cpp"
    n_cpu_moe: Optional[int] = None
    measured_decode_tok_s: Optional[float] = None
    measured_prefill_tok_s: Optional[float] = None
    measured_vram_used_mib: Optional[float] = None
    # The plan's CALIBRATED forecast.
    predicted_decode_tok_s: Optional[float] = None
    # The plan's roofline upper bound.
    ceiling_decode_tok_s: Optional[float] = None
    # 100*(measured-predicted)/predicted vs the calibrated forecast.
    decode_error_pct: Optional[float] = None
    # 100*measured/ceiling — the calibration seed.
    realized_efficiency_pct: Optional[float] = None
    # Did measured land inside the calibrated band? (the proof)
    within_band: Optional[bool] = None
    cleared_floor: Optional[bool] = None
    # Optional per-expert routing de-risk (set when the receipt is built with --trace; activation.py gate):
    # would a hot-expert cache help THIS workload?
    routing_cache_helps: Optional[bool] = None
    # Fraction of experts for the routing-coverage target.
    routing_hot_frac_for_coverage: Optional[float] = None
    # 1 - normalized entropy (0 uniform, 1 peaked).
    routing_concentration: Optional[float] = None
    # Optional safety envelope (set when the receipt is built with --peaks from a supervised run; watchdog.py):
    # worst GPU power draw % observed during the run.
    peak_gpu_power_pct: Optional[float] = None
    peak_gpu_temp_c: Optional[float] = None
    peak_gpu_vram_used_mib: Optional[float] = None
    # THE incident metric — worst host memory % during the run.
    peak_host_mem_pct: Optional[float] = None
    # Lowest free host RAM seen.
    min_host_avail_mib: Optional[float] = None
    # How many watchdog polls backed the envelope.
    safety_samples: Optional[int] = None
    # True => no watchdog abort fired across the run.
    stayed_within_envelope: Optional[bool] = None
    # How it was measured (provenance).
    method: Optional[str] = None
    notes: List[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the receipt as a plain dict."""
        return asdict(self)

    def to_json(self, indent: int = 2) -> str:
        """Serialize to a JSON string, keeping non-ASCII characters unescaped."""
        return json.dumps(self.to_dict(), indent=indent, ensure_ascii=False)

    @classmethod
    def from_dict(cls, d: dict) -> "Receipt":
        """Rebuild a receipt from the mapping produced by `to_dict` (unknown keys are ignored)."""
        return _build(cls, d)

    @classmethod
    def from_json(cls, s: str) -> "Receipt":
        """Rebuild a receipt from the string produced by `to_json`."""
        return cls.from_dict(json.loads(s))
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _build(dc_type: Any, value: Any) -> Any:
    """Rebuild a (possibly nested) dataclass instance from plain dicts.

    `value` is returned untouched when it is None or when `dc_type` is not a dataclass.
    Otherwise each key of `value` that names a field of `dc_type` becomes a constructor
    argument; a dict stored under a field whose annotation resolves to a registered
    dataclass (via `_strip_optional` and `_GLOBALS`) is rebuilt recursively.

    Tolerant by design: keys that are not fields are dropped and absent fields keep their
    defaults, so an older profile.json still loads against a newer schema.
    """
    if value is None or not is_dataclass(dc_type):
        return value

    declared = {f.name: f.type for f in fields(dc_type)}
    init_args = {}
    for key, raw in value.items():
        if key in declared:
            # Resolve the field annotation to a registered dataclass, if it names one.
            child_type = _GLOBALS.get(_strip_optional(declared[key]))
            if child_type and isinstance(raw, dict):
                init_args[key] = _build(child_type, raw)
            else:
                init_args[key] = raw
    return dc_type(**init_args)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _strip_optional(t: Any) -> str:
    """Best-effort: map a field's type annotation to a bare dataclass name for nesting.

    The annotation is reduced to text — the string itself, a class's `__name__`, or `str()` of
    anything else — and split into identifier tokens. The first token that is a key of
    `_GLOBALS` is returned; if none matches, the text is returned unchanged.

    Matching whole identifiers (rather than substrings) keeps a longer name that merely
    contains a registered name from resolving to the wrong dataclass. `__name__` is trusted only
    for real classes; NOTE(review): typing constructs such as Optional[...] appear to expose
    their own `__name__` on newer Python versions, which would hide the wrapped class — confirm.
    """
    if isinstance(t, str):
        s = t
    elif isinstance(t, type):
        s = t.__name__
    else:
        s = str(t)
    # annotations may arrive as "Optional[ExpertInfo]" / "ExpertInfo" depending on import style
    tokens = "".join(ch if (ch.isalnum() or ch == "_") else " " for ch in s).split()
    for token in tokens:
        if token in _GLOBALS:
            return token
    return s
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
# Registry of the dataclasses that may appear as nested fields, keyed by class name.
_GLOBALS = {
    cls.__name__: cls
    for cls in (
        GpuInfo,
        PlatformInfo,
        BandwidthInfo,
        MemoryInfo,
        HardwareProfile,
        ExpertInfo,
        ModelProfile,
    )
}
|