gpu-container 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,178 @@
1
+ """Model profiler — analyze a model's architecture + memory growth before loading.
2
+
3
+ What's REAL here today (closed-form, deterministic — docker-knowledge throughput-prediction
4
+ finding "Memory is exact"):
5
+ - KV-cache bytes/token and at a given context (linear in context).
6
+ - dense vs MoE detection, expert structure, from a HuggingFace config dict.
7
+
8
+ What's STUBBED (needs the file on disk / a download — a later Phase-1 step):
9
+ - total_params and per-layer/per-expert byte accounting from safetensors/GGUF headers.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ from typing import Optional
14
+
15
+ from .schema import ExpertInfo, ModelProfile
16
+
17
# dtype name -> bytes per element.
# Keys are lower-case spellings; `_dtype_bytes_from` lower-cases the config value before the
# lookup and falls back to 2.0 for any name that is not listed here.
_DTYPE_BYTES = {
    "float32": 4.0, "float": 4.0, "fp32": 4.0,
    "float16": 2.0, "fp16": 2.0, "half": 2.0,
    "bfloat16": 2.0, "bf16": 2.0,
    "float8": 1.0, "fp8": 1.0, "e4m3": 1.0, "e5m2": 1.0,
}
24
+
25
+
26
def kv_bytes_per_token(n_layers: int, n_kv_heads: int, head_dim: int, dtype_bytes: float = 2.0) -> int:
    """Return the closed-form KV-cache footprint of one token, in bytes.

    Every layer keeps one K and one V vector per KV head, so:

        bytes/token = 2 (K and V) * n_layers * n_kv_heads * head_dim * dtype_bytes

    Per NVIDIA "Mastering LLM Techniques: Inference Optimization" (docker-knowledge
    throughput-prediction). GQA/MQA shrink the figure because n_kv_heads < n_attention_heads.
    The product is truncated to an int before it is returned.
    """
    elements_per_layer = n_kv_heads * head_dim
    kv_elements = 2 * n_layers * elements_per_layer  # the factor 2 covers K and V
    return int(kv_elements * dtype_bytes)
35
+
36
+
37
def _dtype_bytes_from(cfg: dict) -> float:
    """Return bytes per element for the dtype declared in a config dict.

    Reads `torch_dtype`, then `dtype`, and assumes "bfloat16" when neither is set. The value
    is stringified, trimmed and lower-cased before the `_DTYPE_BYTES` lookup; names that are
    not in the table fall back to 2.0.

    A leading "torch." is dropped so that a `torch.dtype` object (whose str() is e.g.
    "torch.float32") or an already-stringified "torch.float32" resolves to the right width
    instead of silently hitting the 2.0 fallback.
    """
    raw = cfg.get("torch_dtype") or cfg.get("dtype") or "bfloat16"
    key = str(raw).strip().lower()
    if key.startswith("torch."):
        key = key[len("torch."):]
    return _DTYPE_BYTES.get(key, 2.0)
40
+
41
+
42
# Effective GGUF bits-per-weight (incl. quant metadata overhead) — llama.cpp quant tables.
# Used to turn a closed-form param count into a realistic on-device byte footprint.
_BPW = {
    "q2_k": 3.35, "q3_k_s": 3.50, "q3_k_m": 3.91, "q3_k_l": 4.27,
    "q4_0": 4.55, "q4_1": 4.78, "q4_k_s": 4.57, "q4_k_m": 4.83,
    "q5_0": 5.54, "q5_1": 6.00, "q5_k_s": 5.52, "q5_k_m": 5.67,
    "q6_k": 6.56, "q8_0": 8.50,
    "iq2_xxs": 2.06, "iq3_xxs": 3.06, "iq4_xs": 4.25, "iq4_nl": 4.50,
    "mxfp4": 4.25, "fp8": 8.0, "f16": 16.0, "bf16": 16.0, "f32": 32.0,
}


def _quant_key(quant: str) -> str:
    """Normalize a quant tag into a `_BPW` key: lower-case, drop "gguf-", "-" -> "_", trim."""
    return quant.lower().replace("gguf-", "").replace("-", "_").strip()


def bytes_per_weight(quant: Optional[str], dtype_bytes: float = 2.0) -> float:
    """Return bytes per weight for a quant tag, or `dtype_bytes` when the tag is unknown.

    Example: 'gguf-q4_k_m' maps to 4.83 bits, i.e. about 0.60 bytes per weight. An empty or
    None tag, and any tag missing from `_BPW`, yields `dtype_bytes` unchanged.
    """
    if not quant:
        return dtype_bytes
    bits = _BPW.get(_quant_key(quant))
    return dtype_bytes if bits is None else bits / 8.0


# Quants that quantize ONLY the routed experts, leaving attention/router/embeddings/head near f16
# (notably MXFP4 / gpt-oss). For these the always-resident non-expert weights are far heavier per
# parameter than the headline quant implies, so budgeting the GPU floor at the expert bpw would
# UNDER-count VRAM (the optimistic, OOM-prone direction). The split keeps None-not-guess honest.
_EXPERT_ONLY_QUANTS = {"mxfp4"}


def non_expert_bytes_per_weight(quant: Optional[str], dtype_bytes: float = 2.0) -> float:
    """Return bytes/weight for the ALWAYS-RESIDENT (non-expert) tensors.

    Whole-model quants (Q4_K_M, Q8_0, ...) use the same figure as `bytes_per_weight`. For an
    expert-only quant (MXFP4) the non-expert tensors stay near f16, so the result is at least
    2.0 — a conservative upper bound on the GPU floor (slightly over-budgets VRAM, which is
    the SAFE side for a must-offload plan).
    """
    key = _quant_key(quant) if quant else None
    if key is not None and key in _EXPERT_ONLY_QUANTS:
        # Never report less than f16 for the tensors the quant leaves untouched.
        return max(2.0, _BPW.get(key, 8.0) / 8.0)
    return bytes_per_weight(quant, dtype_bytes)
82
+
83
+
84
def estimate_param_split(cfg: dict, num_experts: Optional[int]) -> Optional[dict]:
    """Closed-form parameter split from a HF config — no safetensors needed.

    Args:
        cfg: HuggingFace-style config dict. `hidden_size` and `num_hidden_layers` are
            required; every other key is optional and counts as 0 when absent.
        num_experts: routed experts per MoE layer, or None/0 for a dense model.

    Returns:
        None when `hidden_size` or `num_hidden_layers` is missing/zero, otherwise a dict of
        ints with the quantities the placement planner keys off:
        - `expert_total` / `expert_each` — routed-expert params (`--n-cpu-moe` can move these
          to CPU),
        - `non_expert` — attention + router + embeddings + head + shared experts + dense
          layers (ALWAYS on the GPU),
        - `total` and `n_moe_layers`.

    Approximations (documented, receipt-confirmed): SwiGLU expert = 3·H·I; biases/norms omitted
    (sub-1%); a real GGUF may keep embeddings/output at higher precision than the headline quant.
    """
    hidden = cfg.get("hidden_size")
    n_layers = cfg.get("num_hidden_layers")
    if not (hidden and n_layers):
        return None

    n_heads = cfg.get("num_attention_heads") or 0
    n_kv = cfg.get("num_key_value_heads") or n_heads
    head_dim = cfg.get("head_dim") or (hidden // n_heads if n_heads else 0)
    vocab = cfg.get("vocab_size") or 0
    inter = cfg.get("intermediate_size") or 0
    moe_inter = cfg.get("moe_intermediate_size") or inter
    tied = bool(cfg.get("tie_word_embeddings", False))
    n_dense = cfg.get("first_k_dense_replace") or cfg.get("num_dense_layers") or 0

    n_shared = cfg.get("n_shared_experts") or cfg.get("num_shared_experts") or 0
    shared_inter_cfg = cfg.get("shared_expert_intermediate_size")
    if not n_shared and shared_inter_cfg:
        # Some families (e.g. Qwen2-MoE) declare a shared expert only through its
        # intermediate size, with no count key. Without this the shared expert would be
        # dropped and the always-resident (GPU) params under-counted — the unsafe direction.
        n_shared = 1
    shared_inter = shared_inter_cfg or moe_inter

    n_moe_layers = max(0, n_layers - n_dense) if num_experts else 0
    dense_count = n_layers - n_moe_layers  # dense FFN layers (= n_dense for MoE, = L for dense models)

    expert_each = 3 * hidden * moe_inter if (num_experts and moe_inter) else 0
    expert_total = (num_experts or 0) * expert_each * n_moe_layers

    # q,o projections use n_heads; k,v projections use n_kv (smaller under GQA/MQA).
    attn_total = (2 * hidden * n_heads * head_dim + 2 * hidden * n_kv * head_dim) * n_layers
    router_total = (num_experts or 0) * hidden * n_moe_layers
    shared_total = (n_shared * 3 * hidden * shared_inter) * n_moe_layers if n_shared else 0
    dense_ffn_total = (3 * hidden * inter) * dense_count
    embed = vocab * hidden
    head = 0 if tied else vocab * hidden  # a tied output head reuses the embedding matrix
    non_expert = attn_total + router_total + shared_total + dense_ffn_total + embed + head

    return {
        "expert_total": int(expert_total), "expert_each": int(expert_each),
        "non_expert": int(non_expert), "total": int(non_expert + expert_total),
        "n_moe_layers": int(n_moe_layers),
    }
127
+
128
+
129
def analyze_config(config: dict, name: Optional[str] = None, quant: Optional[str] = None) -> ModelProfile:
    """Build a ModelProfile from a HuggingFace-style config.json dict.

    Recognizes the common MoE keys across Mixtral / Qwen-MoE / DeepSeek-V2/V3 / OLMoE.
    Fields it cannot determine are left None (never guessed).

    Args:
        config: parsed config.json contents.
        name: display name; falls back to `_name_or_path`, then `model_type`, then "unknown".
        quant: quant tag stored on the profile as-is (not interpreted here).

    Returns:
        A ModelProfile carrying the KV-cache bytes/token (when layers, KV heads and head dim
        are all known) and the closed-form parameter split from `estimate_param_split`.
    """
    name = name or config.get("_name_or_path") or config.get("model_type") or "unknown"
    n_layers = config.get("num_hidden_layers")
    n_attn_heads = config.get("num_attention_heads")
    n_kv_heads = config.get("num_key_value_heads") or n_attn_heads  # MHA: one KV head per head
    hidden = config.get("hidden_size")
    head_dim = config.get("head_dim")
    if head_dim is None and hidden and n_attn_heads:
        head_dim = hidden // n_attn_heads
    dtype_bytes = _dtype_bytes_from(config)

    # MoE detection — the key name varies by family
    num_experts = (config.get("num_local_experts") or config.get("num_experts")
                   or config.get("n_routed_experts"))
    experts_per_token = (config.get("num_experts_per_tok") or config.get("num_experts_per_token")
                         or config.get("moe_topk"))
    is_moe = num_experts is not None
    split = estimate_param_split(config, num_experts) or {}

    # Accept both spellings of the shared-expert key, matching estimate_param_split.
    # NOTE(review): the destination field is named `shared_params` but receives the config's
    # shared-expert value as-is — confirm the intended meaning with the schema owner.
    shared_experts = config.get("n_shared_experts")
    if shared_experts is None:
        shared_experts = config.get("num_shared_experts")

    expert = ExpertInfo(
        is_moe=is_moe,
        num_experts=num_experts,
        experts_per_token=experts_per_token,
        shared_params=shared_experts,
        expert_params_each=split.get("expert_each") or None,  # 0 means "not determined" -> None
    )

    kvbpt = None
    if n_layers and n_kv_heads and head_dim:
        kvbpt = kv_bytes_per_token(n_layers, n_kv_heads, head_dim, dtype_bytes)

    return ModelProfile(
        name=name,
        architecture="moe" if is_moe else ("dense" if n_layers else "unknown"),
        total_params=split.get("total"),
        n_layers=n_layers,
        n_kv_heads=n_kv_heads,
        head_dim=head_dim,
        dtype_bytes=dtype_bytes,
        quant=quant,
        expert=expert,
        kv_bytes_per_token=kvbpt,
        expert_params_total=split.get("expert_total"),
        non_expert_params=split.get("non_expert"),
        n_moe_layers=split.get("n_moe_layers"),
    )
@@ -0,0 +1,158 @@
1
+ """NVMe benchmark via fio — sequential AND random-QD1, on the path that actually matters.
2
+
3
+ docker-knowledge wave-2 `hw-measurement` spec:
4
+ - Two passes: SEQUENTIAL (`--rw=read --bs=256k --iodepth=64`) and RANDOM-QD1
5
+ (`--rw=randread --bs=4k --iodepth=1`). QD1 4k is the latency-bound figure the
6
+ cold-expert / KV-spill streaming path actually hits; the sequential headline overstates
7
+ offload throughput by ~10x, so the planner keys streaming math off the QD1 number.
8
+ - `--direct=1 --ioengine=libaio` is mandatory to bypass the OS page cache (otherwise we
9
+ measure RAM, not the SSD). If O_DIRECT is unsupported we REFUSE — never a silent
10
+ buffered fallback that reports a dishonest number.
11
+ - Target a bind-mounted / named volume on the ext4 vdisk. The container's overlay2 layer
12
+ breaks O_DIRECT and mismeasures; a `/mnt/<letter>` drvfs/9p path is ~5-10x slower than
13
+ real ext4. We detect the mount type and refuse the wrong filesystem rather than lie.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import os
19
+ import shutil
20
+ import subprocess
21
+ from typing import Optional, Tuple
22
+
23
# Filesystems we must NOT measure on (they produce dishonest numbers).
# Compared against the lower-cased fstype that `_mount_for` reports for the bench dir.
_BAD_FS = {"overlay", "overlayfs", "9p", "drvfs", "v9fs", "fuse.drvfs"}
25
+
26
+
27
def _unescape_mount_field(text: str) -> str:
    """Decode the octal escapes /proc/mounts uses for space, tab, newline and backslash."""
    if "\\" not in text:
        return text
    # The backslash escape is decoded last so a literal backslash is never re-interpreted.
    return (text.replace("\\040", " ").replace("\\011", "\t")
            .replace("\\012", "\n").replace("\\134", "\\"))


def _mount_for(path: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Return (mountpoint, fstype, device) for the filesystem backing `path`.

    Reads /proc/mounts and picks the longest mountpoint that is a prefix of the resolved
    `path`. Returns (None, None, None) where /proc/mounts is unavailable (e.g. a Windows
    host) or when no entry covers the path.

    Details that matter for an honest filesystem check:
    - `path` is resolved with `os.path.realpath`, so a symlinked bench dir is attributed to
      the filesystem it actually lives on rather than the one holding the symlink.
    - When several entries share a mountpoint (stacked mounts), the LAST one listed wins,
      because a later mount shadows an earlier one at the same location.
    - Mountpoints are un-escaped (`\\040` -> space, ...) before the comparison.
    """
    try:
        with open("/proc/mounts", "r", encoding="utf-8", errors="ignore") as f:
            entries = [
                (_unescape_mount_field(parts[1]), parts[2], parts[0])  # mountpoint, fstype, device
                for parts in (line.split() for line in f)
                if len(parts) >= 3
            ]
    except OSError:
        return (None, None, None)

    target = os.path.realpath(path)
    best: Tuple[Optional[str], Optional[str], Optional[str]] = (None, None, None)
    best_len = -1
    for mp, fstype, dev in entries:
        covers = mp == "/" or target == mp or target.startswith(mp.rstrip("/") + "/")
        # ">=" (not ">") lets a later entry for the same mountpoint replace an earlier one.
        if covers and len(mp) >= best_len:
            best, best_len = (mp, fstype, dev), len(mp)
    return best
49
+
50
+
51
def resolve_bench_dir(explicit: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]:
    """Choose the directory to benchmark and return `(dir, reason_if_none)`.

    Precedence: the `explicit` argument, then the GPU_CONTAINER_BENCH_DIR environment
    variable, then `/bench` if that directory exists. When none applies the first element
    is None and the second explains how to supply a directory.
    """
    chosen = explicit if explicit else os.environ.get("GPU_CONTAINER_BENCH_DIR")
    if not chosen:
        # the conventional bind-mount target (see Dockerfile)
        if os.path.isdir("/bench"):
            return ("/bench", None)
        hint = ("no bench dir: pass --bench-dir or mount an ext4 volume at /bench "
                "(-v <host-nvme-path>:/bench)")
        return (None, hint)
    return (chosen, None)
60
+
61
+
62
def _run_fio(fio: str, testfile: str, rw: str, bs: str, qd: int,
             size_gib: int, runtime_s: int, ramp_s: int) -> dict:
    """Run one fio read pass with O_DIRECT + libaio and return the parsed result.

    Args:
        fio: path to the fio executable.
        testfile: file fio reads from (passed as --filename).
        rw / bs / qd: fio --rw, --bs and --iodepth values.
        size_gib, runtime_s, ramp_s: fio --size (GiB), --runtime and --ramp_time.

    Returns:
        On success `{"bw_bytes", "iops", "lat_us_mean"}` (values may be None when fio omits
        them). On any failure a dict with an "error" message, plus "stderr" (first 400
        chars) where available. Never raises for launch, timeout or parse problems.
    """
    cmd = [
        fio, "--name=gpc", f"--filename={testfile}",
        "--direct=1", "--ioengine=libaio",
        f"--rw={rw}", f"--bs={bs}", f"--iodepth={qd}", "--numjobs=1",
        f"--size={size_gib}G", "--time_based", f"--runtime={runtime_s}",
        f"--ramp_time={ramp_s}", "--group_reporting", "--output-format=json",
    ]
    try:
        p = subprocess.run(cmd, capture_output=True, text=True, timeout=runtime_s + ramp_s + 180)
    except (OSError, subprocess.SubprocessError) as e:
        # OSError covers a missing binary as well as a non-executable / unreadable one;
        # SubprocessError covers the timeout.
        return {"error": f"fio invocation failed: {e}"}

    stdout = p.stdout or ""
    stderr = p.stderr or ""
    blob = stdout + "\n" + stderr
    # Checked before the exit code so an O_DIRECT refusal gets its specific message.
    if "O_DIRECT" in blob or "does not support" in blob or "Operation not supported" in blob:
        return {"error": "O_DIRECT unsupported on this path (overlay fs?) — refusing buffered "
                         "fallback; mount an ext4 volume", "stderr": stderr[:400]}
    if p.returncode != 0:
        return {"error": f"fio exit {p.returncode}", "stderr": stderr[:400]}

    try:
        # Tolerate notice lines printed ahead of the JSON document.
        start = stdout.find("{")
        j = json.loads(stdout[start:] if start > 0 else stdout)
        r = j["jobs"][0]["read"]
        if not isinstance(r, dict):
            raise ValueError("'read' section is not an object")
    except (ValueError, KeyError, IndexError, TypeError) as e:
        return {"error": f"could not parse fio json: {e}"}

    lat = r.get("lat_ns") or r.get("clat_ns") or {}
    return {
        "bw_bytes": r.get("bw_bytes"),  # bytes/sec
        "iops": r.get("iops"),
        "lat_us_mean": round(lat["mean"] / 1000.0, 2) if lat.get("mean") is not None else None,
    }
95
+
96
+
97
def measure_nvme(bench_dir: Optional[str] = None, size_gib: int = 4,
                 runtime_s: int = 8, ramp_s: int = 2) -> dict:
    """Run the sequential and random-QD1 fio passes on a validated mount.

    The returned dict always carries the result keys (None until measured) plus the
    provenance keys `fs_type`, `mount`, `bench_dir`, `direct`, `ioengine`, `size_gib`.
    Any refusal or failure is reported under "error" instead of being raised.
    """
    out: dict = {
        "seq_read_gbps": None, "rand_qd1_iops": None, "rand_qd1_mbps": None,
        "rand_qd1_lat_us": None, "fs_type": None, "mount": None, "bench_dir": None,
        "direct": True, "ioengine": "libaio", "size_gib": size_gib,
    }

    def refuse(message):
        # Record why nothing was measured and hand back the partially filled dict.
        out["error"] = message
        return out

    fio = shutil.which("fio")
    if not fio:
        return refuse("fio not found (install fio in the container)")

    target, reason = resolve_bench_dir(bench_dir)
    if target is None:
        return refuse(reason)
    out["bench_dir"] = target

    try:
        os.makedirs(target, exist_ok=True)
    except OSError as e:
        return refuse(f"bench dir not writable: {e}")

    mp, fstype, _dev = _mount_for(target)
    out["mount"] = mp
    out["fs_type"] = fstype
    if fstype and fstype.lower() in _BAD_FS:
        return refuse(f"refusing to benchmark fs '{fstype}' at {mp}: overlay/drvfs/9p "
                      f"mismeasure NVMe — mount an ext4 volume at {target}")
    if fstype is not None and target.startswith("/mnt/"):
        return refuse(f"refusing /mnt drvfs path {target} (~5-10x slower than ext4)")

    testfile = os.path.join(target, ".gpu_container_fio_test.bin")
    try:
        seq = _run_fio(fio, testfile, "read", "256k", 64, size_gib, runtime_s, ramp_s)
        qd1 = _run_fio(fio, testfile, "randread", "4k", 1, size_gib, runtime_s, ramp_s)
    finally:
        # Best-effort cleanup of the scratch file; a failure here must not mask the result.
        try:
            if os.path.exists(testfile):
                os.remove(testfile)
        except OSError:
            pass

    problems = []

    if "error" in seq:
        problems.append("seq: " + seq["error"])
    else:
        seq_bw = seq.get("bw_bytes")
        if seq_bw:
            out["seq_read_gbps"] = round(seq_bw / 1e9, 3)

    if "error" in qd1:
        problems.append("qd1: " + qd1["error"])
    else:
        qd1_iops = qd1.get("iops")
        if qd1_iops is not None:
            out["rand_qd1_iops"] = round(qd1_iops, 1)
        qd1_bw = qd1.get("bw_bytes")
        if qd1_bw:
            out["rand_qd1_mbps"] = round(qd1_bw / 1e6, 2)
        out["rand_qd1_lat_us"] = qd1.get("lat_us_mean")

    if problems:
        out["error"] = " | ".join(problems)
    return out
@@ -0,0 +1,245 @@
1
+ """Profile schema — the contract every downstream component (planner, receipt) reads.
2
+
3
+ A `Profile` = `HardwareProfile` + optional `ModelProfile` + provenance, fully
4
+ JSON-serializable. Measurement fields are `Optional` and default to `None`: a value
5
+ of `None` means "not measured / unknown", and the planner MUST NOT treat it as zero
6
+ or assume a spec-sheet number (docker-knowledge wave-1, lane `hw-measurement`: honest
7
+ refusal depends on honest inputs; consumer cards are PCIe-bound and NVMe random-QD1 is
8
+ far below sequential, so guessing here silently corrupts every downstream plan).
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ from dataclasses import asdict, dataclass, field, fields, is_dataclass
14
+ from typing import Any, List, Optional
15
+
16
# Version string of this schema module.
# NOTE(review): presumably the value written to `Profile.schema_version` — confirm at the call site.
SCHEMA_VERSION = "0.1.0"
17
+
18
+
19
@dataclass
class GpuInfo:
    """Identity, VRAM and PCIe facts for one GPU.

    Only `name` is required; every other field defaults to None, which means
    "not measured / unknown" and must not be read as zero.
    """
    name: str
    vram_total_mib: Optional[int] = None
    vram_free_mib: Optional[int] = None
    vram_reserved_mib: Optional[int] = None   # driver-reserved (pynvml v2 only); v1 folds this into 'used'
    driver_version: Optional[str] = None
    cuda_version: Optional[str] = None
    compute_capability: Optional[str] = None  # e.g. "12.0" for sm_120 (desktop Blackwell)
    pcie_gen: Optional[int] = None
    pcie_width: Optional[int] = None          # lanes, e.g. 16
    vram_source: Optional[str] = None         # "pynvml-v2" | "pynvml-v1" | "nvidia-smi" (provenance)
31
+
32
+
33
@dataclass
class PlatformInfo:
    """Host/container environment the profile was taken in.

    `os` is required; the boolean flags default to False and the Optional fields to None
    (unknown).
    """
    os: str                                   # "windows" | "linux"
    in_container: bool = False
    wsl2: bool = False
    container_runtime: Optional[str] = None   # "docker" | None
    nvidia_runtime: Optional[bool] = None     # NVIDIA Container Toolkit wired in
    # The load-bearing positioning (docker-knowledge lane `container-runtime`):
    # CUDA UVM oversubscription is unavailable on windows/wsl2 -> explicit placement only.
    uvm_oversubscription: Optional[bool] = None
43
+
44
+
45
@dataclass
class BandwidthInfo:
    """Measured, never spec-sheet. None until benchmarked (hardened in docker-knowledge wave-2).

    Holds the PCIe and NVMe throughput figures plus how they were obtained; every field
    defaults to None.
    """
    pcie_h2d_gbps: Optional[float] = None            # achieved pinned (~50-55 on Gen5; never the 64 theoretical)
    pcie_d2h_gbps: Optional[float] = None            # measured separately — asymmetry is real
    nvme_seq_read_gbps: Optional[float] = None       # optimistic ceiling only
    nvme_rand_qd1_read_iops: Optional[float] = None  # the one a sequential assumption gets wrong
    nvme_rand_qd1_read_mbps: Optional[float] = None  # what cold-expert / KV-spill streaming math keys off
    method: Optional[str] = None                     # how it was measured (provenance summary)
    details: Optional[dict] = None                   # structured provenance: buffer sizes, fs, samples, flags
55
+
56
+
57
@dataclass
class MemoryInfo:
    """Host RAM capacity, pinnable-memory ceiling and CPU memory bandwidth.

    Every field defaults to None (not measured); the `*_method` fields record provenance.
    """
    ram_total_gib: Optional[float] = None
    ram_available_gib: Optional[float] = None
    pinnable_ceiling_gib: Optional[float] = None  # WSL2 limits this (docker-knowledge container-runtime)
    pinnable_method: Optional[str] = None         # how the ceiling was probed (provenance)
    pinnable_capped: Optional[bool] = None        # True => probe hit its max; ceiling is a lower bound
    cpu_mem_bw_gbps: Optional[float] = None       # measured CPU RAM bandwidth — the MoE-offload throughput input
    cpu_mem_bw_method: Optional[str] = None       # how it was measured (provenance)
66
+
67
+
68
@dataclass
class HardwareProfile:
    """Everything measured about the machine: GPU, platform, bandwidth and memory.

    `gpu` and `platform` are required. `bandwidth` and `memory` default to fresh, empty
    instances (all fields None) so each profile gets its own objects.
    """
    gpu: GpuInfo
    platform: PlatformInfo
    # The class itself is the factory — no wrapper lambda needed.
    bandwidth: BandwidthInfo = field(default_factory=BandwidthInfo)
    memory: MemoryInfo = field(default_factory=MemoryInfo)
74
+
75
+
76
@dataclass
class ExpertInfo:
    """Mixture-of-experts structure of a model; all defaults describe a non-MoE/unknown model."""
    is_moe: bool = False
    num_experts: Optional[int] = None
    experts_per_token: Optional[int] = None  # top-k
    shared_params: Optional[int] = None
    expert_params_each: Optional[int] = None
83
+
84
+
85
@dataclass
class ModelProfile:
    """Architecture and memory-growth facts for one model.

    Only `name` is required. Optional fields left at None mean "not determined".
    """
    name: str
    architecture: str = "unknown"             # "dense" | "moe" | "unknown"
    total_params: Optional[int] = None
    n_layers: Optional[int] = None
    n_kv_heads: Optional[int] = None
    head_dim: Optional[int] = None
    dtype_bytes: float = 2.0                  # fp16/bf16 default
    quant: Optional[str] = None               # "gguf-q4_k_m" | "gptq" | "awq" | "fp8" | None
    expert: ExpertInfo = field(default_factory=ExpertInfo)
    kv_bytes_per_token: Optional[int] = None  # closed-form; see model.kv_bytes_per_token()
    # Closed-form param split (model.analyze_config) — the planner's placement math keys off these.
    # `expert_params_total` is the part `--n-cpu-moe` can move to CPU; `non_expert_params` (attention,
    # router, embeddings, head, shared experts, dense layers) ALWAYS stays on the GPU.
    expert_params_total: Optional[int] = None
    non_expert_params: Optional[int] = None
    n_moe_layers: Optional[int] = None        # layers that actually carry routed experts

    def kv_bytes_at(self, context_tokens: int, batch: int = 1) -> Optional[int]:
        """Return KV-cache bytes for `context_tokens` tokens across `batch` sequences.

        Linear in context (docker-knowledge throughput-prediction). Returns None when the
        per-token figure is unknown.
        """
        per_token = self.kv_bytes_per_token
        return None if per_token is None else per_token * context_tokens * batch
109
+
110
+
111
@dataclass
class Profile:
    """Top-level, JSON-serializable record: hardware, optional model, and provenance notes."""
    schema_version: str
    created: str                              # ISO date, passed in (workflows/runners have no clock)
    hardware: HardwareProfile
    model: Optional[ModelProfile] = None
    notes: List[str] = field(default_factory=list)

    # --- serialization -------------------------------------------------------
    def to_dict(self) -> dict:
        """Return the profile, including nested dataclasses, as plain dicts/lists."""
        return asdict(self)

    def to_json(self, indent: int = 2) -> str:
        """Serialize `to_dict()` to JSON text; non-ASCII characters are kept as-is."""
        return json.dumps(self.to_dict(), indent=indent, ensure_ascii=False)

    @classmethod
    def from_dict(cls, d: dict) -> "Profile":
        """Rebuild a Profile from a plain dict by delegating to `_build`."""
        return _build(cls, d)

    @classmethod
    def from_json(cls, s: str) -> "Profile":
        """Parse JSON text and rebuild a Profile from it."""
        return cls.from_dict(json.loads(s))
133
+
134
+
135
@dataclass
class PlacementPlan:
    """The planner's output: how to place an MoE model across VRAM/RAM for a runtime, with a
    predicted memory map + throughput and an honest ship/refuse verdict (>1 tok/s floor)."""
    fits: bool
    verdict: str                                       # "ship" | "refuse"
    runtime: str = "llama.cpp"
    n_cpu_moe: Optional[int] = None                    # the --n-cpu-moe value (MoE layers whose experts -> CPU RAM)
    n_moe_layers: Optional[int] = None
    llama_flags: Optional[str] = None                  # the exact flag string to launch with
    vram_budget_mib: Optional[float] = None
    vram_used_mib: Optional[float] = None
    ram_used_mib: Optional[float] = None               # CPU-resident expert bytes (regular host RAM)
    predicted_decode_tok_s: Optional[float] = None     # the CALIBRATED forecast (== ceiling when uncalibrated)
    ceiling_decode_tok_s: Optional[float] = None       # the roofline upper bound (real decode is a fraction of it)
    predicted_band_low_tok_s: Optional[float] = None   # calibrated band low (None when uncalibrated)
    predicted_band_high_tok_s: Optional[float] = None  # calibrated band high
    throughput_basis: Optional[str] = None             # "in-VRAM (±10%)" | "cpu-offload estimate — confirmed by receipt"
    calibration_basis: Optional[str] = None            # how the forecast was derived (calibrated vs raw ceiling)
    calibration_n_samples: Optional[int] = None        # receipts informing the forecast (None/0 = uncalibrated)
    floor_tok_s: float = 1.0
    message: Optional[str] = None                      # plan summary / contrastive refusal frame
    assumptions: Optional[dict] = None                 # cpu_mem_bw_gbps, vram_bw_gbps, bpw, ctx, overhead_mib

    def to_dict(self) -> dict:
        """Return the plan as a plain dict."""
        return asdict(self)

    def to_json(self, indent: int = 2) -> str:
        """Serialize `to_dict()` to JSON text; non-ASCII characters are kept as-is."""
        return json.dumps(self.to_dict(), indent=indent, ensure_ascii=False)

    @classmethod
    def from_dict(cls, d: dict) -> "PlacementPlan":
        """Rebuild a PlacementPlan from a plain dict by delegating to `_build`."""
        return _build(cls, d)

    @classmethod
    def from_json(cls, s: str) -> "PlacementPlan":
        """Parse JSON text and rebuild a PlacementPlan from it."""
        return cls.from_dict(json.loads(s))
172
+
173
+
174
@dataclass
class Receipt:
    """The measured proof — a DIFFERENT mechanism (measurement) verifying the planner's forecast.

    Every field has a default, so a Receipt can be built incrementally. Like the other
    serializable records in this module it round-trips through dict/JSON.
    """
    runtime: str = "llama.cpp"
    n_cpu_moe: Optional[int] = None
    measured_decode_tok_s: Optional[float] = None
    measured_prefill_tok_s: Optional[float] = None
    measured_vram_used_mib: Optional[float] = None
    predicted_decode_tok_s: Optional[float] = None   # the plan's CALIBRATED forecast
    ceiling_decode_tok_s: Optional[float] = None     # the plan's roofline upper bound
    decode_error_pct: Optional[float] = None         # 100*(measured-predicted)/predicted vs the calibrated forecast
    realized_efficiency_pct: Optional[float] = None  # 100*measured/ceiling — the calibration seed
    within_band: Optional[bool] = None               # did measured land inside the calibrated band? (the proof)
    cleared_floor: Optional[bool] = None
    # Optional per-expert routing de-risk (set when the receipt is built with --trace; activation.py gate):
    routing_cache_helps: Optional[bool] = None             # would a hot-expert cache help THIS workload?
    routing_hot_frac_for_coverage: Optional[float] = None  # fraction of experts for the routing-coverage target
    routing_concentration: Optional[float] = None          # 1 - normalized entropy (0 uniform, 1 peaked)
    # Optional safety envelope (set when the receipt is built with --peaks from a supervised run; watchdog.py):
    peak_gpu_power_pct: Optional[float] = None       # worst GPU power draw % observed during the run
    peak_gpu_temp_c: Optional[float] = None
    peak_gpu_vram_used_mib: Optional[float] = None
    peak_host_mem_pct: Optional[float] = None        # THE incident metric — worst host memory % during the run
    min_host_avail_mib: Optional[float] = None       # lowest free host RAM seen
    safety_samples: Optional[int] = None             # how many watchdog polls backed the envelope
    stayed_within_envelope: Optional[bool] = None    # True => no watchdog abort fired across the run
    method: Optional[str] = None                     # how it was measured (provenance)
    notes: List[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the receipt as a plain dict."""
        return asdict(self)

    def to_json(self, indent: int = 2) -> str:
        """Serialize `to_dict()` to JSON text; non-ASCII characters are kept as-is."""
        return json.dumps(self.to_dict(), indent=indent, ensure_ascii=False)

    @classmethod
    def from_dict(cls, d: dict) -> "Receipt":
        """Rebuild a Receipt from a plain dict by delegating to `_build`.

        Mirrors the constructors on the other serializable records so a stored receipt
        can be loaded back.
        """
        return _build(cls, d)

    @classmethod
    def from_json(cls, s: str) -> "Receipt":
        """Parse JSON text and rebuild a Receipt from it."""
        return cls.from_dict(json.loads(s))
208
+
209
+
210
def _build(dc_type: Any, value: Any) -> Any:
    """Rebuild a dataclass instance (nested ones included) from plain dicts.

    Tolerant by design: keys the dataclass does not declare are skipped and fields missing
    from the input keep their defaults, so an older profile.json still loads against a
    newer schema. A None value, or a `dc_type` that is not a dataclass, is returned as-is.
    """
    if value is None or not is_dataclass(dc_type):
        return value

    declared = {f.name: f.type for f in fields(dc_type)}
    init_args = {}
    for key, raw in value.items():
        if key not in declared:
            continue
        # nested dataclass fields are looked up by the class name found in the annotation
        nested_type = _GLOBALS.get(_strip_optional(declared[key]))
        if nested_type and isinstance(raw, dict):
            init_args[key] = _build(nested_type, raw)
        else:
            init_args[key] = raw
    return dc_type(**init_args)
229
+
230
+
231
def _strip_optional(t: Any) -> str:
    """Best-effort: reduce a field annotation to the bare dataclass name it mentions.

    String annotations are used directly; other objects contribute their `__name__` (or
    their str() when they have none). The first `_GLOBALS` key contained in that text is
    returned; when none matches, the text itself is returned.
    """
    label = t if isinstance(t, str) else getattr(t, "__name__", str(t))
    # annotations may arrive as "Optional[ExpertInfo]" / "ExpertInfo" depending on import style
    return next((known for known in _GLOBALS if known in label), label)
239
+
240
+
241
# Class-name -> dataclass lookup used by `_build` / `_strip_optional` to rebuild nested fields.
# Only classes that appear as a field type of another dataclass need an entry here.
_GLOBALS = {
    "GpuInfo": GpuInfo, "PlatformInfo": PlatformInfo, "BandwidthInfo": BandwidthInfo,
    "MemoryInfo": MemoryInfo, "HardwareProfile": HardwareProfile, "ExpertInfo": ExpertInfo,
    "ModelProfile": ModelProfile,
}