gpu-container 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,101 @@
1
+ """`gpu-container-plan` — turn a profile (+ model) into a llama.cpp `--n-cpu-moe` placement plan.
2
+
3
+ gpu-container-plan --profile profile.json --model-config qwen3.json --quant gguf-q4_k_m --ctx 4096
4
+
5
+ Exit code is verdict-coded (ANDON): 0 = ship, 3 = refuse. The profile.json comes from
6
+ `gpu-container-profile` (run in-container for honest VRAM/CPU-bandwidth inputs).
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import json
12
+ import sys
13
+ from typing import List, Optional
14
+
15
+ from ..errors import GpuContainerError, guard
16
+ from ..profiler import model as model_mod
17
+ from ..profiler.schema import Profile
18
+ from .calibration import CalibrationModel, CalibrationStore
19
+ from .placement import plan_llama_cpp
20
+
21
+
22
+ def _main(argv: Optional[List[str]] = None) -> int:
23
+ for _stream in (sys.stdout, sys.stderr):
24
+ try:
25
+ _stream.reconfigure(encoding="utf-8")
26
+ except (AttributeError, ValueError):
27
+ pass
28
+
29
+ ap = argparse.ArgumentParser(
30
+ prog="gpu-container-plan",
31
+ description="Plan an MoE placement (llama.cpp --n-cpu-moe) from a rig+model profile.",
32
+ )
33
+ ap.add_argument("--debug", action="store_true", help="show the full traceback on an unexpected error")
34
+ ap.add_argument("--profile", required=True, help="profile.json from gpu-container-profile")
35
+ ap.add_argument("--model-config", help="HF config.json to (re)profile the model side into the plan")
36
+ ap.add_argument("--model-name", help="override the model name")
37
+ ap.add_argument("--quant", help="quant tag, e.g. gguf-q4_k_m (drives bytes/weight + footprint)")
38
+ ap.add_argument("--ctx", type=int, default=4096, help="context length for the KV-cache budget")
39
+ ap.add_argument("--batch", type=int, default=1)
40
+ ap.add_argument("--cpu-bw", type=float, help="override CPU RAM bandwidth (GB/s)")
41
+ ap.add_argument("--non-expert-bpw", type=float,
42
+ help="bytes/weight for always-resident weights (auto: f16 for mxfp4, else the quant bpw)")
43
+ ap.add_argument("--floor", type=float, default=1.0, help="refuse below this decode tok/s")
44
+ ap.add_argument("--hf", help="model ref for the launch command, e.g. unsloth/Qwen3-30B-A3B-GGUF:Q4_K_M")
45
+ ap.add_argument("--calibration-dir", help="extra calibration receipts to fold in (atop the bundled seed)")
46
+ ap.add_argument("--no-calibration", action="store_true",
47
+ help="forecast the raw roofline ceiling only (skip the calibrated band)")
48
+ ap.add_argument("-o", "--out", help="write the plan JSON here (default: stdout)")
49
+ args = ap.parse_args(argv)
50
+
51
+ try:
52
+ with open(args.profile, "r", encoding="utf-8") as f:
53
+ prof = Profile.from_json(f.read())
54
+ except FileNotFoundError:
55
+ raise GpuContainerError("IO_PROFILE_NOT_FOUND", f"profile not found: {args.profile}",
56
+ hint="run `gpu-container-profile -o profile.json` first (in-container)")
57
+ except (ValueError, OSError) as e:
58
+ raise GpuContainerError("INPUT_BAD_PROFILE", f"could not read {args.profile}",
59
+ hint="expected a profile.json from gpu-container-profile", cause=str(e))
60
+
61
+ if args.model_config:
62
+ try:
63
+ with open(args.model_config, "r", encoding="utf-8") as f:
64
+ cfg = json.load(f)
65
+ except (OSError, ValueError) as e:
66
+ raise GpuContainerError("INPUT_BAD_MODEL_CONFIG", f"could not read {args.model_config}",
67
+ hint="expected a HuggingFace config.json", cause=str(e))
68
+ prof.model = model_mod.analyze_config(cfg, name=args.model_name, quant=args.quant or "gguf-q4_k_m")
69
+ elif args.quant and prof.model is not None:
70
+ prof.model.quant = args.quant
71
+
72
+ # Calibration: bundled seed + any extra receipts, unless disabled. With no data for the shape's
73
+ # regime the planner falls back to the raw ceiling on its own.
74
+ calibration = None
75
+ if not args.no_calibration:
76
+ extra = CalibrationStore(args.calibration_dir).points() if args.calibration_dir else None
77
+ calibration = CalibrationModel.from_seed(extra=extra)
78
+
79
+ plan = plan_llama_cpp(
80
+ prof, ctx_len=args.ctx, batch=args.batch,
81
+ cpu_mem_bw_gbps=args.cpu_bw, non_expert_bpw=args.non_expert_bpw,
82
+ floor_tok_s=args.floor, model_ref=args.hf, calibration=calibration,
83
+ )
84
+
85
+ js = plan.to_json()
86
+ if args.out:
87
+ with open(args.out, "w", encoding="utf-8") as f:
88
+ f.write(js + "\n")
89
+ print(f"wrote {args.out}", file=sys.stderr)
90
+ else:
91
+ print(js)
92
+ print(plan.message, file=sys.stderr)
93
+ return 0 if plan.verdict == "ship" else 3
94
+
95
+
96
def main(argv: Optional[List[str]] = None) -> int:
    """Console entry point: run `_main` through `guard` and return whatever `guard` returns.

    NOTE(review): `guard` is imported from `..errors` and is not visible in this module —
    presumably it converts a GpuContainerError into an exit code and honours `--debug`;
    confirm there.
    """
    return guard(_main, argv)


# Executed as a script: exit the process with main()'s return value as the status.
if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,120 @@
1
+ """`gpu-container-concentration` — the per-expert-cache de-risk gate, as a command.
2
+
3
+ Given an activation trace (which experts fired, per layer), score routing CONCENTRATION and answer
4
+ the prior question for the per-expert lane: would a hot-expert VRAM cache (the llama.cpp #20757 lane)
5
+ actually help, or is routing too uniform to bother? Backs ADR-0001; logic in `activation.py`.
6
+
7
+ gpu-container-concentration --trace trace.json
8
+ gpu-container-concentration --imatrix imatrix.gguf --model-name Qwen3-30B-A3B # needs the `gguf` pkg
9
+
10
+ `--imatrix` reads a `llama-imatrix` output directly (per-layer `ffn_down_exps.weight.counts`); the
11
+ `gguf` package is an OPTIONAL dependency — only that path needs it. `--trace` keeps the core dep-free.
12
+
13
+ Exit code (ANDON-style, scriptable):
14
+ 0 = analyzed; a per-expert cache is NOT justified (routing too uniform — the common 'hold' outcome)
15
+ 5 = analyzed; routing concentrates enough that a cache could help (worth weighing #20757)
16
+ 2 = usage / input error
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import sys
22
+ from typing import List, Optional
23
+
24
+ from ..errors import GpuContainerError, guard
25
+ from .activation import ActivationTrace, analyze_concentration, load_trace
26
+
27
+
28
+ def _trace_from_imatrix(path: str, model_name: str, topk: int) -> ActivationTrace:
29
+ """Build an ActivationTrace from a llama-imatrix `imatrix.gguf` (per-expert `.counts`).
30
+
31
+ Raises ValueError on a missing `gguf` package or a non-MoE / unexpected imatrix (the caller maps
32
+ that to exit 2)."""
33
+ try:
34
+ import gguf # optional dependency — only the --imatrix path needs it
35
+ except ImportError:
36
+ raise ValueError("--imatrix needs the 'gguf' package (pip install gguf); "
37
+ "or extract a trace.json yourself and pass --trace.")
38
+ reader = gguf.GGUFReader(path)
39
+ counts = {}
40
+ for t in reader.tensors:
41
+ nm = t.name.strip()
42
+ if nm.endswith("ffn_down_exps.weight.counts") and nm.startswith("blk."):
43
+ layer = int(nm.split(".")[1])
44
+ counts[layer] = [int(round(float(x))) for x in list(t.data.flatten())]
45
+ if not counts:
46
+ raise ValueError("no per-expert counts (ffn_down_exps.weight.counts) in this imatrix — "
47
+ "is it a llama-imatrix .gguf for an MoE model?")
48
+ E = len(next(iter(counts.values())))
49
+ layers = [{"layer_index": L, "expert_counts": counts[L]} for L in sorted(counts)]
50
+ n_tokens = (sum(layers[0]["expert_counts"]) // topk) if topk else 0
51
+ return ActivationTrace.from_dict({
52
+ "model": model_name, "num_experts": E, "experts_per_token": topk,
53
+ "n_tokens": n_tokens, "layers": layers, "source": f"llama-imatrix per-expert counts: {path}",
54
+ })
55
+
56
+
57
+ def _main(argv: Optional[List[str]] = None) -> int:
58
+ for _stream in (sys.stdout, sys.stderr):
59
+ try:
60
+ _stream.reconfigure(encoding="utf-8")
61
+ except (AttributeError, ValueError):
62
+ pass
63
+
64
+ ap = argparse.ArgumentParser(
65
+ prog="gpu-container-concentration",
66
+ description="Per-expert-cache de-risk gate: does this model's routing concentrate enough to cache?",
67
+ )
68
+ ap.add_argument("--debug", action="store_true", help="show the full traceback on an unexpected error")
69
+ src = ap.add_mutually_exclusive_group(required=True)
70
+ src.add_argument("--trace", help="ActivationTrace JSON (L×E per-expert counts)")
71
+ src.add_argument("--imatrix", help="llama-imatrix imatrix.gguf (extract per-expert counts; needs `gguf`)")
72
+ ap.add_argument("--model-name", default="model", help="model name (for the --imatrix path / the report)")
73
+ ap.add_argument("--topk", type=int, default=8, help="experts/token, for the --imatrix n_tokens estimate")
74
+ ap.add_argument("--coverage", type=float, default=0.90, help="routing-mass coverage target (default 0.90)")
75
+ ap.add_argument("--threshold", type=float, default=0.50,
76
+ help="cache_helps if < this fraction of experts cover the target (default 0.50)")
77
+ ap.add_argument("-o", "--out", help="write the report JSON here (default: stdout)")
78
+ args = ap.parse_args(argv)
79
+
80
+ if args.imatrix:
81
+ try:
82
+ trace = _trace_from_imatrix(args.imatrix, args.model_name, args.topk)
83
+ except (ValueError, OSError) as e:
84
+ raise GpuContainerError("INPUT_BAD_IMATRIX", str(e),
85
+ hint="pass --trace with an L×E counts JSON instead, "
86
+ "or `pip install gguf` for the --imatrix path")
87
+ else:
88
+ trace = load_trace(args.trace)
89
+ if trace is None:
90
+ raise GpuContainerError("IO_TRACE_UNREADABLE", f"could not load a trace from {args.trace}",
91
+ hint="expected an ActivationTrace JSON "
92
+ "(model, num_experts, experts_per_token, layers[])")
93
+
94
+ rep = analyze_concentration(trace, coverage_target=args.coverage, cache_helps_threshold=args.threshold)
95
+
96
+ js = rep.to_json()
97
+ if args.out:
98
+ with open(args.out, "w", encoding="utf-8") as f:
99
+ f.write(js + "\n")
100
+ print(f"wrote {args.out}", file=sys.stderr)
101
+ else:
102
+ print(js)
103
+
104
+ need = rep.hot_frac_for_coverage * rep.num_experts
105
+ verdict = "CACHE COULD HELP" if rep.cache_helps else "cache NOT justified"
106
+ print(f"{rep.model}: {verdict} — {need:.0f}/{rep.num_experts} experts ({rep.hot_frac_for_coverage:.0%}) "
107
+ f"resident for {rep.coverage_target:.0%} routing coverage; concentration {rep.concentration_score:.2f}, "
108
+ f"top expert {rep.top1_share:.1%} ({rep.n_layers} layers, ~{rep.n_tokens} tok).", file=sys.stderr)
109
+ for n in rep.notes:
110
+ print(f" note: {n}", file=sys.stderr)
111
+
112
+ return 5 if rep.cache_helps else 0
113
+
114
+
115
def main(argv: Optional[List[str]] = None) -> int:
    """Console entry point: run `_main` through `guard` and return whatever `guard` returns.

    NOTE(review): `guard` is imported from `..errors` and is not visible in this module —
    presumably it maps a GpuContainerError to the "usage / input error" exit code; confirm there.
    """
    return guard(_main, argv)


# Executed as a script: exit the process with main()'s return value as the status.
if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,198 @@
1
+ """llama.cpp `--n-cpu-moe` placement planner.
2
+
3
+ Closed-form, deterministic, reversible (a plan is a file). The math:
4
+
5
+ VRAM_used(N) = non_expert_bytes + (per_moe_layer_expert_bytes · (L_moe − N)) + KV_bytes + overhead
6
+
7
+ `--n-cpu-moe N` moves the first N MoE layers' experts to CPU RAM, so raising N lowers VRAM use
8
+ (more experts off the GPU) and lowers throughput (CPU computes them at RAM bandwidth, far below
9
+ the GPU's). We pick the SMALLEST N that fits — maximal GPU residency, maximal speed — and refuse
10
+ if even N = L_moe (all experts on CPU) cannot fit the always-resident footprint.
11
+
12
+ Decode is bandwidth-bound at batch 1 (feasibility #10): per token the GPU reads its resident
13
+ weights + active experts at VRAM bandwidth while the CPU reads its N layers' active experts at
14
+ RAM bandwidth. Throughput ≈ 1 / (t_gpu + t_cpu). This is an ESTIMATE for N>0 (heavy-offload
15
+ prediction misses up to ~2-3× — feasibility #11), labelled and confirmed by the receipt; the
16
+ N=0 in-VRAM case is the ±10% regime.
17
+ """
18
+ from __future__ import annotations
19
+
20
+ from typing import Optional
21
+
22
+ from ..profiler import model as model_mod
23
+ from ..profiler.schema import PlacementPlan, Profile
24
+ from .calibration import CalibrationModel
25
+
26
# Unit conversions used by the planner: memory sizes are reported in binary MiB, while bandwidths
# are given in decimal GB/s and multiplied by _GB to obtain bytes per second.
_MIB = 1024 * 1024
_GB = 1e9

# Labelled defaults when the profile hasn't measured them (the planner flags the assumption).
DEFAULT_CPU_BW_GBPS = 80.0  # DDR5 dual-channel, conservative (measure via the profiler to override)
DEFAULT_VRAM_BW_GBPS = 1790.0  # RTX 5090 GDDR7 512-bit (~1.79 TB/s)
32
+
33
+
34
def plan_llama_cpp(
    profile: Profile,
    ctx_len: int = 4096,
    batch: int = 1,
    cpu_mem_bw_gbps: Optional[float] = None,
    vram_bw_gbps: Optional[float] = None,
    non_expert_bpw: Optional[float] = None,
    overhead_mib: float = 1024.0,
    floor_tok_s: float = 1.0,
    model_ref: Optional[str] = None,
    force_n_cpu_moe: Optional[int] = None,
    calibration: Optional[CalibrationModel] = None,
) -> PlacementPlan:
    """Plan an MoE placement for llama.cpp `--n-cpu-moe`. Honest verdict, never raises.

    Every outcome — including "cannot plan" — is returned as a PlacementPlan with a verdict and a
    message, rather than raised.

    Args:
        profile: rig + model profile; `profile.hardware` and `profile.model` are read.
        ctx_len: context length; sizes the KV cache and is emitted as `-c` in the launch flags.
        batch: batch size passed to the KV-cache size computation.
        cpu_mem_bw_gbps: CPU RAM bandwidth override (GB/s). Falls back to the profile's
            `hardware.memory.cpu_mem_bw_gbps`, then to DEFAULT_CPU_BW_GBPS.
        vram_bw_gbps: VRAM bandwidth override (GB/s); falls back to DEFAULT_VRAM_BW_GBPS.
        non_expert_bpw: bytes/weight for the always-resident weights; by default derived from the
            model's quant.
        overhead_mib: fixed VRAM allowance added on top of weights + KV cache.
        floor_tok_s: refuse when even the roofline ceiling is below this decode rate.
        model_ref: model reference for the launch line (`-hf <ref>`); a placeholder otherwise.
        force_n_cpu_moe: evaluate this N (clamped to [0, n_moe_layers]) instead of the smallest
            N that fits.
        calibration: optional calibration model that scales the ceiling into a forecast + band.

    Returns:
        A PlacementPlan whose `verdict` is "ship" or "refuse".
    """
    hw, model = profile.hardware, profile.model

    if model is None or not model.expert.is_moe:
        return PlacementPlan(fits=False, verdict="refuse",
                             message="Not an MoE model (or no model profiled) — the --n-cpu-moe lane "
                                     "is MoE-only. Dense offload is a separate (refusal-prone) lane.")
    if model.expert_params_total is None or model.non_expert_params is None or not model.n_moe_layers:
        return PlacementPlan(fits=False, verdict="refuse",
                             message="model profile lacks the closed-form param split — re-run the "
                                     "profiler with --model-config <config.json>.")
    # Prefer the measured free VRAM; fall back to the card's total when free is missing (or 0).
    vram_budget = hw.gpu.vram_free_mib or hw.gpu.vram_total_mib
    if not vram_budget:
        return PlacementPlan(fits=False, verdict="refuse",
                             message="VRAM unknown (profiler returned None) — cannot place honestly.")

    bpw = model_mod.bytes_per_weight(model.quant, model.dtype_bytes)
    cpu_bw = (cpu_mem_bw_gbps or getattr(hw.memory, "cpu_mem_bw_gbps", None) or DEFAULT_CPU_BW_GBPS) * _GB
    vram_bw = (vram_bw_gbps or DEFAULT_VRAM_BW_GBPS) * _GB
    # False means the default CPU bandwidth was assumed; recorded so the plan labels the assumption.
    cpu_bw_measured = bool(cpu_mem_bw_gbps or getattr(hw.memory, "cpu_mem_bw_gbps", None))

    # Non-expert (always-resident) weights may carry a heavier precision than the experts: MXFP4
    # quantizes experts only, leaving attention/embeddings near f16. Budget the GPU floor at that
    # precision so the plan doesn't under-count VRAM (model.non_expert_bytes_per_weight default;
    # `non_expert_bpw` overrides). Whole-model quants (Q4_K_M, ...) resolve to the same bpw.
    ne_bpw = (non_expert_bpw if non_expert_bpw is not None
              else model_mod.non_expert_bytes_per_weight(model.quant, model.dtype_bytes))
    Lm = model.n_moe_layers
    expert_total_bytes = model.expert_params_total * bpw
    non_expert_bytes = model.non_expert_params * ne_bpw
    per_layer_expert_bytes = expert_total_bytes / Lm
    kv_bytes = model.kv_bytes_at(ctx_len, batch) or 0

    def vram_used_mib(n: int) -> float:
        """VRAM (MiB) needed when the first `n` MoE layers' experts live in CPU RAM."""
        gpu_expert_bytes = per_layer_expert_bytes * (Lm - n)
        return (non_expert_bytes + gpu_expert_bytes + kv_bytes) / _MIB + overhead_mib

    assumptions = {
        "cpu_mem_bw_gbps": round(cpu_bw / _GB, 1), "cpu_bw_measured": cpu_bw_measured,
        "vram_bw_gbps": round(vram_bw / _GB, 1), "bytes_per_weight": round(bpw, 3),
        "non_expert_bpw": round(ne_bpw, 3),
        "ctx_len": ctx_len, "batch": batch, "overhead_mib": overhead_mib,
        "kv_dtype": "f16", "model_total_b": model.total_params,
        "forced_n": force_n_cpu_moe,
    }

    if force_n_cpu_moe is not None:
        # explore a specific N (what-if / receipt scoring) instead of the auto-minimal one
        N = max(0, min(int(force_n_cpu_moe), Lm))
        forced_vram = vram_used_mib(N)
        if forced_vram > vram_budget:
            return PlacementPlan(
                fits=False, verdict="refuse", n_cpu_moe=N, n_moe_layers=Lm,
                vram_budget_mib=round(vram_budget, 1), vram_used_mib=round(forced_vram, 1),
                assumptions=assumptions, floor_tok_s=floor_tok_s,
                message=f"REFUSE: forced --n-cpu-moe {N} still needs ~{forced_vram:.0f} MiB "
                        f"> {vram_budget:.0f} MiB free VRAM.")
    else:
        # smallest N in [0, Lm] that fits — maximal GPU residency, maximal speed
        n_fit = next((n for n in range(0, Lm + 1) if vram_used_mib(n) <= vram_budget), None)
        if n_fit is None:
            floor_vram = vram_used_mib(Lm)
            return PlacementPlan(
                fits=False, verdict="refuse", n_cpu_moe=Lm, n_moe_layers=Lm,
                vram_budget_mib=round(vram_budget, 1), vram_used_mib=round(floor_vram, 1),
                assumptions=assumptions, floor_tok_s=floor_tok_s,
                message=(f"REFUSE: even with ALL experts on CPU (--cpu-moe), the always-resident footprint "
                         f"(attention + router + embeddings + output head + {ctx_len}-tok KV) needs "
                         f"~{floor_vram:.0f} MiB > {vram_budget:.0f} MiB free VRAM. You probably expected "
                         f"{model.name} to fit; it cannot, because the non-expert weights alone exceed VRAM. "
                         f"Options: shorter --ctx, q8_0 KV cache, a smaller/more-quantized model, or more VRAM."),
            )
        N = n_fit
    # Host RAM needed for the offloaded experts; an unknown RAM size (0) is treated as "fits".
    ram_used_mib = per_layer_expert_bytes * N / _MIB
    ram_avail_mib = (hw.memory.ram_available_gib or hw.memory.ram_total_gib or 0) * 1024
    ram_ok = (ram_used_mib <= ram_avail_mib) if ram_avail_mib else True

    # bandwidth-bound decode estimate
    topk = model.expert.experts_per_token or 0
    active_each = (model.expert.expert_params_each or 0) * bpw
    gpu_active_expert = active_each * topk * (Lm - N)
    cpu_active_expert = active_each * topk * N
    # per-token GPU read ≈ all non-expert weights (attention/router/head) + GPU-resident active
    # experts + the KV cache (attention reads it every decode step). CPU reads its active experts.
    # This is a ROOFLINE CEILING (peak bandwidth, no kernel/launch/attention-compute overhead): a
    # true UPPER BOUND on decode tok/s. Real decode runs at a fraction of it; the receipt measures
    # the actual efficiency and closes the gap. Refusal is conservative — it fires only when even
    # this optimistic ceiling cannot clear the floor.
    t_gpu = (non_expert_bytes + gpu_active_expert + kv_bytes) / vram_bw
    t_cpu = cpu_active_expert / cpu_bw
    # None when nothing is read per token (degenerate profile): there is no roofline to state.
    ceiling = (1.0 / (t_gpu + t_cpu)) if (t_gpu + t_cpu) > 0 else None
    basis = ("roofline ceiling, in-VRAM (active-weight bandwidth; real decode is a fraction of this -- "
             "small-active MoE is largely overhead-bound -- confirmed by receipt)" if N == 0
             else "roofline ceiling, CPU-offload (real well below; heavy-offload can miss 2-3x -- confirmed by receipt)")

    # Calibrated forecast (opt-in): scale the ceiling by the measured realized efficiency for this
    # regime (calibration.py). With no calibration data the forecast IS the ceiling -- the honest
    # uncalibrated path. The ceiling is retained as the upper bound AND the refusal floor below.
    calibrated, band_low, band_high, calib_n = ceiling, None, None, 0
    calib_basis = ("uncalibrated: the roofline ceiling is the upper bound -- real decode is a fraction; "
                   "run a receipt (gpu-container-receipt) to calibrate this shape")
    if calibration is not None and ceiling is not None:
        est = calibration.estimate("in_vram" if N == 0 else "offload", (N / Lm) if Lm else 0.0)
        if est is not None:
            calibrated = ceiling * est.efficiency
            band_low, band_high = ceiling * est.low, ceiling * est.high
            calib_n, calib_basis = est.n_samples, est.basis

    # Refusal floor keys on the CEILING (conservative ANDON): refuse only when even the optimistic
    # upper bound cannot clear the floor -- never refuse a model that might be usable.
    below_floor = ceiling is not None and ceiling < floor_tok_s
    verdict = "refuse" if (below_floor or not ram_ok) else "ship"
    vu = vram_used_mib(N)

    # -fa on (not bare -fa): current llama.cpp made flash-attn a tri-state (on|off|auto) and rejects
    # a value-less -fa. Confirmed against the ghcr.io/ggml-org/llama.cpp:full-cuda help (2026-06-04).
    flags = f"-ngl 99 --n-cpu-moe {N} -c {ctx_len} -fa on"
    cmd_model = f"-hf {model_ref}" if model_ref else "-m <model.gguf>"

    if verdict == "refuse" and below_floor:
        msg = (f"REFUSE: even the roofline CEILING for the best plan (--n-cpu-moe {N}) is "
               f"~{ceiling:.2f} tok/s < {floor_tok_s} floor — real decode is lower still. "
               f"You probably expected {model.name} to be usable; with {N}/{Lm} expert layers on CPU "
               f"(RAM bandwidth ~{cpu_bw/_GB:.0f} GB/s) it cannot clear the floor. "
               f"Options: a smaller model, fewer active experts, or more VRAM to keep N low.")
    elif verdict == "refuse":
        msg = (f"REFUSE: plan needs {ram_used_mib/1024:.1f} GiB CPU RAM for {N} expert layers but only "
               f"~{ram_avail_mib/1024:.1f} GiB available.")
    else:
        tier = "fully in VRAM" if N == 0 else f"{N}/{Lm} expert layers on CPU RAM"
        if calib_n:
            forecast = (f"~{calibrated:.0f} tok/s (calibrated, band [{band_low:.0f}, {band_high:.0f}]; "
                        f"roofline ceiling {ceiling:.0f}, from {calib_n} receipt(s))")
        elif ceiling is not None:
            forecast = f"<= {ceiling:.0f} tok/s (roofline ceiling — real is a fraction; run a receipt to calibrate)"
        else:
            # No ceiling could be computed: say so instead of formatting None (which would raise).
            forecast = "unknown (no roofline ceiling could be computed; run a receipt to measure)"
        msg = (f"SHIP: {model.name} {tier}. Decode {forecast}. "
               f"VRAM ~{vu:.0f}/{vram_budget:.0f} MiB, CPU-expert RAM ~{ram_used_mib/1024:.1f} GiB. "
               f"Launch: llama-cli {cmd_model} {flags}")
        if calib_n and calibrated < floor_tok_s <= ceiling:
            msg += (f" — borderline: the calibrated forecast (~{calibrated:.1f}) dips below the {floor_tok_s} "
                    f"tok/s floor though the ceiling clears it; the receipt will settle it.")

    # `fits` here records that the chosen N fits VRAM; the verdict may still be "refuse" (floor/RAM).
    return PlacementPlan(
        fits=True, verdict=verdict, n_cpu_moe=N, n_moe_layers=Lm, llama_flags=flags,
        vram_budget_mib=round(vram_budget, 1), vram_used_mib=round(vu, 1),
        ram_used_mib=round(ram_used_mib, 1),
        predicted_decode_tok_s=round(calibrated, 2) if calibrated else None,
        ceiling_decode_tok_s=round(ceiling, 2) if ceiling else None,
        predicted_band_low_tok_s=round(band_low, 2) if band_low else None,
        predicted_band_high_tok_s=round(band_high, 2) if band_high else None,
        throughput_basis=basis, calibration_basis=calib_basis, calibration_n_samples=calib_n,
        floor_tok_s=floor_tok_s, message=msg, assumptions=assumptions,
    )
@@ -0,0 +1,155 @@
1
+ """The receipt — a DIFFERENT mechanism (measurement) verifying the planner's forecast.
2
+
3
+ llama-bench emits per-test rows (`pp###` = prefill, `tg###` = token-generation/decode) with an
4
+ `avg_ts` tokens/sec. We parse that, pair it with the plan's predicted CEILING, and record the
5
+ realized efficiency (measured ÷ ceiling) + whether the >1 tok/s floor actually cleared. That
6
+ efficiency is the calibration seed that closes the static-prediction gap (the architecture's loop).
7
+ The model never grades its own forecast: the generator is the planner's closed form; the verifier
8
+ is a real run on the GPU.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ from typing import List, Optional
14
+
15
+ from ..profiler.schema import PlacementPlan, Receipt
16
+ from .activation import ConcentrationReport
17
+ from .calibration import CalibrationPoint
18
+
19
+
20
def _decode_bench_rows(stdout: str) -> list:
    """Return the JSON array embedded in `stdout`, or [] when none can be decoded."""
    s, e = stdout.find("["), stdout.rfind("]")
    if s < 0 or e <= s:
        return []
    # Fast path: the outermost [...] span is the array itself.
    try:
        rows = json.loads(stdout[s:e + 1])
    except ValueError:
        rows = None
    if isinstance(rows, list):
        return rows
    # Fallback: brackets in surrounding log lines (e.g. "[warn] ...") spoiled the outermost span.
    # Try each "[" in turn and accept the first array that actually holds row objects.
    decoder = json.JSONDecoder()
    pos = s
    while pos >= 0:
        try:
            candidate, _ = decoder.raw_decode(stdout, pos)
        except ValueError:
            candidate = None
        if isinstance(candidate, list) and any(isinstance(r, dict) for r in candidate):
            return candidate
        pos = stdout.find("[", pos + 1)
    return []


def _bench_test_label(row: dict):
    """Return the row's `test` label, synthesizing a `pp###` / `tg###` one when it is absent."""
    label = row.get("test")
    if label:
        return label

    def _positive(value) -> bool:
        return isinstance(value, int) and not isinstance(value, bool) and value > 0

    # Rows without a `test` field are labelled from their token counts so that prefix matching
    # on "pp" (prefill) / "tg" (decode) still works.
    parts = []
    if _positive(row.get("n_prompt")):
        parts.append(f"pp{row['n_prompt']}")
    if _positive(row.get("n_gen")):
        parts.append(f"tg{row['n_gen']}")
    return "+".join(parts) or "?"


def parse_llama_bench(stdout: str) -> List[dict]:
    """Parse `llama-bench -o json` output into [{test, n_cpu_moe, n_gpu_layers, avg_ts, model}, ...].

    The JSON array may be surrounded by other log lines. Non-object array entries are skipped.
    `test` is the row's own label when present; otherwise it is derived from `n_prompt` / `n_gen`
    as `pp<n>`, `tg<n>` or `pp<n>+tg<m>` (or "?" when neither is a positive int).

    Returns:
        One dict per row object; [] when no JSON array can be found or decoded.
    """
    out = []
    for r in _decode_bench_rows(stdout):
        if not isinstance(r, dict):
            continue
        out.append({
            "test": _bench_test_label(r),
            "n_cpu_moe": r.get("n_cpu_moe"),
            "n_gpu_layers": r.get("n_gpu_layers"),
            "avg_ts": r.get("avg_ts"),
            "model": r.get("model_filename") or r.get("model_type"),
        })
    return out
42
+
43
+
44
+ def _pick(rows: List[dict], prefix: str) -> Optional[float]:
45
+ """Average tok/s for the first row whose `test` starts with `prefix` (tg=decode, pp=prefill)."""
46
+ for r in rows:
47
+ t = str(r.get("test", ""))
48
+ if t.startswith(prefix) and r.get("avg_ts") is not None:
49
+ return float(r["avg_ts"])
50
+ return None
51
+
52
+
53
def build_receipt(
    plan: PlacementPlan,
    decode_tok_s: Optional[float],
    prefill_tok_s: Optional[float] = None,
    vram_used_mib: Optional[float] = None,
    method: Optional[str] = None,
    concentration: Optional[ConcentrationReport] = None,
    peaks: Optional[dict] = None,
) -> Receipt:
    """Combine a measured run with the plan's forecasts into a Receipt.

    Three comparisons are made against the plan:
      - realized efficiency: measured / roofline ceiling (percent),
      - decode error: (measured - calibrated forecast) / calibrated forecast (percent),
      - within_band: whether the measurement lies inside the plan's calibrated band.
    When the plan has no ceiling field set, the calibrated forecast stands in for the ceiling.

    `concentration` adds a routing de-risk note and the routing_* fields; `peaks` (a dict of
    watchdog peak readings) adds a safety note and the peak_* / envelope fields. Each comparison
    or note is skipped (left None / omitted) when its inputs are missing or falsy.
    """
    forecast = plan.predicted_decode_tok_s  # calibrated forecast
    roofline = plan.ceiling_decode_tok_s or plan.predicted_decode_tok_s  # roofline upper bound
    band_lo, band_hi = plan.predicted_band_low_tok_s, plan.predicted_band_high_tok_s

    error_pct = None
    if decode_tok_s and forecast:
        error_pct = round(100.0 * (decode_tok_s - forecast) / forecast, 1)

    efficiency_pct = None
    if decode_tok_s and roofline:
        efficiency_pct = round(100.0 * decode_tok_s / roofline, 1)

    in_band = None
    if decode_tok_s and band_lo is not None and band_hi is not None:
        in_band = band_lo <= decode_tok_s <= band_hi

    notes: List[str] = []

    if efficiency_pct is not None:
        notes.append(f"realized {efficiency_pct:.0f}% of the roofline ceiling ({decode_tok_s:.1f} of {roofline:.0f} tok/s) "
                     f"— this efficiency is the calibration seed for the next plan.")
        if decode_tok_s > roofline:
            notes.append("ANDON: measured EXCEEDS the ceiling — the bandwidth model is wrong (check "
                         "vram_bw / bytes_per_weight assumptions), not just inefficient.")

    if in_band is not None:
        outcome = "LANDED INSIDE" if in_band else "FELL OUTSIDE"
        followup = "loop closed" if in_band else "recalibrate: ingest this receipt and refit"
        notes.append(f"calibrated forecast {forecast:.1f} tok/s, band [{band_lo:.1f}, {band_hi:.1f}] — measured "
                     f"{decode_tok_s:.1f} {outcome} the band "
                     f"({followup}).")

    if plan.vram_used_mib and vram_used_mib:
        vram_delta_pct = 100.0 * (vram_used_mib - plan.vram_used_mib) / plan.vram_used_mib
        notes.append(f"VRAM predicted {plan.vram_used_mib:.0f} MiB vs measured {vram_used_mib:.0f} MiB "
                     f"({vram_delta_pct:+.0f}%).")

    if concentration is not None:
        if concentration.cache_helps:
            cache_view = "a per-expert cache COULD help this workload"
        else:
            cache_view = "routing is near-uniform — a per-expert cache would NOT help this workload"
        notes.append(f"routing de-risk: {cache_view} (need {concentration.hot_frac_for_coverage:.0%} of experts for "
                     f"{concentration.coverage_target:.0%} coverage; concentration {concentration.concentration_score:.2f}, "
                     f"top expert {concentration.top1_share:.1%}).")

    if peaks:
        readings = []
        if peaks.get("peak_host_mem_pct") is not None:
            readings.append(f"peak host-mem {peaks['peak_host_mem_pct']:.0f}%")
        if peaks.get("peak_gpu_power_pct") is not None:
            readings.append(f"peak power {peaks['peak_gpu_power_pct']:.0f}%")
        if peaks.get("peak_gpu_vram_used_mib") is not None:
            readings.append(f"peak VRAM {peaks['peak_gpu_vram_used_mib'] / 1024:.1f} GiB")

        stayed = peaks.get("stayed_within_envelope")
        if stayed is None:
            closing = "envelope recorded"
        elif stayed:
            closing = "stayed within the safety envelope"
        else:
            closing = "BREACHED the safety envelope — aborted mid-run"
        notes.append(f"safety: {', '.join(readings) or 'no peaks'} over {peaks.get('samples', 0)} watchdog polls "
                     f"— {closing}.")

    # A missing/empty `peaks` reads as "no value" for every envelope field below.
    envelope = peaks or {}
    return Receipt(
        runtime=plan.runtime, n_cpu_moe=plan.n_cpu_moe,
        measured_decode_tok_s=round(decode_tok_s, 2) if decode_tok_s else None,
        measured_prefill_tok_s=round(prefill_tok_s, 2) if prefill_tok_s else None,
        measured_vram_used_mib=round(vram_used_mib, 1) if vram_used_mib else None,
        predicted_decode_tok_s=forecast, ceiling_decode_tok_s=round(roofline, 2) if roofline else None,
        decode_error_pct=error_pct, realized_efficiency_pct=efficiency_pct, within_band=in_band,
        cleared_floor=(decode_tok_s >= plan.floor_tok_s) if decode_tok_s else None,
        routing_cache_helps=concentration.cache_helps if concentration else None,
        routing_hot_frac_for_coverage=round(concentration.hot_frac_for_coverage, 3) if concentration else None,
        routing_concentration=round(concentration.concentration_score, 3) if concentration else None,
        peak_gpu_power_pct=envelope.get("peak_gpu_power_pct"),
        peak_gpu_temp_c=envelope.get("peak_gpu_temp_c"),
        peak_gpu_vram_used_mib=envelope.get("peak_gpu_vram_used_mib"),
        peak_host_mem_pct=envelope.get("peak_host_mem_pct"),
        min_host_avail_mib=envelope.get("min_host_avail_mib"),
        safety_samples=envelope.get("samples"),
        stayed_within_envelope=envelope.get("stayed_within_envelope"),
        method=method, notes=notes,
    )
133
+
134
+
135
def plan_to_calibration_point(
    plan: PlacementPlan,
    measured_decode_tok_s: float,
    model_name: str,
    quant: Optional[str] = None,
    created: Optional[str] = None,
    rig: Optional[str] = None,
    source: Optional[str] = None,
) -> CalibrationPoint:
    """Turn a (plan, measured decode rate) pair into a CalibrationPoint for the store.

    The ceiling is read from the plan (falling back to its calibrated forecast, then 0.0), and the
    plan's bandwidth / context assumptions are copied across as provenance. Missing plan fields
    become 0 for the layer counts and None for the provenance values.
    """
    provenance = plan.assumptions or {}
    offloaded_layers = plan.n_cpu_moe or 0
    moe_layers = plan.n_moe_layers or 0
    ceiling = plan.ceiling_decode_tok_s or plan.predicted_decode_tok_s or 0.0
    return CalibrationPoint(
        model=model_name,
        quant=quant,
        n_cpu_moe=offloaded_layers,
        n_moe_layers=moe_layers,
        ceiling_tok_s=ceiling,
        measured_tok_s=measured_decode_tok_s,
        cpu_bw_gbps=provenance.get("cpu_mem_bw_gbps"),
        vram_bw_gbps=provenance.get("vram_bw_gbps"),
        ctx_len=provenance.get("ctx_len"),
        created=created,
        rig=rig,
        source=source,
    )