gpu-container 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ """gpu-container — a model-aware inference memory-placement planner for single-GPU rigs.
2
+
3
+ Phase 1 starts with the profiler (this package's `profiler` module): it emits the
4
+ profile JSON that every downstream component (planner, receipt) reads. Knowledge that
5
+ grounds the measurement methodology lives in the docker-knowledge KB
6
+ (readouts/docker-knowledge), seeded by the feasibility study-swarm.
7
+ """
8
+
9
+ __version__ = "0.1.0"
@@ -0,0 +1,60 @@
1
+ """Unified `gpu-container <command>` entry — the single-binary / npm-launcher dispatcher.
2
+
3
+ `pip install gpu-container` lays down five console scripts (`gpu-container-profile`, `-plan`,
4
+ `-receipt`, `-concentration`, `-watchdog`). This dispatcher exposes the same five as subcommands of
5
+ one `gpu-container` command, which is what the PyInstaller binary and the npm launcher run:
6
+
7
+ gpu-container profile ...
8
+ gpu-container plan ...
9
+ gpu-container watchdog run -- <cmd...>
10
+
11
+ Subcommand modules are imported lazily so a fast path (`--help`) never imports the whole package,
12
+ and so a frozen binary only pulls what the chosen command needs.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import sys
17
+ from typing import List, Optional
18
+
19
+ from . import __version__
20
+
21
+ _SUB = {
22
+ "profile": "gpu_container.profiler.cli",
23
+ "plan": "gpu_container.planner.cli",
24
+ "receipt": "gpu_container.planner.receipt_cli",
25
+ "concentration": "gpu_container.planner.concentration_cli",
26
+ "watchdog": "gpu_container.watchdog",
27
+ }
28
+
29
+ _USAGE = (
30
+ "usage: gpu-container <command> [args...]\n\n"
31
+ "commands:\n"
32
+ " profile profile the rig (+ model) -> profile.json\n"
33
+ " plan plan an MoE placement (llama.cpp --n-cpu-moe) from a profile\n"
34
+ " receipt verify a plan against a real llama-bench run\n"
35
+ " concentration per-expert-cache de-risk gate (routing concentration)\n"
36
+ " watchdog rig-safety control plane (monitor, or `run -- <cmd>`)\n\n"
37
+ "run `gpu-container <command> --help` for per-command help.\n"
38
+ "(each command is also installed standalone as `gpu-container-<command>`.)"
39
+ )
40
+
41
+
42
def main(argv: Optional[List[str]] = None) -> int:
    """Dispatch `gpu-container <command> [args...]` to the selected subcommand.

    `argv` defaults to `sys.argv[1:]`. Behaviour by first argument:

    - nothing, `-h` or `--help`: print the usage text to stdout, return 0.
    - `-V` or `--version`: print the static package version, return 0.
    - a key of `_SUB`: import that module on demand and return its `main(rest)` result.
    - anything else: print an error plus the usage text to stderr, return 2.
    """
    args = list(sys.argv[1:]) if argv is None else list(argv)
    if not args:
        print(_USAGE)
        return 0

    cmd, rest = args[0], args[1:]
    if cmd in ("-h", "--help"):
        print(_USAGE)
        return 0
    if cmd in ("-V", "--version"):
        # Static string: needs no importlib.metadata lookup, so it works in a frozen binary.
        print(__version__)
        return 0

    target = _SUB.get(cmd)
    if target is not None:
        # Imported only now, so the help/version fast paths never load a subcommand module.
        import importlib
        return importlib.import_module(target).main(rest)

    print(f"gpu-container: unknown command '{cmd}'\n\n{_USAGE}", file=sys.stderr)
    return 2


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,72 @@
1
+ """Structured errors + a CLI guard — the shipcheck B1/B3 contract.
2
+
3
+ Every user-facing failure carries a stable `{code, message, hint, cause?, retryable?}` shape
4
+ (`GpuContainerError`) and renders as a few clean lines — never a raw traceback. `guard()` wraps a
5
+ CLI's body so an *unexpected* exception also becomes one clean line + exit 2; the full traceback
6
+ appears only with `--debug`.
7
+
8
+ Error `code`s are namespaced and stable once released (treat like API). Prefixes:
9
+ INPUT_ bad user input / validation IO_ filesystem / paths
10
+ DEP_ a missing optional dependency RUNTIME_ unexpected failure
11
+ STATE_ corrupt / stale internal state
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import sys
16
+ from dataclasses import dataclass
17
+ from typing import Callable, List, Optional
18
+
19
+
20
@dataclass(unsafe_hash=True)
class GpuContainerError(Exception):
    """A user-facing error with a structured, stable shape.

    Raise it from a CLI body; `guard` renders it and returns `exit_code` (default 2).

    Fields:
        code: namespaced, stable identifier (e.g. ``INPUT_...``, ``IO_...``).
        message: what went wrong.
        hint: optional suggestion for the user.
        cause: optional description of the underlying cause.
        retryable: whether retrying the same operation may succeed.
        exit_code: process exit status `guard` returns for this error.

    `unsafe_hash=True` keeps the field-wise `==` the dataclass provides while restoring
    hashability; a plain `@dataclass` sets `__hash__` to None, which makes the exception unusable
    in sets and dict keys.
    """
    code: str
    message: str
    hint: Optional[str] = None
    cause: Optional[str] = None
    retryable: bool = False
    exit_code: int = 2

    def __post_init__(self) -> None:
        # The generated __init__ never calls Exception.__init__, so `args` would hold only the
        # positional arguments (none at all for keyword construction). pickle and copy rebuild an
        # exception as `cls(*args)`; storing every field positionally makes that round-trip work.
        super().__init__(self.code, self.message, self.hint, self.cause,
                         self.retryable, self.exit_code)

    def __str__(self) -> str:
        return f"{self.code}: {self.message}"

    def to_dict(self) -> dict:
        """Return the `{code, message, hint, cause, retryable}` shape (exit_code is not included)."""
        return {"code": self.code, "message": self.message, "hint": self.hint,
                "cause": self.cause, "retryable": self.retryable}

    def render(self) -> str:
        """Return the multi-line text for stderr: the error line, then any hint/cause/retryable lines."""
        lines = [f"ERROR [{self.code}]: {self.message}"]
        if self.hint:
            lines.append(f" hint: {self.hint}")
        if self.cause:
            lines.append(f" cause: {self.cause}")
        if self.retryable:
            lines.append(" retryable: yes")
        return "\n".join(lines)
47
+
48
+
49
def guard(run: Callable[[Optional[List[str]]], int], argv: Optional[List[str]] = None) -> int:
    """Run a CLI body, turning failures into clean structured output.

    Args:
        run: the CLI body; called as `run(argv)` with `argv` passed through unchanged.
        argv: the command line, or None to have the debug check read `sys.argv[1:]`.

    Returns:
        - whatever `run` returns, when it returns normally;
        - the error's `exit_code` after printing `render()` to stderr, for `GpuContainerError`;
        - 130 after a one-line message, for `KeyboardInterrupt`;
        - 2 after a one-line message plus a hint, for any other `Exception` without `--debug`.

    Raises:
        Any unexpected `Exception` is re-raised (full traceback) when `--debug` is present.
        `SystemExit` is not caught, so argparse's own usage exits pass through.

    `--debug` is only honoured before a bare `--`: arguments after it belong to a wrapped command
    (e.g. `watchdog run -- <cmd...>`) and must not switch this CLI into traceback mode.
    """
    own_args = list(sys.argv[1:] if argv is None else argv)
    if "--" in own_args:
        own_args = own_args[:own_args.index("--")]
    debug = "--debug" in own_args

    try:
        return run(argv)
    except GpuContainerError as e:
        # Expected, structured failure: render it, never show a trace.
        print(e.render(), file=sys.stderr)
        return e.exit_code
    except KeyboardInterrupt:
        print("ERROR [RUNTIME_INTERRUPTED]: interrupted by user", file=sys.stderr)
        return 130
    except Exception as e:  # noqa: BLE001 — deliberate: no raw stack without --debug (gate B3)
        if debug:
            raise
        print(f"ERROR [RUNTIME_UNEXPECTED]: {type(e).__name__}: {e}\n"
              " hint: re-run with --debug for the full traceback", file=sys.stderr)
        return 2
@@ -0,0 +1,17 @@
1
+ """The placement planner: a profile + model -> an explicit tiered placement plan for a runtime.
2
+
3
+ Phase-1 target: llama.cpp `--n-cpu-moe N` (the first N MoE layers' expert weights live in CPU
4
+ RAM and are computed on CPU; attention/router/shared/embeddings/head stay on the GPU — the
5
+ KTransformers-style "compute where the weights are", NOT per-token PCIe streaming; verified in
6
+ tensor-engine-knowledge). The planner finds the minimal N that fits VRAM, predicts the memory
7
+ map + decode throughput, and REFUSES below the >1 tok/s floor with a contrastive frame.
8
+ """
9
+ from .calibration import ( # noqa: F401
10
+ CalibrationModel,
11
+ CalibrationPoint,
12
+ CalibrationStore,
13
+ EfficiencyEstimate,
14
+ load_seed_points,
15
+ )
16
+ from .placement import DEFAULT_CPU_BW_GBPS, DEFAULT_VRAM_BW_GBPS, plan_llama_cpp # noqa: F401
17
+ from .receipt import build_receipt, parse_llama_bench, plan_to_calibration_point # noqa: F401
@@ -0,0 +1,225 @@
1
+ """Activation-trace concentration analysis — does per-expert caching even help?
2
+
3
+ The throughput half ([`calibration.py`]) turns receipts into a tok/s forecast. THIS module answers a
4
+ PRIOR, go/no-go question for the per-expert lane: given a captured activation trace (which experts
5
+ fired, per layer, over a representative workload), is the routing CONCENTRATED enough that a small
6
+ hot-expert VRAM cache would hit often — or is it so uniform that the cache (llama.cpp #20757) isn't
7
+ worth building?
8
+
9
+ It is the de-risk gate for ADR-0001 (`docs/decisions/0001-per-expert-cache-build-vs-upstream.md`):
10
+ build the runtime expert cache only where the trace shows a small fraction of experts captures most
11
+ of the routing.
12
+
13
+ Grounded in docker-knowledge wave-4 (moe-placement):
14
+ - Per-LAYER *total* activation is ~uniform — every token hits every layer's top-k experts — so the
15
+ signal is PER-EXPERT concentration WITHIN a layer, not which layer. Only the runtime cache can
16
+ exploit per-expert skew; `-ot` is per-layer (llamacpp-experts-fused-per-layer-not-per-expert).
17
+ - Skew is request-level and flattens to uniform across diverse prompts (MoE-Infinity,
18
+ arXiv:2401.14361) — so concentration is WORKLOAD-DEPENDENT: a trace is only valid for the
19
+ workload it was cut from. The report says so; a diverse-prompt trace reads LESS concentrated.
20
+ - The trace is an L×E count matrix captured via an eval-callback (activation-trace-via-eval-callback);
21
+ THIS module only CONSUMES it. None-not-guess: no trace -> no verdict (the planner stays per-layer).
22
+
23
+ Two measures, deliberately:
24
+ - `hot_frac_for_coverage` — the fraction of a layer's experts that must be resident to capture
25
+ `coverage_target` (default 90%) of its routing. The ACTIONABLE number; maps straight to #20757
26
+ `--moe-expert-cache-size`.
27
+ - `concentration_score = 1 - normalized_entropy` — a threshold-free [0,1] skew measure
28
+ (0 = uniform, 1 = one expert), robust to the arbitrary coverage target.
29
+
30
+ `cache_helps` is a convenience gate on the numbers, never a substitute for them.
31
+ """
32
+ from __future__ import annotations
33
+
34
+ import json
35
+ import math
36
+ from dataclasses import asdict, dataclass, field
37
+ from statistics import median
38
+ from typing import List, Optional
39
+
40
# Share of a layer's routing mass that the resident (hottest) experts must capture.
DEFAULT_COVERAGE_TARGET = 0.9

# Gate for `cache_helps`: when fewer than this fraction of a layer's experts already reach the
# coverage target, a hot-expert cache buys real VRAM back. Tunable; the underlying numbers are
# reported regardless of the gate.
DEFAULT_CACHE_HELPS_THRESHOLD = 0.5
44
+
45
+
46
@dataclass
class LayerActivation:
    """Routing counts for a single MoE layer.

    `expert_counts[i]` is the number of tokens routed to expert `i` in this layer.
    """
    layer_index: int
    expert_counts: List[int] = field(default_factory=list)

    @property
    def total(self) -> int:
        """Sum of all expert counts in this layer (0 for an empty list)."""
        counts = self.expert_counts
        return sum(counts)
55
+
56
+
57
@dataclass
class ActivationTrace:
    """A layers-by-experts activation trace captured over a workload.

    Only measured facts are stored here; concentration figures are derived from the counts by
    `analyze_concentration`, so a verdict can always be recomputed from the trace.
    """
    model: str
    num_experts: int        # routed experts per MoE layer
    experts_per_token: int  # top-k; each layer's total should be about n_tokens * k
    n_tokens: int           # decode tokens covered by the trace
    layers: "List[LayerActivation]" = field(default_factory=list)
    gate_weighted: bool = False    # True: counts are gate-mass-weighted; False: raw selections
    created: Optional[str] = None  # ISO date supplied by the caller
    rig: Optional[str] = None
    source: Optional[str] = None   # provenance: which workload / run produced the trace

    def to_dict(self) -> dict:
        """Return the trace as plain nested dicts/lists (via `dataclasses.asdict`)."""
        return asdict(self)

    def to_json(self, indent: int = 2) -> str:
        """Serialize `to_dict()` as JSON text, leaving non-ASCII characters unescaped."""
        payload = self.to_dict()
        return json.dumps(payload, indent=indent, ensure_ascii=False)

    @classmethod
    def from_dict(cls, d: dict) -> "ActivationTrace":
        """Build a trace from a dict, ignoring keys that are not dataclass fields.

        `layers` is always rebuilt from `d["layers"]`: entries that are not dicts are dropped and
        every count is converted with `int()`.
        """
        field_names = set(cls.__dataclass_fields__)  # type: ignore[attr-defined]
        kwargs = {}
        for key, value in d.items():
            if key in field_names:
                kwargs[key] = value

        rebuilt = []
        for entry in d.get("layers") or []:
            if not isinstance(entry, dict):
                continue
            rebuilt.append(
                LayerActivation(
                    layer_index=entry.get("layer_index"),
                    expert_counts=[int(c) for c in (entry.get("expert_counts") or [])],
                )
            )
        kwargs["layers"] = rebuilt
        return cls(**kwargs)

    @classmethod
    def from_json(cls, s: str) -> "ActivationTrace":
        """Parse JSON text and build a trace from the result with `from_dict`."""
        parsed = json.loads(s)
        return cls.from_dict(parsed)
94
+
95
+
96
@dataclass
class LayerConcentration:
    """Concentration figures derived for one MoE layer."""
    layer_index: int
    # Sum of the layer's expert counts; expected to be about n_tokens * top_k.
    total_mass: int
    # Share of the routing mass taken by the single hottest expert.
    top1_share: float
    # Fraction of the layer's experts needed to reach the coverage target.
    hot_frac_for_coverage: float
    # 1 - normalized entropy: 0 means uniform routing, 1 means fully concentrated.
    concentration_score: float
103
+
104
+
105
@dataclass
class ConcentrationReport:
    """The de-risk verdict for one trace: would a per-expert cache help for this workload?"""
    model: str
    num_experts: int
    n_layers: int                 # number of layers with routing mass that were analyzed
    n_tokens: int
    coverage_target: float
    threshold: float
    cache_helps: bool
    hot_frac_for_coverage: float  # median over layers: the headline cache-size number
    concentration_score: float    # mean over layers
    top1_share: float             # median over layers
    per_layer: "List[LayerConcentration]" = field(default_factory=list)
    basis: str = ""
    notes: List[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the report, including per-layer entries, as plain dicts/lists."""
        return asdict(self)

    def to_json(self, indent: int = 2) -> str:
        """Serialize `to_dict()` as JSON text, leaving non-ASCII characters unescaped."""
        payload = self.to_dict()
        return json.dumps(payload, indent=indent, ensure_ascii=False)
127
+
128
+
129
+ def _normalized_entropy(counts: List[int]) -> float:
130
+ """Shannon entropy of the routing distribution, normalized to [0,1] by log(E).
131
+
132
+ 1.0 = perfectly uniform (every expert equally used); 0.0 = all mass on one expert. Caller
133
+ guarantees total > 0. A single expert (or a single active expert) is fully concentrated (0.0)."""
134
+ total = sum(counts)
135
+ nz = [c for c in counts if c > 0]
136
+ E = len(counts)
137
+ if E <= 1 or len(nz) <= 1:
138
+ return 0.0
139
+ H = -sum((c / total) * math.log(c / total) for c in nz)
140
+ return H / math.log(E)
141
+
142
+
143
+ def _hot_frac_for_coverage(counts: List[int], target: float) -> float:
144
+ """Fraction of experts (resident, hottest-first) needed to capture `target` of routing mass.
145
+
146
+ Low = concentrated (a small cache covers most routing); ~target = uniform (no cache win)."""
147
+ total = sum(counts)
148
+ E = len(counts)
149
+ if total <= 0 or E <= 0:
150
+ return 1.0
151
+ need = target * total
152
+ cum = 0
153
+ for i, c in enumerate(sorted(counts, reverse=True), start=1):
154
+ cum += c
155
+ if cum >= need:
156
+ return i / E
157
+ return 1.0
158
+
159
+
160
def analyze_layer(layer: "LayerActivation", coverage_target: float) -> "Optional[LayerConcentration]":
    """Compute one layer's concentration figures.

    Returns None when the layer has no counts or its total is not positive, so that such a layer
    is skipped instead of being given a made-up value.
    """
    counts = layer.expert_counts
    mass = sum(counts)
    if not counts or mass <= 0:
        return None

    hottest_share = max(counts) / mass
    hot_frac = _hot_frac_for_coverage(counts, coverage_target)
    score = 1.0 - _normalized_entropy(counts)
    return LayerConcentration(
        layer_index=layer.layer_index,
        total_mass=mass,
        top1_share=hottest_share,
        hot_frac_for_coverage=hot_frac,
        concentration_score=score,
    )
173
+
174
+
175
def analyze_concentration(
    trace: "ActivationTrace",
    coverage_target: float = DEFAULT_COVERAGE_TARGET,
    cache_helps_threshold: float = DEFAULT_CACHE_HELPS_THRESHOLD,
) -> "ConcentrationReport":
    """Aggregate a trace into the per-expert-cache de-risk verdict.

    Layers for which `analyze_layer` returns None are left out. When no layer remains, the report
    has `n_layers=0` and `cache_helps=False` and carries a note asking for a real trace. Otherwise
    the headline figures are the median hot fraction, the mean concentration score and the median
    top-1 share over the analyzed layers, and `cache_helps` is true when the median hot fraction
    is below `cache_helps_threshold`.
    """
    analyzed = []
    for layer in trace.layers:
        result = analyze_layer(layer, coverage_target)
        if result is not None:
            analyzed.append(result)

    if not analyzed:
        return ConcentrationReport(
            model=trace.model,
            num_experts=trace.num_experts,
            n_layers=0,
            n_tokens=trace.n_tokens,
            coverage_target=coverage_target,
            threshold=cache_helps_threshold,
            cache_helps=False,
            hot_frac_for_coverage=1.0,
            concentration_score=0.0,
            top1_share=0.0,
            per_layer=[],
            basis="no layers with routing mass — cannot assess (treated as 'cache not justified')",
            notes=["empty or zero-mass trace — capture a real workload trace before deciding"],
        )

    n_analyzed = len(analyzed)
    hot = float(median([c.hot_frac_for_coverage for c in analyzed]))
    conc = sum(c.concentration_score for c in analyzed) / n_analyzed
    top1 = float(median([c.top1_share for c in analyzed]))
    cache_helps = hot < cache_helps_threshold

    notes: List[str] = []
    # Sanity check: per-layer totals are expected to be roughly equal. A spread above 20% of the
    # largest total points at a malformed trace or at wrong experts_per_token / n_tokens values.
    masses = [c.total_mass for c in analyzed]
    heaviest = max(masses)
    lightest = min(masses)
    if heaviest and (heaviest - lightest) / heaviest > 0.2:
        notes.append("per-layer totals vary >20% — check experts_per_token/n_tokens, or layers may "
                     "carry differing expert counts")
    notes.append("concentration is WORKLOAD-DEPENDENT: request-level skew flattens across diverse "
                 "prompts (MoE-Infinity); this verdict is valid only for the workload this trace covers")

    basis = (f"{n_analyzed} layers; hot_frac = median experts for {coverage_target:.0%} routing "
             "coverage; concentration = 1 - normalized_entropy (mean); "
             f"cache_helps = hot_frac < {cache_helps_threshold:.0%}")

    return ConcentrationReport(
        model=trace.model,
        num_experts=trace.num_experts,
        n_layers=n_analyzed,
        n_tokens=trace.n_tokens,
        coverage_target=coverage_target,
        threshold=cache_helps_threshold,
        cache_helps=cache_helps,
        hot_frac_for_coverage=hot,
        concentration_score=conc,
        top1_share=top1,
        per_layer=analyzed,
        basis=basis,
        notes=notes,
    )
217
+
218
+
219
def load_trace(path) -> "Optional[ActivationTrace]":
    """Load a trace JSON file and return it as an `ActivationTrace`.

    Returns None instead of raising when the file cannot be read, is not valid JSON, does not
    hold a JSON object at its root, or holds values `ActivationTrace.from_dict` rejects.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # A list / string / number root has no `.items()`; from_dict would raise AttributeError,
        # which the handler below deliberately does not cover.
        if not isinstance(data, dict):
            return None
        return ActivationTrace.from_dict(data)
    except (OSError, ValueError, TypeError, OverflowError):
        # OverflowError: json accepts `Infinity`, and `int(float("inf"))` raises it when the
        # counts are converted.
        return None
@@ -0,0 +1,224 @@
1
+ """Receipt-driven recalibration — turn measured receipts into a calibrated forecast.
2
+
3
+ The planner emits a roofline CEILING (peak bandwidth, zero overhead): a true upper bound on
4
+ decode tok/s, but real decode runs at a fraction of it (Qwen3-30B-A3B realized 41% in-VRAM,
5
+ 56-61% offloaded — milestone 2-3 live receipt). This module closes that static-prediction gap:
6
+
7
+ receipt -> CalibrationPoint (realized efficiency at a known shape)
8
+ -> CalibrationStore (a JSON dir; append-only, auditable)
9
+ -> CalibrationModel (efficiency = f(regime, offload-fraction), with a band)
10
+ -> planner emits ceiling x efficiency +/- band (the calibrated forecast)
11
+
12
+ Two regimes, because they are bound by different things (placement.py `basis`):
13
+ - `in_vram` (N = 0): overhead-bound — small-active MoE spends most of its time NOT moving
14
+ bytes, so realized efficiency is low (~41%) and roughly flat.
15
+ - `offload` (N > 0): CPU-RAM-bandwidth-bound — the roofline fits better (~56-61%) and tracks
16
+ the offload fraction N / n_moe_layers.
17
+
18
+ Sparse-data-honest by construction: we bucket by regime and interpolate within `offload` only
19
+ when there are >= 2 distinct offload fractions; otherwise we report the regime's central
20
+ efficiency. The band never narrows below +/-`default_margin` (we cannot claim more confidence
21
+ than the data supports), and it always contains every observed point in the regime. With NO
22
+ points for a regime, `estimate()` returns None and the planner falls back to the raw ceiling --
23
+ the honest "uncalibrated" path. This mirrors the feasibility verdict's calibration #2: the
24
+ +/-10% receipt is scoped to the regimes we have measured; everywhere else is ceiling + band.
25
+
26
+ The model never grades its own forecast: the points come from llama-bench (a real GPU run, a
27
+ DIFFERENT mechanism than the planner's closed form) -- the EXTERNAL_VERIFIER discipline.
28
+ """
29
+ from __future__ import annotations
30
+
31
+ import json
32
+ import os
33
+ from collections import defaultdict
34
+ from dataclasses import asdict, dataclass
35
+ from statistics import median
36
+ from typing import Iterable, List, Optional
37
+
38
+ # Bundled seed: the measured Qwen3-30B-A3B receipts that ship with the package so a known shape
39
+ # is calibrated out-of-the-box. Lives next to this module.
40
+ _SEED_PATH = os.path.join(os.path.dirname(__file__), "calibration_seed.json")
41
+
42
+ DEFAULT_MARGIN = 0.25 # +/-25% efficiency band (feasibility #11: heavy offload can miss 2-3x; this
43
+ # is the *calibrated* band, far tighter than that worst case, but never tighter
44
+ # than the data supports)
45
+
46
+
47
@dataclass
class CalibrationPoint:
    """One receipt's measured throughput, tagged with the shape it was measured at.

    Only measured facts are stored (ceiling, measured tok/s, the N/L shape, the bandwidth
    assumptions). `efficiency`, `regime` and `offload_fraction` are derived properties, so a point
    stays auditable and re-derivable.
    """
    model: str
    n_cpu_moe: int
    n_moe_layers: int
    ceiling_tok_s: float
    measured_tok_s: float
    quant: Optional[str] = None
    cpu_bw_gbps: Optional[float] = None
    vram_bw_gbps: Optional[float] = None
    ctx_len: Optional[int] = None
    created: Optional[str] = None  # ISO date supplied by the caller
    rig: Optional[str] = None
    source: Optional[str] = None   # provenance: which run / receipt

    @property
    def regime(self) -> str:
        """`"in_vram"` when no MoE layer is offloaded (`n_cpu_moe == 0`), else `"offload"`."""
        return "in_vram" if self.n_cpu_moe == 0 else "offload"

    @property
    def offload_fraction(self) -> float:
        """`n_cpu_moe / n_moe_layers`, or 0.0 when `n_moe_layers` is zero/None."""
        return (self.n_cpu_moe / self.n_moe_layers) if self.n_moe_layers else 0.0

    @property
    def efficiency(self) -> Optional[float]:
        """`measured_tok_s / ceiling_tok_s`, or None when the ceiling is zero/None."""
        if not self.ceiling_tok_s:
            return None
        return self.measured_tok_s / self.ceiling_tok_s

    def to_dict(self) -> dict:
        """Return the stored fields as a plain dict (derived properties are not included)."""
        return asdict(self)

    @staticmethod
    def _as_int(value):
        """Return `value` as an int; reject non-integral floats instead of truncating them."""
        if isinstance(value, int):
            return value
        if isinstance(value, float):
            if not value.is_integer():
                raise ValueError(f"expected an integer, got {value!r}")
            return int(value)
        return int(value)

    @staticmethod
    def _as_float(value):
        """Return `value` unchanged if it is already a number, else convert it with `float()`."""
        return value if isinstance(value, (int, float)) else float(value)

    @classmethod
    def from_dict(cls, d: dict) -> "CalibrationPoint":
        """Build a point from a dict, ignoring unknown keys and normalising numeric fields.

        Numeric fields given as strings are converted. Without this, `"0"` for `n_cpu_moe` would
        compare unequal to 0 and be classified as `offload`, and a string ceiling would only fail
        later inside `efficiency`. `None` values are kept as they are.

        Raises:
            TypeError: a required field is missing, or a numeric field has an unconvertible type.
            ValueError: a numeric field holds text that is not a number, or a non-integral value
                where an integer is required.
        """
        known = cls.__dataclass_fields__  # type: ignore[attr-defined]
        kw = {k: v for k, v in d.items() if k in known}
        for name in ("n_cpu_moe", "n_moe_layers", "ctx_len"):
            if kw.get(name) is not None:
                kw[name] = cls._as_int(kw[name])
        for name in ("ceiling_tok_s", "measured_tok_s", "cpu_bw_gbps", "vram_bw_gbps"):
            if kw.get(name) is not None:
                kw[name] = cls._as_float(kw[name])
        return cls(**kw)
89
+
90
+
91
@dataclass
class EfficiencyEstimate:
    """A calibrated efficiency (measured / ceiling) for one shape, together with its band."""
    # Central estimate.
    efficiency: float
    # Lower edge of the band, in efficiency units.
    low: float
    # Upper edge of the band, in efficiency units; capped at 1.0 (the ceiling cannot be beaten).
    high: float
    n_samples: int
    regime: str
    # Human-readable provenance of the estimate.
    basis: str
100
+
101
+
102
class CalibrationStore:
    """Append-only JSON-directory persistence for calibration points.

    Each added point becomes one `.json` file. A file may also hold a list of points, and
    `points()` reads every `.json` file in the directory in sorted name order.
    """

    def __init__(self, path: str):
        self.path = path

    @staticmethod
    def _slug(text: str, keep: str = "") -> str:
        """Replace every character that is neither alphanumeric nor in `keep` with `-`."""
        return "".join(c if (c.isalnum() or c in keep) else "-" for c in text)

    def add(self, point: "CalibrationPoint", filename: Optional[str] = None) -> str:
        """Write `point` into the store directory and return the path of the file written.

        With an explicit `filename` the file is written as given, replacing any existing file of
        that name. Without one, a name is derived from the point's date, model and `n_cpu_moe`
        and the file is created exclusively: if that name is taken, `_2`, `_3`, ... is appended,
        so a second receipt with the same date/model/N never overwrites the first.
        """
        os.makedirs(self.path, exist_ok=True)
        # Serialize before opening anything, so a failing point never leaves an empty file.
        text = json.dumps(point.to_dict(), indent=2, ensure_ascii=False)

        if filename is not None:
            dest = os.path.join(self.path, filename)
            with open(dest, "w", encoding="utf-8") as f:
                f.write(text)
            return dest

        # Name from the shape + provenance (no clock dependency). Path separators and other
        # unsafe characters in the date stamp are replaced so it cannot escape the directory.
        stamp = self._slug(str(point.created or "nodate"), keep="-_.+")
        safe_model = self._slug(point.model)[:40]
        stem = f"{stamp}_{safe_model}_n{point.n_cpu_moe}"
        attempt = 1
        while True:
            name = f"{stem}.json" if attempt == 1 else f"{stem}_{attempt}.json"
            dest = os.path.join(self.path, name)
            try:
                # "x" fails if the file already exists, so the name check and the creation are
                # a single step.
                with open(dest, "x", encoding="utf-8") as f:
                    f.write(text)
            except FileExistsError:
                attempt += 1
                continue
            return dest

    def points(self) -> "List[CalibrationPoint]":
        """Return every point found in the directory's `.json` files ([] if it does not exist)."""
        out: list = []
        if not os.path.isdir(self.path):
            return out
        for name in sorted(os.listdir(self.path)):
            if not name.endswith(".json"):
                continue
            out.extend(_load_points_file(os.path.join(self.path, name)))
        return out
134
+
135
+
136
+ def _load_points_file(path: str) -> List[CalibrationPoint]:
137
+ """Load a JSON file holding either one point (dict) or many (list). Never raises on bad data."""
138
+ try:
139
+ with open(path, "r", encoding="utf-8") as f:
140
+ data = json.load(f)
141
+ except (OSError, ValueError):
142
+ return []
143
+ records = data if isinstance(data, list) else [data]
144
+ out: List[CalibrationPoint] = []
145
+ for rec in records:
146
+ if isinstance(rec, dict):
147
+ try:
148
+ out.append(CalibrationPoint.from_dict(rec))
149
+ except (TypeError, ValueError):
150
+ continue
151
+ return out
152
+
153
+
154
def load_seed_points() -> "List[CalibrationPoint]":
    """Return the calibration points read from the seed file bundled at `_SEED_PATH`."""
    seed_file = _SEED_PATH
    return _load_points_file(seed_file)
157
+
158
+
159
class CalibrationModel:
    """Estimates realized efficiency from calibration points, bucketed by regime.

    `estimate(regime, offload_fraction)` returns an `EfficiencyEstimate`, or None when no point
    exists for that regime. In the `offload` regime with at least two distinct offload fractions
    the central value is interpolated piecewise-linearly over the fraction; otherwise it is the
    regime's median efficiency. The band is +/-`margin`, widened to the largest relative deviation
    of any observed point, with its upper edge capped at 1.0.
    """

    def __init__(self, points: "Iterable[CalibrationPoint]", margin: float = DEFAULT_MARGIN):
        # Points without a computable efficiency carry no information and are dropped up front.
        usable = []
        for point in points:
            if point.efficiency is not None:
                usable.append(point)
        self.points = usable
        self.margin = margin

    @classmethod
    def from_seed(cls, extra: "Optional[Iterable[CalibrationPoint]]" = None,
                  margin: float = DEFAULT_MARGIN) -> "CalibrationModel":
        """Build a model from the bundled seed points, plus `extra` points when given."""
        combined = list(load_seed_points())
        if extra:
            combined.extend(extra)
        return cls(combined, margin=margin)

    def has_data(self) -> bool:
        """Return True when at least one usable point is held."""
        return bool(self.points)

    def estimate(self, regime: str, offload_fraction: float = 0.0) -> "Optional[EfficiencyEstimate]":
        """Return the calibrated efficiency and band for `regime`, or None without data for it."""
        in_regime = [p for p in self.points if p.regime == regime]
        if not in_regime:
            return None
        observed = [p.efficiency for p in in_regime]  # non-None: filtered in __init__
        n_receipts = len(in_regime)

        # One curve point per offload fraction: repeated runs at the same fraction are averaged.
        buckets = defaultdict(list)
        for p in in_regime:
            buckets[round(p.offload_fraction, 4)].append(p.efficiency)
        curve = sorted((frac, sum(vals) / len(vals)) for frac, vals in buckets.items())

        can_interpolate = regime == "offload" and len(curve) >= 2
        if can_interpolate:
            central = _interp(curve, offload_fraction)
            basis = (f"calibrated: piecewise-linear over {len(curve)} offload fractions "
                     f"({n_receipts} receipt(s)) at frac={offload_fraction:.2f}")
        else:
            central = float(median(observed))
            basis = f"calibrated: median of {n_receipts} '{regime}' receipt(s)"

        # Band: never tighter than +/-margin, and wide enough for the farthest observed point.
        if central:
            spread = max((abs(e - central) / central for e in observed), default=0.0)
        else:
            spread = 0.0
        rel = max(self.margin, spread)
        low = max(central * (1.0 - rel), 1e-4)
        high = min(central * (1.0 + rel), 1.0)
        return EfficiencyEstimate(
            efficiency=central,
            low=low,
            high=high,
            n_samples=n_receipts,
            regime=regime,
            basis=basis,
        )
211
+
212
+
213
+ def _interp(curve: List[tuple], x: float) -> float:
214
+ """Piecewise-linear interpolation over a sorted [(x, y), ...] curve, clamped at both ends
215
+ (NO extrapolation -- beyond the measured fractions we hold the nearest observed efficiency)."""
216
+ if x <= curve[0][0]:
217
+ return curve[0][1]
218
+ if x >= curve[-1][0]:
219
+ return curve[-1][1]
220
+ for (x0, y0), (x1, y1) in zip(curve, curve[1:]):
221
+ if x0 <= x <= x1:
222
+ t = (x - x0) / (x1 - x0) if x1 > x0 else 0.0
223
+ return y0 + t * (y1 - y0)
224
+ return curve[-1][1]
@@ -0,0 +1,44 @@
1
+ [
2
+ {
3
+ "model": "Qwen3-30B-A3B",
4
+ "n_cpu_moe": 0,
5
+ "n_moe_layers": 48,
6
+ "ceiling_tok_s": 737.56,
7
+ "measured_tok_s": 302.4,
8
+ "quant": "gguf-q4_k_m",
9
+ "cpu_bw_gbps": 40.7,
10
+ "vram_bw_gbps": 1790.0,
11
+ "ctx_len": 4096,
12
+ "created": "2026-06-04",
13
+ "rig": "RTX 5090 (sm_120) WSL2 Docker, driver 610.47, CUDA 12.8",
14
+ "source": "milestone 2-3 live receipt: Qwen3-30B-A3B Q4_K_M, llama-bench -p 512 -n 128"
15
+ },
16
+ {
17
+ "model": "Qwen3-30B-A3B",
18
+ "n_cpu_moe": 24,
19
+ "n_moe_layers": 48,
20
+ "ceiling_tok_s": 69.02,
21
+ "measured_tok_s": 41.9,
22
+ "quant": "gguf-q4_k_m",
23
+ "cpu_bw_gbps": 40.7,
24
+ "vram_bw_gbps": 1790.0,
25
+ "ctx_len": 4096,
26
+ "created": "2026-06-04",
27
+ "rig": "RTX 5090 (sm_120) WSL2 Docker, driver 610.47, CUDA 12.8",
28
+ "source": "milestone 2-3 live receipt: Qwen3-30B-A3B Q4_K_M, llama-bench -p 512 -n 128"
29
+ },
30
+ {
31
+ "model": "Qwen3-30B-A3B",
32
+ "n_cpu_moe": 48,
33
+ "n_moe_layers": 48,
34
+ "ceiling_tok_s": 36.2,
35
+ "measured_tok_s": 20.4,
36
+ "quant": "gguf-q4_k_m",
37
+ "cpu_bw_gbps": 40.7,
38
+ "vram_bw_gbps": 1790.0,
39
+ "ctx_len": 4096,
40
+ "created": "2026-06-04",
41
+ "rig": "RTX 5090 (sm_120) WSL2 Docker, driver 610.47, CUDA 12.8",
42
+ "source": "milestone 2-3 live receipt: Qwen3-30B-A3B Q4_K_M, llama-bench -p 512 -n 128"
43
+ }
44
+ ]