gpu-container 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpu_container/__init__.py +9 -0
- gpu_container/__main__.py +60 -0
- gpu_container/errors.py +72 -0
- gpu_container/planner/__init__.py +17 -0
- gpu_container/planner/activation.py +225 -0
- gpu_container/planner/calibration.py +224 -0
- gpu_container/planner/calibration_seed.json +44 -0
- gpu_container/planner/cli.py +101 -0
- gpu_container/planner/concentration_cli.py +120 -0
- gpu_container/planner/placement.py +198 -0
- gpu_container/planner/receipt.py +155 -0
- gpu_container/planner/receipt_cli.py +143 -0
- gpu_container/profiler/__init__.py +24 -0
- gpu_container/profiler/baseline.py +122 -0
- gpu_container/profiler/cli.py +151 -0
- gpu_container/profiler/cuda_bench.py +306 -0
- gpu_container/profiler/hardware.py +304 -0
- gpu_container/profiler/model.py +178 -0
- gpu_container/profiler/nvme_bench.py +158 -0
- gpu_container/profiler/schema.py +245 -0
- gpu_container/watchdog.py +563 -0
- gpu_container-0.1.0.dist-info/METADATA +100 -0
- gpu_container-0.1.0.dist-info/RECORD +26 -0
- gpu_container-0.1.0.dist-info/WHEEL +4 -0
- gpu_container-0.1.0.dist-info/entry_points.txt +7 -0
- gpu_container-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""gpu-container — a model-aware inference memory-placement planner for single-GPU rigs.
|
|
2
|
+
|
|
3
|
+
Phase 1 starts with the profiler (this package's `profiler` module): it emits the
|
|
4
|
+
profile JSON that every downstream component (planner, receipt) reads. Knowledge that
|
|
5
|
+
grounds the measurement methodology lives in the docker-knowledge KB
|
|
6
|
+
(readouts/docker-knowledge), seeded by the feasibility study-swarm.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Unified `gpu-container <command>` entry — the single-binary / npm-launcher dispatcher.
|
|
2
|
+
|
|
3
|
+
`pip install gpu-container` lays down five console scripts (`gpu-container-profile`, `-plan`,
|
|
4
|
+
`-receipt`, `-concentration`, `-watchdog`). This dispatcher exposes the same five as subcommands of
|
|
5
|
+
one `gpu-container` command, which is what the PyInstaller binary and the npm launcher run:
|
|
6
|
+
|
|
7
|
+
gpu-container profile ...
|
|
8
|
+
gpu-container plan ...
|
|
9
|
+
gpu-container watchdog run -- <cmd...>
|
|
10
|
+
|
|
11
|
+
Subcommand modules are imported lazily so a fast path (`--help`) never imports the whole package,
|
|
12
|
+
and so a frozen binary only pulls what the chosen command needs.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import sys
|
|
17
|
+
from typing import List, Optional
|
|
18
|
+
|
|
19
|
+
from . import __version__
|
|
20
|
+
|
|
21
|
+
# Subcommand name -> dotted path of the module implementing it. `main` looks the command up here,
# imports the module on demand, and calls that module's `main(argv)`.
_SUB = {
    "profile": "gpu_container.profiler.cli",
    "plan": "gpu_container.planner.cli",
    "receipt": "gpu_container.planner.receipt_cli",
    "concentration": "gpu_container.planner.concentration_cli",
    "watchdog": "gpu_container.watchdog",
}

# Top-level help text. `main` prints it to stdout for no arguments / `-h` / `--help`, and appends it
# to the "unknown command" message on stderr.
_USAGE = (
    "usage: gpu-container <command> [args...]\n\n"
    "commands:\n"
    " profile profile the rig (+ model) -> profile.json\n"
    " plan plan an MoE placement (llama.cpp --n-cpu-moe) from a profile\n"
    " receipt verify a plan against a real llama-bench run\n"
    " concentration per-expert-cache de-risk gate (routing concentration)\n"
    " watchdog rig-safety control plane (monitor, or `run -- <cmd>`)\n\n"
    "run `gpu-container <command> --help` for per-command help.\n"
    "(each command is also installed standalone as `gpu-container-<command>`.)"
)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Dispatch `gpu-container <command> [args...]` to the module registered in `_SUB`.

    Args:
        argv: argument list without the program name; `None` means `sys.argv[1:]`.

    Returns:
        0 after printing usage (no arguments, `-h`, `--help`) or the version (`-V`, `--version`);
        2 after reporting an unknown command on stderr; otherwise whatever the selected
        subcommand module's `main(rest)` returns.
    """
    argv = list(argv) if argv is not None else list(sys.argv[1:])

    wants_help = not argv or argv[0] in ("-h", "--help")
    if wants_help:
        print(_USAGE)
        return 0

    if argv[0] in ("-V", "--version"):
        # static — works in a frozen binary (no importlib.metadata lookup)
        print(__version__)
        return 0

    cmd = argv[0]
    rest = argv[1:]
    module_path = _SUB.get(cmd)
    if module_path is not None:
        # Imported only once a valid command is known, so the help/version paths stay light.
        from importlib import import_module
        return import_module(module_path).main(rest)

    print(f"gpu-container: unknown command '{cmd}'\n\n{_USAGE}", file=sys.stderr)
    return 2
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Executed as a script / `python -m gpu_container`: exit the process with main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())
|
gpu_container/errors.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Structured errors + a CLI guard — the shipcheck B1/B3 contract.
|
|
2
|
+
|
|
3
|
+
Every user-facing failure carries a stable `{code, message, hint, cause?, retryable?}` shape
|
|
4
|
+
(`GpuContainerError`) and renders as a few clean lines — never a raw traceback. `guard()` wraps a
|
|
5
|
+
CLI's body so an *unexpected* exception also becomes one clean line + exit 2; the full traceback
|
|
6
|
+
appears only with `--debug`.
|
|
7
|
+
|
|
8
|
+
Error `code`s are namespaced and stable once released (treat like API). Prefixes:
|
|
9
|
+
INPUT_ bad user input / validation IO_ filesystem / paths
|
|
10
|
+
DEP_ a missing optional dependency RUNTIME_ unexpected failure
|
|
11
|
+
STATE_ corrupt / stale internal state
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import sys
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from typing import Callable, List, Optional
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class GpuContainerError(Exception):
    """User-facing failure with a stable, structured shape.

    Raised from a CLI body; `guard` prints `render()` to stderr and returns `exit_code`
    (default 2 — distinct from the planners' verdict codes).
    """
    code: str                    # namespaced, stable identifier (INPUT_, IO_, DEP_, ...)
    message: str                 # what went wrong
    hint: Optional[str] = None   # what the user can do about it
    cause: Optional[str] = None  # underlying reason, when known
    retryable: bool = False
    exit_code: int = 2           # not part of `to_dict()`; used only as the process exit status

    def __str__(self) -> str:
        return "{}: {}".format(self.code, self.message)

    def to_dict(self) -> dict:
        """Return the `{code, message, hint, cause, retryable}` mapping (no `exit_code`)."""
        payload = {}
        for key in ("code", "message", "hint", "cause", "retryable"):
            payload[key] = getattr(self, key)
        return payload

    def render(self) -> str:
        """Return the multi-line, human-readable form: headline first, then optional details."""
        text = f"ERROR [{self.code}]: {self.message}"
        if self.hint:
            text = text + "\n" + f" hint: {self.hint}"
        if self.cause:
            text = text + "\n" + f" cause: {self.cause}"
        if self.retryable:
            text = text + "\n" + " retryable: yes"
        return text
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def guard(run: Callable[[Optional[List[str]]], int], argv: Optional[List[str]] = None) -> int:
    """Execute a CLI body and convert failures into clean stderr output plus an exit code.

    Args:
        run: the CLI body; called as `run(argv)`.
        argv: forwarded to `run` unchanged. Also scanned for `--debug`; when `None`,
            `sys.argv[1:]` is scanned instead.

    Returns:
        - `run`'s own return value when it completes normally;
        - the error's `exit_code` for a `GpuContainerError` (rendered, no traceback);
        - 130 for `KeyboardInterrupt`;
        - 2 for any other `Exception` when `--debug` is absent.

    Raises:
        Any non-`GpuContainerError` `Exception` from `run` is re-raised when `--debug` is present.
        `SystemExit` (argparse's usage exits) is not an `Exception` subclass and passes through.
    """
    scanned = sys.argv[1:] if argv is None else argv
    debug = "--debug" in scanned

    try:
        status = run(argv)
    except GpuContainerError as err:
        print(err.render(), file=sys.stderr)
        return err.exit_code
    except KeyboardInterrupt:
        print("ERROR [RUNTIME_INTERRUPTED]: interrupted by user", file=sys.stderr)
        return 130
    except Exception as e:  # noqa: BLE001 — deliberate: no raw stack without --debug (gate B3)
        if not debug:
            print(f"ERROR [RUNTIME_UNEXPECTED]: {type(e).__name__}: {e}\n"
                  " hint: re-run with --debug for the full traceback", file=sys.stderr)
            return 2
        raise
    return status
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""The placement planner: a profile + model -> an explicit tiered placement plan for a runtime.
|
|
2
|
+
|
|
3
|
+
Phase-1 target: llama.cpp `--n-cpu-moe N` (the first N MoE layers' expert weights live in CPU
|
|
4
|
+
RAM and are computed on CPU; attention/router/shared/embeddings/head stay on the GPU — the
|
|
5
|
+
KTransformers-style "compute where the weights are", NOT per-token PCIe streaming; verified in
|
|
6
|
+
tensor-engine-knowledge). The planner finds the minimal N that fits VRAM, predicts the memory
|
|
7
|
+
map + decode throughput, and REFUSES below the >1 tok/s floor with a contrastive frame.
|
|
8
|
+
"""
|
|
9
|
+
from .calibration import ( # noqa: F401
|
|
10
|
+
CalibrationModel,
|
|
11
|
+
CalibrationPoint,
|
|
12
|
+
CalibrationStore,
|
|
13
|
+
EfficiencyEstimate,
|
|
14
|
+
load_seed_points,
|
|
15
|
+
)
|
|
16
|
+
from .placement import DEFAULT_CPU_BW_GBPS, DEFAULT_VRAM_BW_GBPS, plan_llama_cpp # noqa: F401
|
|
17
|
+
from .receipt import build_receipt, parse_llama_bench, plan_to_calibration_point # noqa: F401
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
"""Activation-trace concentration analysis — does per-expert caching even help?
|
|
2
|
+
|
|
3
|
+
The throughput half ([`calibration.py`]) turns receipts into a tok/s forecast. THIS module answers a
|
|
4
|
+
PRIOR, go/no-go question for the per-expert lane: given a captured activation trace (which experts
|
|
5
|
+
fired, per layer, over a representative workload), is the routing CONCENTRATED enough that a small
|
|
6
|
+
hot-expert VRAM cache would hit often — or is it so uniform that the cache (llama.cpp #20757) isn't
|
|
7
|
+
worth building?
|
|
8
|
+
|
|
9
|
+
It is the de-risk gate for ADR-0001 (`docs/decisions/0001-per-expert-cache-build-vs-upstream.md`):
|
|
10
|
+
build the runtime expert cache only where the trace shows a small fraction of experts captures most
|
|
11
|
+
of the routing.
|
|
12
|
+
|
|
13
|
+
Grounded in docker-knowledge wave-4 (moe-placement):
|
|
14
|
+
- Per-LAYER *total* activation is ~uniform — every token hits every layer's top-k experts — so the
|
|
15
|
+
signal is PER-EXPERT concentration WITHIN a layer, not which layer. Only the runtime cache can
|
|
16
|
+
exploit per-expert skew; `-ot` is per-layer (llamacpp-experts-fused-per-layer-not-per-expert).
|
|
17
|
+
- Skew is request-level and flattens to uniform across diverse prompts (MoE-Infinity,
|
|
18
|
+
arXiv:2401.14361) — so concentration is WORKLOAD-DEPENDENT: a trace is only valid for the
|
|
19
|
+
workload it was cut from. The report says so; a diverse-prompt trace reads LESS concentrated.
|
|
20
|
+
- The trace is an L×E count matrix captured via an eval-callback (activation-trace-via-eval-callback);
|
|
21
|
+
THIS module only CONSUMES it. None-not-guess: no trace -> no verdict (the planner stays per-layer).
|
|
22
|
+
|
|
23
|
+
Two measures, deliberately:
|
|
24
|
+
- `hot_frac_for_coverage` — the fraction of a layer's experts that must be resident to capture
|
|
25
|
+
`coverage_target` (default 90%) of its routing. The ACTIONABLE number; maps straight to #20757
|
|
26
|
+
`--moe-expert-cache-size`.
|
|
27
|
+
- `concentration_score = 1 - normalized_entropy` — a threshold-free [0,1] skew measure
|
|
28
|
+
(0 = uniform, 1 = one expert), robust to the arbitrary coverage target.
|
|
29
|
+
|
|
30
|
+
`cache_helps` is a convenience gate on the numbers, never a substitute for them.
|
|
31
|
+
"""
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import json
|
|
35
|
+
import math
|
|
36
|
+
from dataclasses import asdict, dataclass, field
|
|
37
|
+
from statistics import median
|
|
38
|
+
from typing import List, Optional
|
|
39
|
+
|
|
40
|
+
# Default share of a layer's routing mass that the resident (hottest-first) experts must capture;
# used as the default `coverage_target` of `analyze_concentration`.
DEFAULT_COVERAGE_TARGET = 0.90
# If fewer than this fraction of a layer's experts cover `coverage_target` of its routing, a hot-expert
# cache buys real VRAM back — so a cache "helps". Tunable; the numbers are reported regardless.
DEFAULT_CACHE_HELPS_THRESHOLD = 0.50
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class LayerActivation:
    """Routing counts for a single MoE layer.

    `expert_counts[i]` is the number of tokens routed to expert `i` in this layer.
    """
    layer_index: int
    expert_counts: List[int] = field(default_factory=list)

    @property
    def total(self) -> int:
        """Sum of all expert counts in this layer (0 when there are none)."""
        counts = self.expert_counts
        return sum(counts)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class ActivationTrace:
    """An L×E activation trace captured over a representative workload (the eval-callback's output).

    Only measured facts are stored. Concentration is derived by `analyze_concentration`, so a
    verdict can always be recomputed from the counts.
    """
    model: str
    num_experts: int  # E (routed experts per MoE layer)
    experts_per_token: int  # top-k (sanity: each layer total ~= n_tokens * k)
    n_tokens: int  # decode tokens the trace covers
    layers: List[LayerActivation] = field(default_factory=list)
    gate_weighted: bool = False  # counts are gate-mass-weighted (else raw selection counts)
    created: Optional[str] = None  # ISO date (passed in; the capture harness has no clock)
    rig: Optional[str] = None
    source: Optional[str] = None  # provenance (which workload / run produced it)

    def to_dict(self) -> dict:
        """Return the trace (including nested layers) as plain dicts/lists."""
        plain = asdict(self)
        return plain

    def to_json(self, indent: int = 2) -> str:
        """Serialize `to_dict()` as JSON text, keeping non-ASCII characters as-is."""
        payload = self.to_dict()
        return json.dumps(payload, ensure_ascii=False, indent=indent)

    @classmethod
    def from_dict(cls, d: dict) -> "ActivationTrace":
        """Build a trace from a mapping, ignoring keys that are not dataclass fields.

        `layers` is always rebuilt from `d["layers"]`: non-dict entries are skipped and every
        expert count is coerced with `int()`. Missing required fields surface as the `TypeError`
        raised by the constructor.
        """
        field_names = set(cls.__dataclass_fields__)  # type: ignore[attr-defined]
        kwargs = {}
        for key, value in d.items():
            if key in field_names:
                kwargs[key] = value

        rebuilt = []
        for raw in (d.get("layers") or []):
            if not isinstance(raw, dict):
                continue
            index = raw.get("layer_index")
            counts = [int(c) for c in (raw.get("expert_counts") or [])]
            rebuilt.append(LayerActivation(layer_index=index, expert_counts=counts))
        kwargs["layers"] = rebuilt
        return cls(**kwargs)

    @classmethod
    def from_json(cls, s: str) -> "ActivationTrace":
        """Parse JSON text and delegate to `from_dict`."""
        decoded = json.loads(s)
        return cls.from_dict(decoded)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@dataclass
class LayerConcentration:
    """Concentration measures for one MoE layer, as produced by `analyze_layer`."""
    layer_index: int
    total_mass: int  # sanity: ~= n_tokens * top_k (per-layer totals are ~uniform)
    top1_share: float  # routing share of the single hottest expert
    hot_frac_for_coverage: float  # fraction of experts needed to reach coverage_target
    concentration_score: float  # 1 - normalized entropy (0 = uniform, 1 = fully concentrated)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@dataclass
class ConcentrationReport:
    """The de-risk verdict: would a per-expert cache help, and by how much, for THIS workload.

    Built by `analyze_concentration`; the aggregate fields summarize `per_layer`.
    """
    model: str
    num_experts: int
    n_layers: int  # layers with routing mass that were analyzed
    n_tokens: int
    coverage_target: float
    threshold: float
    cache_helps: bool
    hot_frac_for_coverage: float  # median over layers — the headline cache-size number
    concentration_score: float  # mean over layers
    top1_share: float  # median over layers
    per_layer: List[LayerConcentration] = field(default_factory=list)
    basis: str = ""
    notes: List[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the report (including nested per-layer entries) as plain dicts/lists."""
        plain = asdict(self)
        return plain

    def to_json(self, indent: int = 2) -> str:
        """Serialize `to_dict()` as JSON text, keeping non-ASCII characters as-is."""
        payload = self.to_dict()
        return json.dumps(payload, ensure_ascii=False, indent=indent)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _normalized_entropy(counts: List[int]) -> float:
    """Shannon entropy of the routing distribution, normalized to [0,1] by log(E).

    1.0 = perfectly uniform (every expert equally used); 0.0 = all mass on one expert. Caller
    guarantees total > 0. A single expert (or a single active expert) is fully concentrated (0.0).

    Args:
        counts: per-expert routing counts for one layer; E is `len(counts)`, zeros included.

    Returns:
        The normalized entropy, clamped to the closed interval [0.0, 1.0].
    """
    total = sum(counts)
    nz = [c for c in counts if c > 0]
    E = len(counts)
    if E <= 1 or len(nz) <= 1:
        return 0.0
    H = -sum((c / total) * math.log(c / total) for c in nz)
    # Floating-point rounding can leave H a hair above log(E) for an exactly uniform layer, which
    # would make the caller's `1 - entropy` slightly negative. Clamp so the documented range holds.
    return min(1.0, max(0.0, H / math.log(E)))
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _hot_frac_for_coverage(counts: List[int], target: float) -> float:
    """Smallest fraction of experts, taken hottest-first, whose counts reach `target` of the mass.

    Low = concentrated (a small cache covers most routing); ~target = uniform (no cache win).

    Args:
        counts: per-expert routing counts for one layer.
        target: share of the total mass that must be covered.

    Returns:
        `k / len(counts)` for the first `k` at which the running sum reaches `target * total`;
        1.0 when `counts` is empty, the total is not positive, or the target is never reached.
    """
    n_experts = len(counts)
    mass = sum(counts)
    if n_experts <= 0 or mass <= 0:
        return 1.0

    required = target * mass
    ranked = sorted(counts, reverse=True)
    covered = 0
    resident = 0
    while resident < n_experts:
        covered += ranked[resident]
        resident += 1
        if covered >= required:
            return resident / n_experts
    return 1.0
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def analyze_layer(layer: LayerActivation, coverage_target: float) -> Optional[LayerConcentration]:
    """Compute one layer's concentration measures.

    Args:
        layer: the layer's routing counts.
        coverage_target: share of routing mass passed on to `_hot_frac_for_coverage`.

    Returns:
        A `LayerConcentration`, or None when the layer has no counts or a non-positive total
        (such layers are skipped, never guessed).
    """
    counts = layer.expert_counts
    mass = sum(counts)
    if not counts or mass <= 0:
        return None

    hottest = max(counts)
    coverage_frac = _hot_frac_for_coverage(counts, coverage_target)
    entropy = _normalized_entropy(counts)
    return LayerConcentration(
        layer_index=layer.layer_index,
        total_mass=mass,
        top1_share=hottest / mass,
        hot_frac_for_coverage=coverage_frac,
        concentration_score=1.0 - entropy,
    )
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def analyze_concentration(
    trace: ActivationTrace,
    coverage_target: float = DEFAULT_COVERAGE_TARGET,
    cache_helps_threshold: float = DEFAULT_CACHE_HELPS_THRESHOLD,
) -> ConcentrationReport:
    """Roll a trace up into the per-expert-cache de-risk verdict.

    Args:
        trace: the captured activation trace.
        coverage_target: routing share each layer's hot set must capture.
        cache_helps_threshold: `cache_helps` is True when the median hot fraction is strictly
            below this value.

    Returns:
        A `ConcentrationReport`. When no layer carries routing mass the report has `n_layers=0`,
        `cache_helps=False` and a note asking for a real trace. Otherwise the hot fraction and
        top-1 share are medians over layers and the concentration score is their mean.
    """
    per_layer = []
    for layer in trace.layers:
        analyzed = analyze_layer(layer, coverage_target)
        if analyzed is not None:
            per_layer.append(analyzed)

    if not per_layer:
        return ConcentrationReport(
            model=trace.model, num_experts=trace.num_experts, n_layers=0, n_tokens=trace.n_tokens,
            coverage_target=coverage_target, threshold=cache_helps_threshold,
            cache_helps=False, hot_frac_for_coverage=1.0, concentration_score=0.0, top1_share=0.0,
            per_layer=[], basis="no layers with routing mass — cannot assess (treated as 'cache not justified')",
            notes=["empty or zero-mass trace — capture a real workload trace before deciding"],
        )

    hot = float(median(c.hot_frac_for_coverage for c in per_layer))
    conc = sum(c.concentration_score for c in per_layer) / len(per_layer)
    top1 = float(median(c.top1_share for c in per_layer))

    notes: List[str] = []
    # Sanity: per-layer totals should be ~uniform (every token hits every layer's top-k). A large
    # spread hints at a malformed trace, a wrong experts_per_token/n_tokens, or unequal expert counts.
    masses = [c.total_mass for c in per_layer]
    heaviest = max(masses)
    lightest = min(masses)
    if heaviest and (heaviest - lightest) / heaviest > 0.2:
        notes.append("per-layer totals vary >20% — check experts_per_token/n_tokens, or layers may "
                     "carry differing expert counts")
    notes.append("concentration is WORKLOAD-DEPENDENT: request-level skew flattens across diverse "
                 "prompts (MoE-Infinity); this verdict is valid only for the workload this trace covers")

    basis = (f"{len(per_layer)} layers; hot_frac = median experts for {coverage_target:.0%} routing "
             f"coverage; concentration = 1 - normalized_entropy (mean); "
             f"cache_helps = hot_frac < {cache_helps_threshold:.0%}")

    return ConcentrationReport(
        model=trace.model,
        num_experts=trace.num_experts,
        n_layers=len(per_layer),
        n_tokens=trace.n_tokens,
        coverage_target=coverage_target,
        threshold=cache_helps_threshold,
        cache_helps=hot < cache_helps_threshold,
        hot_frac_for_coverage=hot,
        concentration_score=conc,
        top1_share=top1,
        per_layer=per_layer,
        basis=basis,
        notes=notes,
    )
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def load_trace(path) -> Optional[ActivationTrace]:
    """Load a trace JSON (the capture harness's output). Returns None on any error — never raises.

    Args:
        path: filesystem path of the trace file.

    Returns:
        The parsed `ActivationTrace`, or None when the file cannot be read, is not valid JSON,
        does not hold a JSON object at the top level, or lacks/has malformed required fields.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # A top-level list/string/number is valid JSON but not a trace; `from_dict` would fail on
        # `.items()` with an AttributeError that the handler below does not cover.
        if not isinstance(data, dict):
            return None
        return ActivationTrace.from_dict(data)
    except (OSError, ValueError, TypeError):
        return None
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""Receipt-driven recalibration — turn measured receipts into a calibrated forecast.
|
|
2
|
+
|
|
3
|
+
The planner emits a roofline CEILING (peak bandwidth, zero overhead): a true upper bound on
|
|
4
|
+
decode tok/s, but real decode runs at a fraction of it (Qwen3-30B-A3B realized 41% in-VRAM,
|
|
5
|
+
56-61% offloaded — milestone 2-3 live receipt). This module closes that static-prediction gap:
|
|
6
|
+
|
|
7
|
+
receipt -> CalibrationPoint (realized efficiency at a known shape)
|
|
8
|
+
-> CalibrationStore (a JSON dir; append-only, auditable)
|
|
9
|
+
-> CalibrationModel (efficiency = f(regime, offload-fraction), with a band)
|
|
10
|
+
-> planner emits ceiling x efficiency +/- band (the calibrated forecast)
|
|
11
|
+
|
|
12
|
+
Two regimes, because they are bound by different things (placement.py `basis`):
|
|
13
|
+
- `in_vram` (N = 0): overhead-bound — small-active MoE spends most of its time NOT moving
|
|
14
|
+
bytes, so realized efficiency is low (~41%) and roughly flat.
|
|
15
|
+
- `offload` (N > 0): CPU-RAM-bandwidth-bound — the roofline fits better (~56-61%) and tracks
|
|
16
|
+
the offload fraction N / n_moe_layers.
|
|
17
|
+
|
|
18
|
+
Sparse-data-honest by construction: we bucket by regime and interpolate within `offload` only
|
|
19
|
+
when there are >= 2 distinct offload fractions; otherwise we report the regime's central
|
|
20
|
+
efficiency. The band never narrows below +/-`default_margin` (we cannot claim more confidence
|
|
21
|
+
than the data supports), and it always contains every observed point in the regime. With NO
|
|
22
|
+
points for a regime, `estimate()` returns None and the planner falls back to the raw ceiling --
|
|
23
|
+
the honest "uncalibrated" path. This mirrors the feasibility verdict's calibration #2: the
|
|
24
|
+
+/-10% receipt is scoped to the regimes we have measured; everywhere else is ceiling + band.
|
|
25
|
+
|
|
26
|
+
The model never grades its own forecast: the points come from llama-bench (a real GPU run, a
|
|
27
|
+
DIFFERENT mechanism than the planner's closed form) -- the EXTERNAL_VERIFIER discipline.
|
|
28
|
+
"""
|
|
29
|
+
from __future__ import annotations
|
|
30
|
+
|
|
31
|
+
import json
|
|
32
|
+
import os
|
|
33
|
+
from collections import defaultdict
|
|
34
|
+
from dataclasses import asdict, dataclass
|
|
35
|
+
from statistics import median
|
|
36
|
+
from typing import Iterable, List, Optional
|
|
37
|
+
|
|
38
|
+
# Bundled seed: the measured Qwen3-30B-A3B receipts that ship with the package so a known shape
# is calibrated out-of-the-box. Lives next to this module.
_SEED_PATH = os.path.join(os.path.dirname(__file__), "calibration_seed.json")

# Default relative half-width of the efficiency band. `CalibrationModel.estimate` widens the band
# beyond this when observed points require it, but never reports a narrower one.
DEFAULT_MARGIN = 0.25  # +/-25% efficiency band (feasibility #11: heavy offload can miss 2-3x; this
                       # is the *calibrated* band, far tighter than that worst case, but never tighter
                       # than the data supports)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class CalibrationPoint:
    """A single receipt's measurement, tagged with the shape it was taken at.

    Only measured facts are stored (ceiling, measured tok/s, the N/L shape, the bandwidth
    assumptions). `regime`, `offload_fraction` and `efficiency` are computed on access, so a point
    stays auditable and re-derivable.
    """
    model: str
    n_cpu_moe: int
    n_moe_layers: int
    ceiling_tok_s: float
    measured_tok_s: float
    quant: Optional[str] = None
    cpu_bw_gbps: Optional[float] = None
    vram_bw_gbps: Optional[float] = None
    ctx_len: Optional[int] = None
    created: Optional[str] = None  # ISO date (passed in; runners have no clock)
    rig: Optional[str] = None
    source: Optional[str] = None  # provenance (which run / receipt)

    @property
    def regime(self) -> str:
        """`"in_vram"` when nothing is offloaded (`n_cpu_moe == 0`), else `"offload"`."""
        if self.n_cpu_moe == 0:
            return "in_vram"
        return "offload"

    @property
    def offload_fraction(self) -> float:
        """`n_cpu_moe / n_moe_layers`, or 0.0 when `n_moe_layers` is zero/falsy."""
        if not self.n_moe_layers:
            return 0.0
        return self.n_cpu_moe / self.n_moe_layers

    @property
    def efficiency(self) -> Optional[float]:
        """`measured_tok_s / ceiling_tok_s`, or None when the ceiling is zero/falsy."""
        if self.ceiling_tok_s:
            return self.measured_tok_s / self.ceiling_tok_s
        return None

    def to_dict(self) -> dict:
        """Return the stored fields as a plain dict (derived properties are not included)."""
        plain = asdict(self)
        return plain

    @classmethod
    def from_dict(cls, d: dict) -> "CalibrationPoint":
        """Build a point from a mapping, dropping keys that are not dataclass fields.

        Missing required fields surface as the constructor's `TypeError`.
        """
        accepted = set(cls.__dataclass_fields__)  # type: ignore[attr-defined]
        init_kwargs = {}
        for key, value in d.items():
            if key in accepted:
                init_kwargs[key] = value
        return cls(**init_kwargs)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@dataclass
class EfficiencyEstimate:
    """A calibrated efficiency (measured / ceiling) for a shape, with an honest band.

    Returned by `CalibrationModel.estimate`.
    """
    efficiency: float  # central estimate
    low: float  # band low (efficiency units)
    high: float  # band high (efficiency units, capped at 1.0 -- can't beat the ceiling)
    n_samples: int  # number of calibration points in the regime the estimate was drawn from
    regime: str
    basis: str  # human-readable provenance of the estimate
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class CalibrationStore:
    """Append-only JSON-directory persistence for calibration points.

    A point is one `.json` file (so concurrent writers never clobber each other); a file may also
    hold a LIST of points (the bundled seed is one such file). Reading tolerates both shapes and
    skips anything malformed -- a corrupt point degrades the calibration, it never crashes a plan.
    """

    def __init__(self, path: str):
        """Remember the directory; it is created lazily by the first `add`."""
        self.path = path

    def add(self, point: CalibrationPoint, filename: Optional[str] = None) -> str:
        """Persist one point as a JSON file and return the path written.

        Args:
            point: the point to store (serialized via `point.to_dict()`).
            filename: explicit file name inside the store directory. When given, the caller owns
                the name and an existing file of that name is overwritten. When omitted, a name is
                derived from `created`, `model` and `n_cpu_moe`.

        Returns:
            Path of the file that was written.

        Raises:
            OSError: the directory cannot be created or the file cannot be written.
        """
        os.makedirs(self.path, exist_ok=True)
        payload = point.to_dict()

        if filename is not None:
            dest = os.path.join(self.path, filename)
            with open(dest, "w", encoding="utf-8") as f:
                json.dump(payload, f, indent=2, ensure_ascii=False)
            return dest

        # Stable name from the shape + provenance (no clock dependency).
        stamp = (point.created or "nodate").replace(":", "-")
        safe_model = "".join(c if c.isalnum() else "-" for c in point.model)[:40]
        stem = f"{stamp}_{safe_model}_n{point.n_cpu_moe}"

        # The derived name is identical for repeated runs at the same shape and date, so opening
        # with "w" would silently replace an earlier receipt. Create exclusively ("x") and fall
        # back to a numeric suffix so the store stays append-only, even with concurrent writers.
        attempt = 0
        while True:
            name = f"{stem}.json" if attempt == 0 else f"{stem}_{attempt}.json"
            dest = os.path.join(self.path, name)
            try:
                f = open(dest, "x", encoding="utf-8")
            except FileExistsError:
                attempt += 1
                continue
            with f:
                json.dump(payload, f, indent=2, ensure_ascii=False)
            return dest

    def points(self) -> List[CalibrationPoint]:
        """Load every point from the `.json` files in the store, in sorted file-name order.

        Returns an empty list when the directory does not exist.
        """
        out: List[CalibrationPoint] = []
        if not os.path.isdir(self.path):
            return out
        for name in sorted(os.listdir(self.path)):
            if not name.endswith(".json"):
                continue
            out.extend(_load_points_file(os.path.join(self.path, name)))
        return out
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _load_points_file(path: str) -> List[CalibrationPoint]:
    """Load a JSON file holding either one point (dict) or many (list). Never raises on bad data.

    Args:
        path: path of the JSON file.

    Returns:
        The usable points in file order. An unreadable or unparsable file yields `[]`. Individual
        records are skipped when they are not dicts, lack required fields, or carry values whose
        derived properties cannot be computed.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        return []

    records = data if isinstance(data, list) else [data]
    out: List[CalibrationPoint] = []
    for rec in records:
        if not isinstance(rec, dict):
            continue
        try:
            point = CalibrationPoint.from_dict(rec)
            # `from_dict` does not type-check values, so a record such as
            # {"ceiling_tok_s": "69.02", ...} constructs fine and only fails later when the model
            # reads `efficiency`. Evaluate the derived properties here so such a record is dropped
            # at load time instead of crashing a plan.
            _ = (point.efficiency, point.offload_fraction)
        except (TypeError, ValueError):
            continue
        out.append(point)
    return out
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def load_seed_points() -> List[CalibrationPoint]:
    """The measured receipts bundled with the package (Qwen3-30B-A3B, milestone 2-3).

    Reads `_SEED_PATH` through `_load_points_file`, so a missing or unparsable seed file yields an
    empty list rather than an exception.
    """
    return _load_points_file(_SEED_PATH)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class CalibrationModel:
    """Realized-efficiency model fitted from calibration points, bucketed by regime.

    `estimate(regime, offload_fraction)` yields an `EfficiencyEstimate`, or None when the regime
    has no data (the planner then falls back to the ceiling). In the `offload` regime it
    interpolates piecewise-linearly over the offload fraction once >= 2 distinct fractions are
    known; otherwise the regime's median efficiency is used. The band is +/-`margin`, widened to
    contain every observed point in the regime, and capped at efficiency 1.0.
    """

    def __init__(self, points: Iterable[CalibrationPoint], margin: float = DEFAULT_MARGIN):
        """Keep only points whose `efficiency` is defined, and remember the minimum band."""
        usable = []
        for candidate in points:
            if candidate.efficiency is not None:
                usable.append(candidate)
        self.points = usable
        self.margin = margin

    @classmethod
    def from_seed(cls, extra: Optional[Iterable[CalibrationPoint]] = None,
                  margin: float = DEFAULT_MARGIN) -> "CalibrationModel":
        """Build a model from the bundled seed points, followed by `extra` when it is truthy."""
        combined = list(load_seed_points())
        if extra:
            combined.extend(extra)
        return cls(combined, margin=margin)

    def has_data(self) -> bool:
        """True when at least one usable point is held."""
        return len(self.points) > 0

    def estimate(self, regime: str, offload_fraction: float = 0.0) -> Optional[EfficiencyEstimate]:
        """Estimate efficiency for `regime` at `offload_fraction`; None when the regime is empty."""
        pts = [p for p in self.points if p.regime == regime]
        if not pts:
            return None
        observed = [p.efficiency for p in pts]  # type: ignore[misc] (filtered to non-None in __init__)

        # One curve point per distinct offload fraction: repeated runs at a fraction are averaged.
        buckets = defaultdict(list)
        for p in pts:
            key = round(p.offload_fraction, 4)
            buckets[key].append(p.efficiency)
        curve = sorted((frac, sum(vals) / len(vals)) for frac, vals in buckets.items())

        can_interpolate = regime == "offload" and len(curve) >= 2
        if can_interpolate:
            central = _interp(curve, offload_fraction)
            basis = (f"calibrated: piecewise-linear over {len(curve)} offload fractions "
                     f"({len(pts)} receipt(s)) at frac={offload_fraction:.2f}")
        else:
            central = float(median(observed))
            basis = f"calibrated: median of {len(pts)} '{regime}' receipt(s)"

        # band: never tighter than +/-margin, always contains every observed point in the regime
        if central:
            spread = max((abs(e - central) / central for e in observed), default=0.0)
        else:
            spread = 0.0
        rel = max(self.margin, spread)
        band_low = max(central * (1.0 - rel), 1e-4)
        band_high = min(central * (1.0 + rel), 1.0)
        return EfficiencyEstimate(
            efficiency=central,
            low=band_low,
            high=band_high,
            n_samples=len(pts),
            regime=regime,
            basis=basis,
        )
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _interp(curve: List[tuple], x: float) -> float:
    """Evaluate a sorted `[(x, y), ...]` curve at `x` by piecewise-linear interpolation.

    Values outside the measured range are clamped to the nearest endpoint's `y` (NO
    extrapolation). A zero-width segment contributes its left `y`.
    """
    leftmost = curve[0]
    rightmost = curve[-1]
    if x <= leftmost[0]:
        return leftmost[1]
    if x >= rightmost[0]:
        return rightmost[1]

    for left, right in zip(curve[:-1], curve[1:]):
        x0, y0 = left
        x1, y1 = right
        if not (x0 <= x <= x1):
            continue
        if x1 > x0:
            t = (x - x0) / (x1 - x0)
        else:
            t = 0.0
        return y0 + t * (y1 - y0)
    return rightmost[1]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"model": "Qwen3-30B-A3B",
|
|
4
|
+
"n_cpu_moe": 0,
|
|
5
|
+
"n_moe_layers": 48,
|
|
6
|
+
"ceiling_tok_s": 737.56,
|
|
7
|
+
"measured_tok_s": 302.4,
|
|
8
|
+
"quant": "gguf-q4_k_m",
|
|
9
|
+
"cpu_bw_gbps": 40.7,
|
|
10
|
+
"vram_bw_gbps": 1790.0,
|
|
11
|
+
"ctx_len": 4096,
|
|
12
|
+
"created": "2026-06-04",
|
|
13
|
+
"rig": "RTX 5090 (sm_120) WSL2 Docker, driver 610.47, CUDA 12.8",
|
|
14
|
+
"source": "milestone 2-3 live receipt: Qwen3-30B-A3B Q4_K_M, llama-bench -p 512 -n 128"
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"model": "Qwen3-30B-A3B",
|
|
18
|
+
"n_cpu_moe": 24,
|
|
19
|
+
"n_moe_layers": 48,
|
|
20
|
+
"ceiling_tok_s": 69.02,
|
|
21
|
+
"measured_tok_s": 41.9,
|
|
22
|
+
"quant": "gguf-q4_k_m",
|
|
23
|
+
"cpu_bw_gbps": 40.7,
|
|
24
|
+
"vram_bw_gbps": 1790.0,
|
|
25
|
+
"ctx_len": 4096,
|
|
26
|
+
"created": "2026-06-04",
|
|
27
|
+
"rig": "RTX 5090 (sm_120) WSL2 Docker, driver 610.47, CUDA 12.8",
|
|
28
|
+
"source": "milestone 2-3 live receipt: Qwen3-30B-A3B Q4_K_M, llama-bench -p 512 -n 128"
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"model": "Qwen3-30B-A3B",
|
|
32
|
+
"n_cpu_moe": 48,
|
|
33
|
+
"n_moe_layers": 48,
|
|
34
|
+
"ceiling_tok_s": 36.2,
|
|
35
|
+
"measured_tok_s": 20.4,
|
|
36
|
+
"quant": "gguf-q4_k_m",
|
|
37
|
+
"cpu_bw_gbps": 40.7,
|
|
38
|
+
"vram_bw_gbps": 1790.0,
|
|
39
|
+
"ctx_len": 4096,
|
|
40
|
+
"created": "2026-06-04",
|
|
41
|
+
"rig": "RTX 5090 (sm_120) WSL2 Docker, driver 610.47, CUDA 12.8",
|
|
42
|
+
"source": "milestone 2-3 live receipt: Qwen3-30B-A3B Q4_K_M, llama-bench -p 512 -n 128"
|
|
43
|
+
}
|
|
44
|
+
]
|