gpu-container 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpu_container/__init__.py +9 -0
- gpu_container/__main__.py +60 -0
- gpu_container/errors.py +72 -0
- gpu_container/planner/__init__.py +17 -0
- gpu_container/planner/activation.py +225 -0
- gpu_container/planner/calibration.py +224 -0
- gpu_container/planner/calibration_seed.json +44 -0
- gpu_container/planner/cli.py +101 -0
- gpu_container/planner/concentration_cli.py +120 -0
- gpu_container/planner/placement.py +198 -0
- gpu_container/planner/receipt.py +155 -0
- gpu_container/planner/receipt_cli.py +143 -0
- gpu_container/profiler/__init__.py +24 -0
- gpu_container/profiler/baseline.py +122 -0
- gpu_container/profiler/cli.py +151 -0
- gpu_container/profiler/cuda_bench.py +306 -0
- gpu_container/profiler/hardware.py +304 -0
- gpu_container/profiler/model.py +178 -0
- gpu_container/profiler/nvme_bench.py +158 -0
- gpu_container/profiler/schema.py +245 -0
- gpu_container/watchdog.py +563 -0
- gpu_container-0.1.0.dist-info/METADATA +100 -0
- gpu_container-0.1.0.dist-info/RECORD +26 -0
- gpu_container-0.1.0.dist-info/WHEEL +4 -0
- gpu_container-0.1.0.dist-info/entry_points.txt +7 -0
- gpu_container-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""`gpu-container-plan` — turn a profile (+ model) into a llama.cpp `--n-cpu-moe` placement plan.
|
|
2
|
+
|
|
3
|
+
gpu-container-plan --profile profile.json --model-config qwen3.json --quant gguf-q4_k_m --ctx 4096
|
|
4
|
+
|
|
5
|
+
Exit code is verdict-coded (ANDON): 0 = ship, 3 = refuse. The profile.json comes from
|
|
6
|
+
`gpu-container-profile` (run in-container for honest VRAM/CPU-bandwidth inputs).
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
import json
|
|
12
|
+
import sys
|
|
13
|
+
from typing import List, Optional
|
|
14
|
+
|
|
15
|
+
from ..errors import GpuContainerError, guard
|
|
16
|
+
from ..profiler import model as model_mod
|
|
17
|
+
from ..profiler.schema import Profile
|
|
18
|
+
from .calibration import CalibrationModel, CalibrationStore
|
|
19
|
+
from .placement import plan_llama_cpp
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _main(argv: Optional[List[str]] = None) -> int:
    """Parse the CLI arguments, build a llama.cpp placement plan, and emit it as JSON.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv[1:]``.

    Returns:
        0 when the plan's verdict is ``"ship"``, 3 for any other verdict (refusal).

    Raises:
        GpuContainerError: the profile or the model config could not be read or parsed.
            The original exception is chained as ``__cause__`` so a full traceback
            keeps the root cause.
    """
    # Best-effort: switch both streams to UTF-8 because plan messages contain non-ASCII
    # characters. A stream without reconfigure() (or one that refuses) is left untouched.
    for _stream in (sys.stdout, sys.stderr):
        try:
            _stream.reconfigure(encoding="utf-8")
        except (AttributeError, ValueError):
            pass

    ap = argparse.ArgumentParser(
        prog="gpu-container-plan",
        description="Plan an MoE placement (llama.cpp --n-cpu-moe) from a rig+model profile.",
    )
    ap.add_argument("--debug", action="store_true", help="show the full traceback on an unexpected error")
    ap.add_argument("--profile", required=True, help="profile.json from gpu-container-profile")
    ap.add_argument("--model-config", help="HF config.json to (re)profile the model side into the plan")
    ap.add_argument("--model-name", help="override the model name")
    ap.add_argument("--quant", help="quant tag, e.g. gguf-q4_k_m (drives bytes/weight + footprint)")
    ap.add_argument("--ctx", type=int, default=4096, help="context length for the KV-cache budget")
    ap.add_argument("--batch", type=int, default=1)
    ap.add_argument("--cpu-bw", type=float, help="override CPU RAM bandwidth (GB/s)")
    ap.add_argument("--non-expert-bpw", type=float,
                    help="bytes/weight for always-resident weights (auto: f16 for mxfp4, else the quant bpw)")
    ap.add_argument("--floor", type=float, default=1.0, help="refuse below this decode tok/s")
    ap.add_argument("--hf", help="model ref for the launch command, e.g. unsloth/Qwen3-30B-A3B-GGUF:Q4_K_M")
    ap.add_argument("--calibration-dir", help="extra calibration receipts to fold in (atop the bundled seed)")
    ap.add_argument("--no-calibration", action="store_true",
                    help="forecast the raw roofline ceiling only (skip the calibrated band)")
    ap.add_argument("-o", "--out", help="write the plan JSON here (default: stdout)")
    args = ap.parse_args(argv)

    # Load the rig (+ optional model) profile; map I/O and parse failures to coded errors.
    try:
        with open(args.profile, "r", encoding="utf-8") as f:
            prof = Profile.from_json(f.read())
    except FileNotFoundError as e:
        raise GpuContainerError("IO_PROFILE_NOT_FOUND", f"profile not found: {args.profile}",
                                hint="run `gpu-container-profile -o profile.json` first (in-container)") from e
    except (ValueError, OSError) as e:
        raise GpuContainerError("INPUT_BAD_PROFILE", f"could not read {args.profile}",
                                hint="expected a profile.json from gpu-container-profile", cause=str(e)) from e

    if args.model_config:
        # Re-profile the model side from a HuggingFace config, replacing whatever the profile held.
        try:
            with open(args.model_config, "r", encoding="utf-8") as f:
                cfg = json.load(f)
        except (OSError, ValueError) as e:
            raise GpuContainerError("INPUT_BAD_MODEL_CONFIG", f"could not read {args.model_config}",
                                    hint="expected a HuggingFace config.json", cause=str(e)) from e
        prof.model = model_mod.analyze_config(cfg, name=args.model_name, quant=args.quant or "gguf-q4_k_m")
    elif args.quant and prof.model is not None:
        # No new config: only override the quant tag on the already-profiled model.
        prof.model.quant = args.quant

    # Calibration: bundled seed + any extra receipts, unless disabled. With no data for the shape's
    # regime the planner falls back to the raw ceiling on its own.
    calibration = None
    if not args.no_calibration:
        extra = CalibrationStore(args.calibration_dir).points() if args.calibration_dir else None
        calibration = CalibrationModel.from_seed(extra=extra)

    plan = plan_llama_cpp(
        prof, ctx_len=args.ctx, batch=args.batch,
        cpu_mem_bw_gbps=args.cpu_bw, non_expert_bpw=args.non_expert_bpw,
        floor_tok_s=args.floor, model_ref=args.hf, calibration=calibration,
    )

    # The plan JSON goes to the file or stdout; the human-readable verdict always goes to stderr
    # so stdout stays machine-parseable.
    js = plan.to_json()
    if args.out:
        with open(args.out, "w", encoding="utf-8") as f:
            f.write(js + "\n")
        print(f"wrote {args.out}", file=sys.stderr)
    else:
        print(js)
    print(plan.message, file=sys.stderr)
    return 0 if plan.verdict == "ship" else 3
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Console entry point: run `_main` through the package's `guard` wrapper.

    `argv` is forwarded unchanged; the value returned by `guard` is the exit status.
    NOTE(review): `guard` lives in `..errors` and is presumably what turns a
    GpuContainerError into an exit code — confirm there.
    """
    exit_status = guard(_main, argv)
    return exit_status


if __name__ == "__main__":
    # Executed as a script/module: hand the verdict-coded status to the interpreter.
    raise SystemExit(main())
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""`gpu-container-concentration` — the per-expert-cache de-risk gate, as a command.
|
|
2
|
+
|
|
3
|
+
Given an activation trace (which experts fired, per layer), score routing CONCENTRATION and answer
|
|
4
|
+
the prior question for the per-expert lane: would a hot-expert VRAM cache (the llama.cpp #20757 lane)
|
|
5
|
+
actually help, or is routing too uniform to bother? Backs ADR-0001; logic in `activation.py`.
|
|
6
|
+
|
|
7
|
+
gpu-container-concentration --trace trace.json
|
|
8
|
+
gpu-container-concentration --imatrix imatrix.gguf --model-name Qwen3-30B-A3B # needs the `gguf` pkg
|
|
9
|
+
|
|
10
|
+
`--imatrix` reads a `llama-imatrix` output directly (per-layer `ffn_down_exps.weight.counts`); the
|
|
11
|
+
`gguf` package is an OPTIONAL dependency — only that path needs it. `--trace` keeps the core dep-free.
|
|
12
|
+
|
|
13
|
+
Exit code (ANDON-style, scriptable):
|
|
14
|
+
0 = analyzed; a per-expert cache is NOT justified (routing too uniform — the common 'hold' outcome)
|
|
15
|
+
5 = analyzed; routing concentrates enough that a cache could help (worth weighing #20757)
|
|
16
|
+
2 = usage / input error
|
|
17
|
+
"""
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
import sys
|
|
22
|
+
from typing import List, Optional
|
|
23
|
+
|
|
24
|
+
from ..errors import GpuContainerError, guard
|
|
25
|
+
from .activation import ActivationTrace, analyze_concentration, load_trace
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _trace_from_imatrix(path: str, model_name: str, topk: int) -> ActivationTrace:
    """Build an ActivationTrace from a llama-imatrix `imatrix.gguf` (per-expert `.counts`).

    Args:
        path: Path of the imatrix GGUF file to read.
        model_name: Name recorded as the trace's ``model``.
        topk: Experts per token; used to estimate ``n_tokens`` from the first layer's
            total count (0 disables the estimate).

    Returns:
        The trace built by ``ActivationTrace.from_dict`` from every
        ``blk.<layer>...ffn_down_exps.weight.counts`` tensor, ordered by layer index.

    Raises:
        ValueError: the `gguf` package is missing (the ImportError is chained as the
            cause), or the file holds no per-expert counts tensors. The caller maps
            that to exit 2.
    """
    try:
        import gguf  # optional dependency — only the --imatrix path needs it
    except ImportError as e:
        # Chain the import failure so a broken (not merely absent) install stays diagnosable.
        raise ValueError("--imatrix needs the 'gguf' package (pip install gguf); "
                         "or extract a trace.json yourself and pass --trace.") from e
    reader = gguf.GGUFReader(path)
    counts = {}
    for t in reader.tensors:
        nm = t.name.strip()
        if nm.endswith("ffn_down_exps.weight.counts") and nm.startswith("blk."):
            # Tensor names look like `blk.<layer>.…`; the second dotted field is the layer index.
            layer = int(nm.split(".")[1])
            counts[layer] = [int(round(float(x))) for x in t.data.flatten()]
    if not counts:
        raise ValueError("no per-expert counts (ffn_down_exps.weight.counts) in this imatrix — "
                         "is it a llama-imatrix .gguf for an MoE model?")
    E = len(next(iter(counts.values())))
    layers = [{"layer_index": L, "expert_counts": counts[L]} for L in sorted(counts)]
    # Each token contributes `topk` counts per layer, so the lowest layer's total / topk ≈ tokens.
    n_tokens = (sum(layers[0]["expert_counts"]) // topk) if topk else 0
    return ActivationTrace.from_dict({
        "model": model_name, "num_experts": E, "experts_per_token": topk,
        "n_tokens": n_tokens, "layers": layers, "source": f"llama-imatrix per-expert counts: {path}",
    })
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _main(argv: Optional[List[str]] = None) -> int:
    """Score routing concentration for a trace and report whether a hot-expert cache could help.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv[1:]``.

    Returns:
        5 when the report says a cache could help, 0 when it is not justified.

    Raises:
        GpuContainerError: the imatrix could not be converted (the original exception is
            chained as ``__cause__``) or ``load_trace`` returned ``None`` for ``--trace``.
    """
    # Best-effort: switch both streams to UTF-8 because the summary line contains non-ASCII
    # characters. A stream without reconfigure() (or one that refuses) is left untouched.
    for _stream in (sys.stdout, sys.stderr):
        try:
            _stream.reconfigure(encoding="utf-8")
        except (AttributeError, ValueError):
            pass

    ap = argparse.ArgumentParser(
        prog="gpu-container-concentration",
        description="Per-expert-cache de-risk gate: does this model's routing concentrate enough to cache?",
    )
    ap.add_argument("--debug", action="store_true", help="show the full traceback on an unexpected error")
    src = ap.add_mutually_exclusive_group(required=True)
    src.add_argument("--trace", help="ActivationTrace JSON (L×E per-expert counts)")
    src.add_argument("--imatrix", help="llama-imatrix imatrix.gguf (extract per-expert counts; needs `gguf`)")
    ap.add_argument("--model-name", default="model", help="model name (for the --imatrix path / the report)")
    ap.add_argument("--topk", type=int, default=8, help="experts/token, for the --imatrix n_tokens estimate")
    ap.add_argument("--coverage", type=float, default=0.90, help="routing-mass coverage target (default 0.90)")
    ap.add_argument("--threshold", type=float, default=0.50,
                    help="cache_helps if < this fraction of experts cover the target (default 0.50)")
    ap.add_argument("-o", "--out", help="write the report JSON here (default: stdout)")
    args = ap.parse_args(argv)

    if args.imatrix:
        try:
            trace = _trace_from_imatrix(args.imatrix, args.model_name, args.topk)
        except (ValueError, OSError) as e:
            raise GpuContainerError("INPUT_BAD_IMATRIX", str(e),
                                    hint="pass --trace with an L×E counts JSON instead, "
                                         "or `pip install gguf` for the --imatrix path") from e
    else:
        trace = load_trace(args.trace)
        if trace is None:
            raise GpuContainerError("IO_TRACE_UNREADABLE", f"could not load a trace from {args.trace}",
                                    hint="expected an ActivationTrace JSON "
                                         "(model, num_experts, experts_per_token, layers[])")

    rep = analyze_concentration(trace, coverage_target=args.coverage, cache_helps_threshold=args.threshold)

    # The report JSON goes to the file or stdout; the human-readable summary goes to stderr
    # so stdout stays machine-parseable.
    js = rep.to_json()
    if args.out:
        with open(args.out, "w", encoding="utf-8") as f:
            f.write(js + "\n")
        print(f"wrote {args.out}", file=sys.stderr)
    else:
        print(js)

    # Number of experts that must stay resident to reach the coverage target.
    need = rep.hot_frac_for_coverage * rep.num_experts
    verdict = "CACHE COULD HELP" if rep.cache_helps else "cache NOT justified"
    print(f"{rep.model}: {verdict} — {need:.0f}/{rep.num_experts} experts ({rep.hot_frac_for_coverage:.0%}) "
          f"resident for {rep.coverage_target:.0%} routing coverage; concentration {rep.concentration_score:.2f}, "
          f"top expert {rep.top1_share:.1%} ({rep.n_layers} layers, ~{rep.n_tokens} tok).", file=sys.stderr)
    for n in rep.notes:
        print(f" note: {n}", file=sys.stderr)

    return 5 if rep.cache_helps else 0
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Console entry point: run `_main` through the package's `guard` wrapper.

    `argv` is forwarded unchanged; the value returned by `guard` is the exit status.
    NOTE(review): `guard` lives in `..errors` and is presumably what turns a
    GpuContainerError into the documented usage/input exit code — confirm there.
    """
    exit_status = guard(_main, argv)
    return exit_status


if __name__ == "__main__":
    # Executed as a script/module: hand the gate's exit status to the interpreter.
    raise SystemExit(main())
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
"""llama.cpp `--n-cpu-moe` placement planner.
|
|
2
|
+
|
|
3
|
+
Closed-form, deterministic, reversible (a plan is a file). The math:
|
|
4
|
+
|
|
5
|
+
VRAM_used(N) = non_expert_bytes + (per_moe_layer_expert_bytes · (L_moe − N)) + KV_bytes + overhead
|
|
6
|
+
|
|
7
|
+
`--n-cpu-moe N` moves the first N MoE layers' experts to CPU RAM, so raising N lowers VRAM use
|
|
8
|
+
(more experts off the GPU) and lowers throughput (CPU computes them at RAM bandwidth, far below
|
|
9
|
+
the GPU's). We pick the SMALLEST N that fits — maximal GPU residency, maximal speed — and refuse
|
|
10
|
+
if even N = L_moe (all experts on CPU) cannot fit the always-resident footprint.
|
|
11
|
+
|
|
12
|
+
Decode is bandwidth-bound at batch 1 (feasibility #10): per token the GPU reads its resident
|
|
13
|
+
weights + active experts at VRAM bandwidth while the CPU reads its N layers' active experts at
|
|
14
|
+
RAM bandwidth. Throughput ≈ 1 / (t_gpu + t_cpu). This is an ESTIMATE for N>0 (heavy-offload
|
|
15
|
+
prediction misses up to ~2-3× — feasibility #11), labelled and confirmed by the receipt; the
|
|
16
|
+
N=0 in-VRAM case is the ±10% regime.
|
|
17
|
+
"""
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from typing import Optional
|
|
21
|
+
|
|
22
|
+
from ..profiler import model as model_mod
|
|
23
|
+
from ..profiler.schema import PlacementPlan, Profile
|
|
24
|
+
from .calibration import CalibrationModel
|
|
25
|
+
|
|
26
|
+
# Unit helpers: VRAM/RAM figures are reported in binary MiB, bandwidths in decimal GB/s.
_MIB = 1 << 20
_GB = 1.0e9

# Labelled defaults when the profile hasn't measured them (the planner flags the assumption).
DEFAULT_CPU_BW_GBPS = 80.0    # DDR5 dual-channel, conservative (measure via the profiler to override)
DEFAULT_VRAM_BW_GBPS = 1790.0  # RTX 5090 GDDR7 512-bit (~1.79 TB/s)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def plan_llama_cpp(
    profile: Profile,
    ctx_len: int = 4096,
    batch: int = 1,
    cpu_mem_bw_gbps: Optional[float] = None,
    vram_bw_gbps: Optional[float] = None,
    non_expert_bpw: Optional[float] = None,
    overhead_mib: float = 1024.0,
    floor_tok_s: float = 1.0,
    model_ref: Optional[str] = None,
    force_n_cpu_moe: Optional[int] = None,
    calibration: Optional[CalibrationModel] = None,
) -> PlacementPlan:
    """Plan an MoE placement for llama.cpp `--n-cpu-moe`. Honest verdict, never raises.

    Args:
        profile: Rig + model profile; ``profile.model`` must be an MoE model with the
            closed-form parameter split, otherwise a refusal plan is returned.
        ctx_len: Context length used for the KV-cache budget and the `-c` flag.
        batch: Batch size passed to the model's KV-cache size estimate.
        cpu_mem_bw_gbps: CPU RAM bandwidth override (GB/s); falls back to the profile's
            measurement, then to ``DEFAULT_CPU_BW_GBPS``.
        vram_bw_gbps: VRAM bandwidth override (GB/s); falls back to ``DEFAULT_VRAM_BW_GBPS``.
        non_expert_bpw: Bytes/weight for the always-resident weights; ``None`` derives it
            from the model's quant.
        overhead_mib: Fixed VRAM overhead added on top of weights + KV cache.
        floor_tok_s: Refuse when even the roofline ceiling is below this decode rate.
        model_ref: Model reference for the suggested launch command (`-hf ...`).
        force_n_cpu_moe: Evaluate this specific N (clamped to ``[0, n_moe_layers]``)
            instead of the smallest N that fits.
        calibration: Optional calibration model used to scale the ceiling into a
            calibrated forecast with a band.

    Returns:
        A PlacementPlan with verdict ``"ship"`` or ``"refuse"``; refusals carry the reason
        in ``message``. A plan whose roofline ceiling cannot be computed (no bytes to read)
        is still returned, with an "unknown" forecast instead of a formatting error.
    """
    hw, model = profile.hardware, profile.model

    if model is None or not model.expert.is_moe:
        return PlacementPlan(fits=False, verdict="refuse",
                             message="Not an MoE model (or no model profiled) — the --n-cpu-moe lane "
                                     "is MoE-only. Dense offload is a separate (refusal-prone) lane.")
    if model.expert_params_total is None or model.non_expert_params is None or not model.n_moe_layers:
        return PlacementPlan(fits=False, verdict="refuse",
                             message="model profile lacks the closed-form param split — re-run the "
                                     "profiler with --model-config <config.json>.")
    vram_budget = hw.gpu.vram_free_mib or hw.gpu.vram_total_mib
    if not vram_budget:
        return PlacementPlan(fits=False, verdict="refuse",
                             message="VRAM unknown (profiler returned None) — cannot place honestly.")

    bpw = model_mod.bytes_per_weight(model.quant, model.dtype_bytes)
    cpu_bw = (cpu_mem_bw_gbps or getattr(hw.memory, "cpu_mem_bw_gbps", None) or DEFAULT_CPU_BW_GBPS) * _GB
    vram_bw = (vram_bw_gbps or DEFAULT_VRAM_BW_GBPS) * _GB
    cpu_bw_measured = bool(cpu_mem_bw_gbps or getattr(hw.memory, "cpu_mem_bw_gbps", None))

    # Non-expert (always-resident) weights may carry a heavier precision than the experts: MXFP4
    # quantizes experts only, leaving attention/embeddings near f16. Budget the GPU floor at that
    # precision so the plan doesn't under-count VRAM (model.non_expert_bytes_per_weight default;
    # `non_expert_bpw` overrides). Whole-model quants (Q4_K_M, ...) resolve to the same bpw.
    ne_bpw = (non_expert_bpw if non_expert_bpw is not None
              else model_mod.non_expert_bytes_per_weight(model.quant, model.dtype_bytes))
    Lm = model.n_moe_layers
    expert_total_bytes = model.expert_params_total * bpw
    non_expert_bytes = model.non_expert_params * ne_bpw
    per_layer_expert_bytes = expert_total_bytes / Lm
    kv_bytes = model.kv_bytes_at(ctx_len, batch) or 0

    def vram_used_mib(n: int) -> float:
        """VRAM (MiB) needed when the experts of `n` MoE layers live in CPU RAM."""
        gpu_expert_bytes = per_layer_expert_bytes * (Lm - n)
        return (non_expert_bytes + gpu_expert_bytes + kv_bytes) / _MIB + overhead_mib

    assumptions = {
        "cpu_mem_bw_gbps": round(cpu_bw / _GB, 1), "cpu_bw_measured": cpu_bw_measured,
        "vram_bw_gbps": round(vram_bw / _GB, 1), "bytes_per_weight": round(bpw, 3),
        "non_expert_bpw": round(ne_bpw, 3),
        "ctx_len": ctx_len, "batch": batch, "overhead_mib": overhead_mib,
        "kv_dtype": "f16", "model_total_b": model.total_params,
        "forced_n": force_n_cpu_moe,
    }

    if force_n_cpu_moe is not None:
        # explore a specific N (what-if / receipt scoring) instead of the auto-minimal one
        N = max(0, min(int(force_n_cpu_moe), Lm))
        if vram_used_mib(N) > vram_budget:
            return PlacementPlan(
                fits=False, verdict="refuse", n_cpu_moe=N, n_moe_layers=Lm,
                vram_budget_mib=round(vram_budget, 1), vram_used_mib=round(vram_used_mib(N), 1),
                assumptions=assumptions, floor_tok_s=floor_tok_s,
                message=f"REFUSE: forced --n-cpu-moe {N} still needs ~{vram_used_mib(N):.0f} MiB "
                        f"> {vram_budget:.0f} MiB free VRAM.")
    else:
        # smallest N in [0, Lm] that fits — maximal GPU residency, maximal speed
        n_fit = next((n for n in range(0, Lm + 1) if vram_used_mib(n) <= vram_budget), None)
        if n_fit is None:
            floor_vram = vram_used_mib(Lm)
            return PlacementPlan(
                fits=False, verdict="refuse", n_cpu_moe=Lm, n_moe_layers=Lm,
                vram_budget_mib=round(vram_budget, 1), vram_used_mib=round(floor_vram, 1),
                assumptions=assumptions, floor_tok_s=floor_tok_s,
                message=(f"REFUSE: even with ALL experts on CPU (--cpu-moe), the always-resident footprint "
                         f"(attention + router + embeddings + output head + {ctx_len}-tok KV) needs "
                         f"~{floor_vram:.0f} MiB > {vram_budget:.0f} MiB free VRAM. You probably expected "
                         f"{model.name} to fit; it cannot, because the non-expert weights alone exceed VRAM. "
                         f"Options: shorter --ctx, q8_0 KV cache, a smaller/more-quantized model, or more VRAM."),
            )
        N = n_fit
    ram_used_mib = per_layer_expert_bytes * N / _MIB
    ram_avail_mib = (hw.memory.ram_available_gib or hw.memory.ram_total_gib or 0) * 1024
    # Unknown host RAM (0) is treated as "no evidence against", not as a refusal.
    ram_ok = (ram_used_mib <= ram_avail_mib) if ram_avail_mib else True

    # bandwidth-bound decode estimate
    topk = model.expert.experts_per_token or 0
    active_each = (model.expert.expert_params_each or 0) * bpw
    gpu_active_expert = active_each * topk * (Lm - N)
    cpu_active_expert = active_each * topk * N
    # per-token GPU read ≈ all non-expert weights (attention/router/head) + GPU-resident active
    # experts + the KV cache (attention reads it every decode step). CPU reads its active experts.
    # This is a ROOFLINE CEILING (peak bandwidth, no kernel/launch/attention-compute overhead): a
    # true UPPER BOUND on decode tok/s. Real decode runs at a fraction of it; the receipt measures
    # the actual efficiency and closes the gap. Refusal is conservative — it fires only when even
    # this optimistic ceiling cannot clear the floor.
    t_gpu = (non_expert_bytes + gpu_active_expert + kv_bytes) / vram_bw
    t_cpu = cpu_active_expert / cpu_bw
    ceiling = (1.0 / (t_gpu + t_cpu)) if (t_gpu + t_cpu) > 0 else None
    basis = ("roofline ceiling, in-VRAM (active-weight bandwidth; real decode is a fraction of this -- "
             "small-active MoE is largely overhead-bound -- confirmed by receipt)" if N == 0
             else "roofline ceiling, CPU-offload (real well below; heavy-offload can miss 2-3x -- confirmed by receipt)")

    # Calibrated forecast (opt-in): scale the ceiling by the measured realized efficiency for this
    # regime (calibration.py). With no calibration data the forecast IS the ceiling -- the honest
    # uncalibrated path. The ceiling is retained as the upper bound AND the refusal floor below.
    calibrated, band_low, band_high, calib_n = ceiling, None, None, 0
    calib_basis = ("uncalibrated: the roofline ceiling is the upper bound -- real decode is a fraction; "
                   "run a receipt (gpu-container-receipt) to calibrate this shape")
    if calibration is not None and ceiling is not None:
        est = calibration.estimate("in_vram" if N == 0 else "offload", (N / Lm) if Lm else 0.0)
        if est is not None:
            calibrated = ceiling * est.efficiency
            band_low, band_high = ceiling * est.low, ceiling * est.high
            calib_n, calib_basis = est.n_samples, est.basis

    # Refusal floor keys on the CEILING (conservative ANDON): refuse only when even the optimistic
    # upper bound cannot clear the floor -- never refuse a model that might be usable.
    below_floor = ceiling is not None and ceiling < floor_tok_s
    verdict = "refuse" if (below_floor or not ram_ok) else "ship"
    vu = vram_used_mib(N)

    # -fa on (not bare -fa): current llama.cpp made flash-attn a tri-state (on|off|auto) and rejects
    # a value-less -fa. Confirmed against the ghcr.io/ggml-org/llama.cpp:full-cuda help (2026-06-04).
    flags = f"-ngl 99 --n-cpu-moe {N} -c {ctx_len} -fa on"
    cmd_model = f"-hf {model_ref}" if model_ref else "-m <model.gguf>"

    if verdict == "refuse" and below_floor:
        msg = (f"REFUSE: even the roofline CEILING for the best plan (--n-cpu-moe {N}) is "
               f"~{ceiling:.2f} tok/s < {floor_tok_s} floor — real decode is lower still. "
               f"You probably expected {model.name} to be usable; with {N}/{Lm} expert layers on CPU "
               f"(RAM bandwidth ~{cpu_bw/_GB:.0f} GB/s) it cannot clear the floor. "
               f"Options: a smaller model, fewer active experts, or more VRAM to keep N low.")
    elif verdict == "refuse":
        msg = (f"REFUSE: plan needs {ram_used_mib/1024:.1f} GiB CPU RAM for {N} expert layers but only "
               f"~{ram_avail_mib/1024:.1f} GiB available.")
    else:
        tier = "fully in VRAM" if N == 0 else f"{N}/{Lm} expert layers on CPU RAM"
        if calib_n:
            forecast = (f"~{calibrated:.0f} tok/s (calibrated, band [{band_low:.0f}, {band_high:.0f}]; "
                        f"roofline ceiling {ceiling:.0f}, from {calib_n} receipt(s))")
        elif ceiling is None:
            # No bytes to read per token => no roofline estimate; formatting None with :.0f
            # would raise, so say so instead of breaking the "never raises" contract.
            forecast = "unknown (no roofline estimate for this profile; run a receipt to measure)"
        else:
            forecast = f"<= {ceiling:.0f} tok/s (roofline ceiling — real is a fraction; run a receipt to calibrate)"
        msg = (f"SHIP: {model.name} {tier}. Decode {forecast}. "
               f"VRAM ~{vu:.0f}/{vram_budget:.0f} MiB, CPU-expert RAM ~{ram_used_mib/1024:.1f} GiB. "
               f"Launch: llama-cli {cmd_model} {flags}")
        if calib_n and calibrated < floor_tok_s <= ceiling:
            msg += (f" — borderline: the calibrated forecast (~{calibrated:.1f}) dips below the {floor_tok_s} "
                    f"tok/s floor though the ceiling clears it; the receipt will settle it.")

    return PlacementPlan(
        fits=True, verdict=verdict, n_cpu_moe=N, n_moe_layers=Lm, llama_flags=flags,
        vram_budget_mib=round(vram_budget, 1), vram_used_mib=round(vu, 1),
        ram_used_mib=round(ram_used_mib, 1),
        predicted_decode_tok_s=round(calibrated, 2) if calibrated else None,
        ceiling_decode_tok_s=round(ceiling, 2) if ceiling else None,
        predicted_band_low_tok_s=round(band_low, 2) if band_low else None,
        predicted_band_high_tok_s=round(band_high, 2) if band_high else None,
        throughput_basis=basis, calibration_basis=calib_basis, calibration_n_samples=calib_n,
        floor_tok_s=floor_tok_s, message=msg, assumptions=assumptions,
    )
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""The receipt — a DIFFERENT mechanism (measurement) verifying the planner's forecast.
|
|
2
|
+
|
|
3
|
+
llama-bench emits per-test rows (`pp###` = prefill, `tg###` = token-generation/decode) with an
|
|
4
|
+
`avg_ts` tokens/sec. We parse that, pair it with the plan's predicted CEILING, and record the
|
|
5
|
+
realized efficiency (measured ÷ ceiling) + whether the >1 tok/s floor actually cleared. That
|
|
6
|
+
efficiency is the calibration seed that closes the static-prediction gap (the architecture's loop).
|
|
7
|
+
The model never grades its own forecast: the generator is the planner's closed form; the verifier
|
|
8
|
+
is a real run on the GPU.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
from typing import List, Optional
|
|
14
|
+
|
|
15
|
+
from ..profiler.schema import PlacementPlan, Receipt
|
|
16
|
+
from .activation import ConcentrationReport
|
|
17
|
+
from .calibration import CalibrationPoint
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _bench_test_label(row: dict):
    """Return a row's test label: its own `test` value, else one derived from `n_prompt`/`n_gen`.

    The derived labels follow llama-bench's table naming (`pp<N>` prefill, `tg<N>` decode,
    `pp<N>+tg<M>` combined). NOTE(review): llama-bench's JSON rows are believed to carry only
    `n_prompt`/`n_gen` and no `test` key — confirm against the llama-bench version in use.
    """
    label = row.get("test")
    if label:
        return label
    n_prompt = row.get("n_prompt") or 0
    n_gen = row.get("n_gen") or 0
    if n_prompt and n_gen:
        return f"pp{n_prompt}+tg{n_gen}"
    if n_prompt:
        return f"pp{n_prompt}"
    if n_gen:
        return f"tg{n_gen}"
    return "?"


def parse_llama_bench(stdout: str) -> List[dict]:
    """Parse `llama-bench -o json` output into [{test, n_cpu_moe, n_gpu_layers, avg_ts, model}, ...].

    Args:
        stdout: Captured llama-bench output; the JSON array may be surrounded by log lines,
            including lines that themselves contain `[` or `]`.

    Returns:
        One dict per JSON object row, or ``[]`` when no JSON array of objects is found.
        ``test`` is always a label string such as ``"pp512"`` or ``"tg128"`` (``"?"`` when
        it cannot be determined), so prefix matching on ``"pp"``/``"tg"`` works.
    """
    # Try every '[' as a possible start of the array and let the decoder find where it ends.
    # Slicing from the first '[' to the last ']' breaks as soon as a log line contains a bracket.
    decoder = json.JSONDecoder()
    rows = None
    start = stdout.find("[")
    while start >= 0:
        try:
            candidate, _end = decoder.raw_decode(stdout, start)
        except ValueError:
            candidate = None
        if isinstance(candidate, list) and any(isinstance(r, dict) for r in candidate):
            rows = candidate
            break
        start = stdout.find("[", start + 1)
    if rows is None:
        return []

    return [
        {
            "test": _bench_test_label(r),
            "n_cpu_moe": r.get("n_cpu_moe"),
            "n_gpu_layers": r.get("n_gpu_layers"),
            "avg_ts": r.get("avg_ts"),
            "model": r.get("model_filename") or r.get("model_type"),
        }
        for r in rows
        if isinstance(r, dict)
    ]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _pick(rows: List[dict], prefix: str) -> Optional[float]:
    """Return `avg_ts` (as a float) of the first row whose `test` label starts with `prefix`.

    Rows with a missing/None `avg_ts` are skipped; `None` is returned when nothing matches.
    Typical prefixes are "tg" (decode) and "pp" (prefill).
    """
    hits = (
        float(row["avg_ts"])
        for row in rows
        if str(row.get("test", "")).startswith(prefix) and row.get("avg_ts") is not None
    )
    # The generator is lazy, so only the first matching row is ever converted.
    return next(hits, None)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def build_receipt(
    plan: PlacementPlan,
    decode_tok_s: Optional[float],
    prefill_tok_s: Optional[float] = None,
    vram_used_mib: Optional[float] = None,
    method: Optional[str] = None,
    concentration: Optional[ConcentrationReport] = None,
    peaks: Optional[dict] = None,
) -> Receipt:
    """Pair a measured run with the plan's forecast(s) -> a Receipt.

    The plan carries two forecasts (ceiling and calibrated), so the receipt makes three
    comparisons:
      - realized efficiency = measured / CEILING            -> the calibration seed for the next plan,
      - decode error = (measured - CALIBRATED) / CALIBRATED -> was the calibrated forecast right?,
      - within_band = measured inside the plan's calibrated band -> the loop's proof.
    `ceiling` falls back to the calibrated field for plans predating the ceiling split.

    Args:
        plan: The placement plan whose forecast is being checked.
        decode_tok_s: Measured decode rate; ``None`` means "not measured". A measured 0.0
            is a real (failed) measurement and is recorded as such, with
            ``cleared_floor=False``.
        prefill_tok_s: Measured prefill rate, if any.
        vram_used_mib: Measured VRAM use, compared against the plan's prediction.
        method: Free-form description of how the measurement was taken.
        concentration: Optional routing-concentration report to fold in.
        peaks: A dict from `gpu-container-watchdog run --peaks-out`; folds the run's SAFETY
            ENVELOPE into the receipt — peak power / host-mem / VRAM.

    Returns:
        A Receipt with the measurements, the comparisons above, and human-readable notes.
    """
    pred = plan.predicted_decode_tok_s  # calibrated forecast
    ceiling = plan.ceiling_decode_tok_s or plan.predicted_decode_tok_s  # roofline upper bound
    # None is the only "not measured" marker; 0.0 tok/s must not be mistaken for it.
    measured = decode_tok_s is not None
    # pred / ceiling keep their truthiness guard: it also protects the divisions from zero.
    err = round(100.0 * (decode_tok_s - pred) / pred, 1) if (measured and pred) else None
    eff_pct = round(100.0 * decode_tok_s / ceiling, 1) if (measured and ceiling) else None
    lo, hi = plan.predicted_band_low_tok_s, plan.predicted_band_high_tok_s
    within = (lo <= decode_tok_s <= hi) if (measured and lo is not None and hi is not None) else None

    notes: List[str] = []
    if eff_pct is not None:
        notes.append(f"realized {eff_pct:.0f}% of the roofline ceiling ({decode_tok_s:.1f} of {ceiling:.0f} tok/s) "
                     f"— this efficiency is the calibration seed for the next plan.")
        if decode_tok_s > ceiling:
            notes.append("ANDON: measured EXCEEDS the ceiling — the bandwidth model is wrong (check "
                         "vram_bw / bytes_per_weight assumptions), not just inefficient.")
    if within is not None:
        # A plan can carry a band without a point forecast; don't format None.
        forecast = f"calibrated forecast {pred:.1f} tok/s, band" if pred is not None else "calibrated band"
        notes.append(f"{forecast} [{lo:.1f}, {hi:.1f}] — measured "
                     f"{decode_tok_s:.1f} {'LANDED INSIDE' if within else 'FELL OUTSIDE'} the band "
                     f"({'loop closed' if within else 'recalibrate: ingest this receipt and refit'}).")
    if plan.vram_used_mib and vram_used_mib:
        dv = 100.0 * (vram_used_mib - plan.vram_used_mib) / plan.vram_used_mib
        notes.append(f"VRAM predicted {plan.vram_used_mib:.0f} MiB vs measured {vram_used_mib:.0f} MiB ({dv:+.0f}%).")
    if concentration is not None:
        helps = ("a per-expert cache COULD help this workload" if concentration.cache_helps
                 else "routing is near-uniform — a per-expert cache would NOT help this workload")
        notes.append(f"routing de-risk: {helps} (need {concentration.hot_frac_for_coverage:.0%} of experts for "
                     f"{concentration.coverage_target:.0%} coverage; concentration {concentration.concentration_score:.2f}, "
                     f"top expert {concentration.top1_share:.1%}).")
    if peaks:
        env = []
        if peaks.get("peak_host_mem_pct") is not None:
            env.append(f"peak host-mem {peaks['peak_host_mem_pct']:.0f}%")
        if peaks.get("peak_gpu_power_pct") is not None:
            env.append(f"peak power {peaks['peak_gpu_power_pct']:.0f}%")
        if peaks.get("peak_gpu_vram_used_mib") is not None:
            env.append(f"peak VRAM {peaks['peak_gpu_vram_used_mib'] / 1024:.1f} GiB")
        stayed = peaks.get("stayed_within_envelope")
        tail = ("stayed within the safety envelope" if stayed else
                "BREACHED the safety envelope — aborted mid-run") if stayed is not None else "envelope recorded"
        notes.append(f"safety: {', '.join(env) or 'no peaks'} over {peaks.get('samples', 0)} watchdog polls "
                     f"— {tail}.")
    return Receipt(
        runtime=plan.runtime, n_cpu_moe=plan.n_cpu_moe,
        measured_decode_tok_s=round(decode_tok_s, 2) if measured else None,
        measured_prefill_tok_s=round(prefill_tok_s, 2) if prefill_tok_s else None,
        measured_vram_used_mib=round(vram_used_mib, 1) if vram_used_mib else None,
        predicted_decode_tok_s=pred, ceiling_decode_tok_s=round(ceiling, 2) if ceiling else None,
        decode_error_pct=err, realized_efficiency_pct=eff_pct, within_band=within,
        cleared_floor=(decode_tok_s >= plan.floor_tok_s) if measured else None,
        routing_cache_helps=concentration.cache_helps if concentration else None,
        routing_hot_frac_for_coverage=round(concentration.hot_frac_for_coverage, 3) if concentration else None,
        routing_concentration=round(concentration.concentration_score, 3) if concentration else None,
        peak_gpu_power_pct=(peaks or {}).get("peak_gpu_power_pct"),
        peak_gpu_temp_c=(peaks or {}).get("peak_gpu_temp_c"),
        peak_gpu_vram_used_mib=(peaks or {}).get("peak_gpu_vram_used_mib"),
        peak_host_mem_pct=(peaks or {}).get("peak_host_mem_pct"),
        min_host_avail_mib=(peaks or {}).get("min_host_avail_mib"),
        safety_samples=(peaks or {}).get("samples"),
        stayed_within_envelope=(peaks or {}).get("stayed_within_envelope"),
        method=method, notes=notes,
    )
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def plan_to_calibration_point(
    plan: PlacementPlan,
    measured_decode_tok_s: float,
    model_name: str,
    quant: Optional[str] = None,
    created: Optional[str] = None,
    rig: Optional[str] = None,
    source: Optional[str] = None,
) -> CalibrationPoint:
    """Turn a (plan, measured decode rate) pair into a CalibrationPoint for the store.

    The ceiling is taken from the plan (falling back to its predicted rate, then 0.0), and
    the plan's bandwidth / context assumptions are copied along as provenance. Missing
    layer counts on the plan are recorded as 0.
    """
    provenance = plan.assumptions or {}
    ceiling_tok_s = plan.ceiling_decode_tok_s or plan.predicted_decode_tok_s or 0.0
    offloaded_layers = plan.n_cpu_moe or 0
    moe_layers = plan.n_moe_layers or 0
    point_fields = {
        "model": model_name,
        "quant": quant,
        "n_cpu_moe": offloaded_layers,
        "n_moe_layers": moe_layers,
        "ceiling_tok_s": ceiling_tok_s,
        "measured_tok_s": measured_decode_tok_s,
        "cpu_bw_gbps": provenance.get("cpu_mem_bw_gbps"),
        "vram_bw_gbps": provenance.get("vram_bw_gbps"),
        "ctx_len": provenance.get("ctx_len"),
        "created": created,
        "rig": rig,
        "source": source,
    }
    return CalibrationPoint(**point_fields)
|