gpu-container 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,143 @@
1
+ """`gpu-container-receipt` — close the loop: measure a plan, write the receipt, recalibrate.
2
+
3
+ gpu-container-receipt --plan plan.json --bench bench.json \
4
+ --model-name Qwen3-30B-A3B --quant gguf-q4_k_m --calibration-dir ./calib -o receipt.json
5
+
6
+ It pairs a llama-bench run (`-o json`, via --bench file or stdin) with the plan's forecast, emits a
7
+ Receipt (realized efficiency, calibrated-forecast error, within-band), and — when --calibration-dir
8
+ is given — appends a CalibrationPoint so the NEXT plan for this shape is calibrated. That write-back
9
+ is the recalibration loop. The verifier is a real GPU run, a DIFFERENT mechanism than the planner's
10
+ closed form (EXTERNAL_VERIFIER).
11
+
12
+ Exit code (ANDON): 0 = measured cleared the floor and sat at/below the ceiling; 3 = measured fell
13
+ below the >1 tok/s floor (the plan's ship was optimistic); 4 = measured EXCEEDED the ceiling (the
14
+ bandwidth model itself is wrong — halt and fix assumptions, don't just recalibrate efficiency).
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import argparse
19
+ import json
20
+ import sys
21
+ from datetime import date
22
+ from typing import List, Optional
23
+
24
+ from ..errors import GpuContainerError, guard
25
+ from ..profiler.schema import PlacementPlan
26
+ from .calibration import CalibrationStore
27
+ from .receipt import build_receipt, parse_llama_bench, plan_to_calibration_point
28
+ from .receipt import _pick # decode/prefill row selector
29
+
30
+
31
def _read(path: Optional[str]) -> str:
    """Return the whole text behind *path*.

    ``None`` and ``"-"`` both mean "read standard input"; anything else is opened
    as a UTF-8 text file.
    """
    if path is None or path == "-":
        return sys.stdin.read()
    with open(path, "r", encoding="utf-8") as handle:
        text = handle.read()
    return text
36
+
37
+
38
def _main(argv: Optional[List[str]] = None) -> int:
    """Run the receipt CLI: pair a plan with a measurement, emit the receipt, write back calibration.

    Args:
        argv: argument list handed to argparse (``None`` lets argparse use ``sys.argv``).

    Returns:
        4 when the measured decode rate is above the receipt's ceiling, 3 when the receipt
        reports ``cleared_floor is False``, otherwise 0.

    Raises:
        GpuContainerError: when ``--bench`` yields no rows, or when no decode rate is available
            from either ``--bench`` or ``--decode-tok-s``.
    """
    # Best-effort: put both console streams in UTF-8; a stream that cannot be
    # reconfigured is simply left as it is.
    for console in (sys.stdout, sys.stderr):
        try:
            console.reconfigure(encoding="utf-8")
        except (AttributeError, ValueError):
            continue

    parser = argparse.ArgumentParser(
        prog="gpu-container-receipt",
        description="Measure a plan against a llama-bench run -> receipt + recalibration write-back.",
    )
    parser.add_argument("--debug", action="store_true",
                        help="show the full traceback on an unexpected error")
    parser.add_argument("--plan", required=True, help="plan.json from gpu-container-plan")
    parser.add_argument("--bench", help="llama-bench -o json output (file or '-' for stdin)")
    parser.add_argument("--decode-tok-s", type=float,
                        help="measured decode tok/s (instead of --bench)")
    parser.add_argument("--prefill-tok-s", type=float, help="measured prefill tok/s (optional)")
    parser.add_argument("--vram-used-mib", type=float, help="measured VRAM use (optional)")
    parser.add_argument("--model-name",
                        help="model name for the calibration point (e.g. Qwen3-30B-A3B)")
    parser.add_argument("--quant", help="quant tag for the calibration point (e.g. gguf-q4_k_m)")
    parser.add_argument("--calibration-dir",
                        help="append a CalibrationPoint here (the recalibration write-back)")
    parser.add_argument("--created", default=date.today().isoformat(),
                        help="ISO date stamp (default: today)")
    parser.add_argument("--rig", help="rig provenance for the calibration point")
    parser.add_argument("--source", help="free-text provenance (which run)")
    parser.add_argument(
        "--trace",
        help="ActivationTrace JSON — fold the per-expert routing de-risk verdict into the receipt",
    )
    parser.add_argument("--coverage", type=float, default=0.90,
                        help="routing-coverage target for --trace (default 0.90)")
    parser.add_argument("--threshold", type=float, default=0.50,
                        help="cache_helps threshold for --trace (default 0.50)")
    parser.add_argument(
        "--peaks",
        help="peak-metrics JSON from `gpu-container-watchdog run --peaks-out` — "
             "fold the run's safety envelope (peak power/host-mem/VRAM) into the receipt",
    )
    parser.add_argument("-o", "--out", help="write the receipt JSON here (default: stdout)")
    opts = parser.parse_args(argv)

    plan = PlacementPlan.from_json(_read(opts.plan))

    # Explicit --decode-tok-s / --prefill-tok-s win; --bench only fills in what is missing.
    decode, prefill = opts.decode_tok_s, opts.prefill_tok_s
    if opts.bench is not None:
        bench_rows = parse_llama_bench(_read(opts.bench))
        if not bench_rows:
            raise GpuContainerError(
                "INPUT_NO_BENCH_ROWS",
                "--bench produced no parseable llama-bench rows",
                hint="pass `llama-bench -o json` output (a file or '-' for stdin)",
            )
        if decode is None:
            decode = _pick(bench_rows, "tg")
        if prefill is None:
            prefill = _pick(bench_rows, "pp")
    if decode is None:
        raise GpuContainerError(
            "INPUT_NO_DECODE_RATE",
            "need a measured decode rate",
            hint="give --bench output with a tg (token-generation) row, or --decode-tok-s",
        )

    # --trace (optional): fold the routing-concentration verdict into the receipt.
    concentration = None
    if opts.trace:
        from .activation import analyze_concentration, load_trace
        trace = load_trace(opts.trace)
        if trace is not None:
            concentration = analyze_concentration(
                trace,
                coverage_target=opts.coverage,
                cache_helps_threshold=opts.threshold,
            )
        else:
            print(f"WARN: --trace {opts.trace} could not be loaded; omitting the routing de-risk.",
                  file=sys.stderr)

    # --peaks (optional): an unreadable or invalid file only costs the safety envelope.
    peaks = None
    if opts.peaks:
        try:
            with open(opts.peaks, "r", encoding="utf-8") as peaks_file:
                peaks = json.load(peaks_file)
        except (OSError, ValueError) as err:
            print(f"WARN: --peaks {opts.peaks} could not be loaded ({err}); omitting the safety envelope.",
                  file=sys.stderr)

    receipt = build_receipt(
        plan,
        decode_tok_s=decode,
        prefill_tok_s=prefill,
        vram_used_mib=opts.vram_used_mib,
        method=opts.source or "llama-bench",
        concentration=concentration,
        peaks=peaks,
    )

    # Write-back: needs both a calibration directory and a model name.
    if opts.calibration_dir:
        if opts.model_name:
            point = plan_to_calibration_point(
                plan,
                measured_decode_tok_s=decode,
                model_name=opts.model_name,
                quant=opts.quant,
                created=opts.created,
                rig=opts.rig,
                source=opts.source or "gpu-container-receipt",
            )
            dest = CalibrationStore(opts.calibration_dir).add(point)
            receipt.notes.append(
                f"recalibration: appended a CalibrationPoint to {dest} "
                f"(efficiency {point.efficiency * 100:.0f}% at N={point.n_cpu_moe})."
            )
        else:
            receipt.notes.append(
                "note: --calibration-dir given but no --model-name; skipped the write-back."
            )

    payload = receipt.to_json()
    if not opts.out:
        print(payload)
    else:
        with open(opts.out, "w", encoding="utf-8") as sink:
            sink.write(payload + "\n")
        print(f"wrote {opts.out}", file=sys.stderr)
    for note in receipt.notes:
        print(note, file=sys.stderr)

    # ANDON exit codes: the ceiling breach is checked before the floor.
    if receipt.ceiling_decode_tok_s and decode > receipt.ceiling_decode_tok_s:
        return 4  # measured beat the ceiling — the bandwidth model is wrong
    return 3 if receipt.cleared_floor is False else 0  # 3: below the >1 tok/s floor
136
+
137
+
138
def main(argv: Optional[List[str]] = None) -> int:
    """Console entry point: hand `_main` and *argv* to `guard` and return what it returns."""
    status = guard(_main, argv)
    return status
140
+
141
+
142
if __name__ == "__main__":
    # Run as a script: main()'s return value becomes the process exit status.
    _status = main()
    raise SystemExit(_status)
@@ -0,0 +1,24 @@
1
+ """The profiler: hardware + model profiling -> a single, JSON-serializable Profile.
2
+
3
+ - `schema` — the Profile contract every downstream component reads.
4
+ - `hardware` — detect/measure the rig (GPU, platform, bandwidth, memory).
5
+ - `model` — analyze a model (dense vs MoE, KV growth, per-expert bytes).
6
+ - `cli` — `gpu-container-profile` -> writes profile.json.
7
+
8
+ Design rule (from docker-knowledge wave-1): a measurement we have NOT taken is `None`,
9
+ never a guessed number. The planner must treat `None` as "unknown — refuse to assume",
10
+ because honest refusal depends on honest inputs.
11
+ """
12
+ from .schema import ( # noqa: F401
13
+ SCHEMA_VERSION,
14
+ BandwidthInfo,
15
+ ExpertInfo,
16
+ GpuInfo,
17
+ HardwareProfile,
18
+ MemoryInfo,
19
+ ModelProfile,
20
+ PlacementPlan,
21
+ PlatformInfo,
22
+ Profile,
23
+ Receipt,
24
+ )
@@ -0,0 +1,122 @@
1
+ """Close the loop: write a measured profile's readouts back into the docker-knowledge KB.
2
+
3
+ The KB's `measurements` table is a key-value-with-provenance store (metric, value, unit,
4
+ context, tool, source_file, wave_id, measured_date); `v_baseline` is the read view over it.
5
+ `--emit-baseline` takes a profile.json produced INSIDE the container and records each
6
+ measured number as a row, plus drops the full profile under `baselines/<stem>.json` so the
7
+ `source_file` provenance points at a real artifact. Idempotent: re-emitting the same
8
+ `source_file` replaces its rows rather than duplicating them.
9
+
10
+ This runs on the HOST (where the KB lives), reading a profile that was measured in-container
11
+ — measurement and persistence are decoupled on purpose.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import os
17
+ import sqlite3
18
+ from typing import List, Optional, Tuple
19
+
20
+ from .schema import Profile
21
+
22
# Default `wave_number` for emitted baselines: the wave that measured rows are
# associated with (the hw-measurement methodology wave).
_DEFAULT_WAVE_NUMBER = 2
24
+
25
+
26
def _slug(s: Optional[str]) -> str:
    """Turn a free-text name into a lowercase, dash-separated filename stem.

    Every non-alphanumeric character is replaced by ``-`` and leading/trailing
    dashes are trimmed. Output for any non-empty string is unchanged from the
    previous implementation, so stems derived from it stay stable.

    Args:
        s: the name to slugify; ``None`` is accepted and treated like an empty
            string (calling ``.lower()`` on it used to raise ``AttributeError``).

    Returns:
        The slug, or ``"rig"`` when nothing usable remains.
    """
    if not s:
        return "rig"
    dashed = "".join(c if c.isalnum() else "-" for c in s.lower())
    return dashed.strip("-") or "rig"
28
+
29
+
30
def _context(profile: Profile, override: Optional[str]) -> str:
    """Return the measurement-context label.

    A truthy *override* is returned verbatim. Otherwise the label is
    ``"in-container"`` or ``"host"`` (from ``platform.in_container``), with
    ``" wsl2"`` appended when ``platform.wsl2`` is truthy.
    """
    if override:
        return override
    platform = profile.hardware.platform
    vantage = "in-container" if platform.in_container else "host"
    return f"{vantage} wsl2" if platform.wsl2 else vantage
38
+
39
+
40
def _rows(profile: Profile) -> List[Tuple[str, float, str, str, str]]:
    """Collect a (metric, value, unit, tool, note) tuple for each readout that is not None.

    Values are converted with ``float``; readouts that are ``None`` are left out,
    and the remaining rows keep the fixed order of the table below.
    """
    hardware = profile.hardware
    bandwidth, gpu, memory = hardware.bandwidth, hardware.gpu, hardware.memory

    # Provenance notes are assembled from the bandwidth `details` mapping; missing
    # sections fall back to an empty dict, so absent keys render as "None".
    details = bandwidth.details or {}
    pcie = details.get("pcie", {}) or {}
    nvme = details.get("nvme", {}) or {}
    pcie_note = (
        f"pinned cudaMemcpy, median of {pcie.get('iters')} @ {pcie.get('buffer_mib')} MiB "
        f"({pcie.get('convention', '')})"
    )
    nvme_note = (
        f"fio direct=1 {nvme.get('ioengine')} on {nvme.get('fs_type')} @ {nvme.get('mount')}, "
        f"size {nvme.get('size_gib')}G"
    )
    pinnable_note = (memory.pinnable_method or "") + (
        " [capped=lower-bound]" if memory.pinnable_capped else ""
    )
    vram_tool = gpu.vram_source or "nvidia-smi"

    candidates = [
        ("pcie_h2d_gbps", bandwidth.pcie_h2d_gbps, "GB/s", "cudaMemcpy-bench", pcie_note),
        ("pcie_d2h_gbps", bandwidth.pcie_d2h_gbps, "GB/s", "cudaMemcpy-bench", pcie_note),
        ("nvme_seq_read_gbps", bandwidth.nvme_seq_read_gbps, "GB/s", "fio",
         nvme_note + " (optimistic ceiling)"),
        ("nvme_rand_qd1_iops", bandwidth.nvme_rand_qd1_read_iops, "IOPS", "fio",
         nvme_note + " (the honest offload metric)"),
        ("nvme_rand_qd1_mbps", bandwidth.nvme_rand_qd1_read_mbps, "MB/s", "fio", nvme_note),
        ("pinnable_ram_ceiling_gib", memory.pinnable_ceiling_gib, "GiB", "cudaHostAlloc-probe",
         pinnable_note),
        ("vram_total", gpu.vram_total_mib, "MiB", vram_tool, "device total"),
        ("vram_free", gpu.vram_free_mib, "MiB", vram_tool, "device free at profile time"),
    ]
    return [
        (metric, float(value), unit, tool, note)
        for metric, value, unit, tool, note in candidates
        if value is not None
    ]
65
+
66
+
67
def emit_baseline(
    profile: Profile,
    db_path: str,
    baselines_dir: Optional[str] = None,
    context: Optional[str] = None,
    source_stem: Optional[str] = None,
    measured_date: Optional[str] = None,
    wave_number: int = _DEFAULT_WAVE_NUMBER,
) -> dict:
    """Record the profile's measured readouts in the KB and save the profile as an artifact.

    Args:
        profile: the profile whose non-None readouts are emitted.
        db_path: path to the SQLite KB; it must already exist.
        baselines_dir: where the artifact JSON goes (default: ``baselines/`` next to the DB).
        context: context label override (default: derived from the profile's platform).
        source_stem: artifact/source_file stem (default: ``"<measured_date>-<gpu-name slug>"``).
        measured_date: date recorded on each row (default: ``profile.created``).
        wave_number: looked up in ``waves`` to obtain ``wave_id``; ``None`` is stored when
            no such wave exists.

    Returns:
        A summary dict (written, metrics, source_file, artifact, context, db, wave_number),
        or ``{"error": ...}`` when the DB file is missing or the profile has no readouts.
    """
    if not os.path.exists(db_path):
        return {"error": f"findings.db not found at {db_path}"}

    if not measured_date:
        measured_date = profile.created
    ctx = _context(profile, context)
    stem = source_stem if source_stem else f"{measured_date}-{_slug(profile.hardware.gpu.name)}"
    source_file = f"baselines/{stem}.json"
    if not baselines_dir:
        kb_dir = os.path.dirname(os.path.abspath(db_path))
        baselines_dir = os.path.join(kb_dir, "baselines")

    rows = _rows(profile)
    if not rows:
        return {"error": "no measured (non-None) readouts in profile — nothing to emit; "
                         "run the profiler in-container with benches enabled first"}

    # Step 1: save the profile itself, so `source_file` refers to a real artifact.
    os.makedirs(baselines_dir, exist_ok=True)
    artifact_path = os.path.join(baselines_dir, f"{stem}.json")
    with open(artifact_path, "w", encoding="utf-8") as artifact:
        artifact.write(profile.to_json() + "\n")

    # Step 2: replace any rows previously stored under this source_file, then insert.
    insert_sql = (
        "INSERT INTO measurements (metric, value, unit, context, tool, source_file, note, "
        "wave_id, measured_date) VALUES (?,?,?,?,?,?,?,?,?)"
    )
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        wave_row = cursor.execute(
            "SELECT id FROM waves WHERE wave_number=?", (wave_number,)
        ).fetchone()
        wave_id = None if not wave_row else wave_row[0]
        cursor.execute("DELETE FROM measurements WHERE source_file=?", (source_file,))
        params = [
            (metric, value, unit, ctx, tool, source_file, note, wave_id, measured_date)
            for (metric, value, unit, tool, note) in rows
        ]
        cursor.executemany(insert_sql, params)
        conn.commit()
    finally:
        conn.close()

    summary = {
        "written": len(rows),
        "metrics": [row[0] for row in rows],
        "source_file": source_file,
        "artifact": artifact_path,
        "context": ctx,
        "db": db_path,
        "wave_number": wave_number,
    }
    return summary
@@ -0,0 +1,151 @@
1
+ """`gpu-container-profile` — profile this rig (and optionally a model) -> profile.json.
2
+
3
+ The profile JSON is the contract the planner reads. Run it INSIDE the target container for
4
+ an honest hardware vantage (docker-knowledge `hw-measurement`); the PCIe/NVMe/pinnable
5
+ benchmarks need the CUDA runtime + fio + an ext4 bench volume that the container provides.
6
+
7
+ Two modes:
8
+ * MEASURE (default): detect + benchmark the rig now. `--no-bench` skips the benchmarks
9
+ (identity detection only); `--bench-dir` points the NVMe test at a mounted ext4 volume.
10
+ * EMIT (`--from-profile X.json --emit-baseline`): take a profile measured in-container and
11
+ record its readouts into the docker-knowledge KB (runs on the host, where the KB lives).
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import json
17
+ import os
18
+ import sys
19
+ from typing import List, Optional
20
+
21
+ from ..errors import GpuContainerError, guard
22
+ from . import baseline as baseline_mod
23
+ from . import model as model_mod
24
+ from .hardware import profile_hardware
25
+ from .schema import SCHEMA_VERSION, Profile
26
+
27
# Default for --baseline-db: $GPU_CONTAINER_KB_DB when it is set and non-empty,
# otherwise the conventional KB location on this rig.
_DEFAULT_KB_DB = (
    os.environ.get("GPU_CONTAINER_KB_DB", "")
    or r"E:\AI\readouts\docker-knowledge\findings.db"
)
29
+
30
+
31
def _today() -> str:
    """Return today's date, read from the host clock, as an ISO ``YYYY-MM-DD`` string."""
    import datetime  # kept local to this helper

    return datetime.date.today().isoformat()
34
+
35
+
36
def _build_notes(prof: Profile) -> List[str]:
    """Build the caveat notes for a profile.

    Up to three notes, in this order: unmeasured bandwidth (PCIe H2D or NVMe QD1 IOPS is
    None), UVM oversubscription reported as ``False``, and — when the pinnable host-RAM
    ceiling was measured — either the "small" (< 1.0 GiB) or the "ample" wording.
    """
    hardware = prof.hardware
    bandwidth = hardware.bandwidth
    collected: List[str] = []

    if any(v is None for v in (bandwidth.pcie_h2d_gbps, bandwidth.nvme_rand_qd1_read_iops)):
        collected.append(
            "bandwidth partially/un-measured: the planner MUST treat None as unknown, "
            "never zero or spec-sheet. See bandwidth.method/details for why."
        )

    # Only an explicit False triggers this note; None (unknown) does not.
    if hardware.platform.uvm_oversubscription is False:
        collected.append(
            "UVM oversubscription unavailable on this platform -> explicit placement only "
            "(docker-knowledge container-runtime)."
        )

    memory = hardware.memory
    ceiling = memory.pinnable_ceiling_gib
    if ceiling is None:
        return collected

    if ceiling < 1.0:
        collected.append(
            f"pinnable host-RAM ceiling measured at {ceiling} GiB — small (the historical WSL2 "
            "collapse); tightly caps the warm-tier KV/prefetch staging budget."
        )
        return collected

    # A capped probe only establishes a lower bound, so the wording says ">=" instead of "~".
    if memory.pinnable_capped:
        bound, extra = ">=", " (probe safety-capped; true ceiling may be higher)"
    else:
        bound, extra = "~", ""
    collected.append(
        f"pinnable host-RAM ceiling MEASURED at {bound}{ceiling} GiB{extra} — ample warm-tier "
        f"staging budget, well above the historical WSL2 ~300-500 MB cap (driver "
        f"{hardware.gpu.driver_version} appears to lift it). Measured, not assumed."
    )
    return collected
66
+
67
+
68
def _main(argv: Optional[List[str]] = None) -> int:
    """Run the profile CLI: obtain a profile (measure or load), optionally emit it, then output it.

    Args:
        argv: argument list handed to argparse (``None`` lets argparse use ``sys.argv``).

    Returns:
        0 on success.

    Raises:
        GpuContainerError: when ``--from-profile`` or ``--model-config`` cannot be read or
            parsed, or when ``emit_baseline`` reports an error. The underlying exception is
            chained as ``__cause__`` for the two input errors.

    The date recorded by ``--emit-baseline`` is ``--date`` when given, otherwise the
    profile's own ``created`` stamp (today for a freshly measured profile). A profile
    loaded with ``--from-profile`` therefore keeps the date it was measured on.
    """
    ap = argparse.ArgumentParser(
        prog="gpu-container-profile",
        description="Profile the rig (and optionally a model) into the placement-planner contract.",
    )
    ap.add_argument("--debug", action="store_true", help="show the full traceback on an unexpected error")
    ap.add_argument("--model-config", help="path to a HuggingFace config.json to profile the model side")
    ap.add_argument("--model-name", help="override the model name")
    ap.add_argument("--quant", help="quant tag, e.g. gguf-q4_k_m")
    ap.add_argument("--date", default=None, help="ISO date stamp (default: today)")
    ap.add_argument("--no-bench", action="store_true",
                    help="skip the PCIe/NVMe/pinnable benchmarks (identity detection only)")
    ap.add_argument("--bench-dir", help="directory for the fio NVMe test (an ext4-backed mounted volume; "
                                        "default $GPU_CONTAINER_BENCH_DIR or /bench)")
    ap.add_argument("-o", "--out", help="write the profile JSON here (default: stdout)")
    # emit / close-the-loop
    ap.add_argument("--from-profile", help="load an existing profile.json instead of detecting (for --emit-baseline)")
    ap.add_argument("--emit-baseline", action="store_true",
                    help="write the profile's measured readouts into the docker-knowledge KB")
    ap.add_argument("--baseline-db", default=_DEFAULT_KB_DB, help="path to docker-knowledge findings.db")
    ap.add_argument("--baseline-context", help="override the measurement context label (e.g. the image tag)")
    args = ap.parse_args(argv)

    # Windows consoles default to cp1252; the profile JSON (ensure_ascii=False) can carry
    # non-ASCII (e.g. accented model names) — make stdout utf-8 so printing never crashes.
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except (AttributeError, ValueError):
        pass

    created = args.date or _today()

    # --- obtain the profile (load or measure) -------------------------------------------
    if args.from_profile:
        try:
            with open(args.from_profile, "r", encoding="utf-8") as f:
                prof = Profile.from_json(f.read())
        except (OSError, ValueError) as e:
            raise GpuContainerError("INPUT_BAD_PROFILE", f"could not read {args.from_profile}",
                                    hint="expected a profile.json from a prior `gpu-container-profile` run",
                                    cause=str(e)) from e
    else:
        hw = profile_hardware(created, run_benches=not args.no_bench, bench_dir=args.bench_dir)
        mp = None
        if args.model_config:
            try:
                with open(args.model_config, "r", encoding="utf-8") as f:
                    cfg = json.load(f)
            except (OSError, ValueError) as e:
                raise GpuContainerError("INPUT_BAD_MODEL_CONFIG", f"could not read {args.model_config}",
                                        hint="expected a HuggingFace config.json", cause=str(e)) from e
            mp = model_mod.analyze_config(cfg, name=args.model_name, quant=args.quant)
        prof = Profile(schema_version=SCHEMA_VERSION, created=created, hardware=hw, model=mp, notes=[])
        prof.notes = _build_notes(prof)

    # --- emit to the KB (close the loop) ------------------------------------------------
    if args.emit_baseline:
        # Use the profile's own date unless --date overrides it. Stamping a loaded profile
        # with today's date would misdate its rows and, because the default source_file
        # stem embeds the date, make a later re-emit add rows instead of replacing them.
        # `created` is only the last resort for a profile that carries no date at all.
        measured_date = args.date or prof.created or created
        summary = baseline_mod.emit_baseline(
            prof, db_path=args.baseline_db, context=args.baseline_context, measured_date=measured_date,
        )
        if "error" in summary:
            raise GpuContainerError("RUNTIME_EMIT_BASELINE_FAILED", str(summary["error"]),
                                    hint="check --baseline-db path and that the docker-knowledge KB is reachable")
        print(f"emit-baseline: wrote {summary['written']} rows "
              f"({', '.join(summary['metrics'])}) -> {summary['source_file']} "
              f"[context: {summary['context']}] in {summary['db']}", file=sys.stderr)

    # --- write/print the profile JSON ---------------------------------------------------
    js = prof.to_json()
    if args.out:
        with open(args.out, "w", encoding="utf-8") as f:
            f.write(js + "\n")
        print(f"wrote {args.out}", file=sys.stderr)
    else:
        print(js)
    return 0
144
+
145
+
146
def main(argv: Optional[List[str]] = None) -> int:
    """Console entry point: hand `_main` and *argv* to `guard` and return what it returns."""
    status = guard(_main, argv)
    return status
148
+
149
+
150
if __name__ == "__main__":
    # Run as a script: main()'s return value becomes the process exit status.
    _status = main()
    raise SystemExit(_status)