PyPI - gpu-container - Versions diffs - 0.1.0__py3-none-any.whl - Mend

gpu-container 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

gpu_container/__init__.py +9 -0
gpu_container/__main__.py +60 -0
gpu_container/errors.py +72 -0
gpu_container/planner/__init__.py +17 -0
gpu_container/planner/activation.py +225 -0
gpu_container/planner/calibration.py +224 -0
gpu_container/planner/calibration_seed.json +44 -0
gpu_container/planner/cli.py +101 -0
gpu_container/planner/concentration_cli.py +120 -0
gpu_container/planner/placement.py +198 -0
gpu_container/planner/receipt.py +155 -0
gpu_container/planner/receipt_cli.py +143 -0
gpu_container/profiler/__init__.py +24 -0
gpu_container/profiler/baseline.py +122 -0
gpu_container/profiler/cli.py +151 -0
gpu_container/profiler/cuda_bench.py +306 -0
gpu_container/profiler/hardware.py +304 -0
gpu_container/profiler/model.py +178 -0
gpu_container/profiler/nvme_bench.py +158 -0
gpu_container/profiler/schema.py +245 -0
gpu_container/watchdog.py +563 -0
gpu_container-0.1.0.dist-info/METADATA +100 -0
gpu_container-0.1.0.dist-info/RECORD +26 -0
gpu_container-0.1.0.dist-info/WHEEL +4 -0
gpu_container-0.1.0.dist-info/entry_points.txt +7 -0
gpu_container-0.1.0.dist-info/licenses/LICENSE +21 -0

gpu_container/planner/receipt_cli.py ADDED Viewed

@@ -0,0 +1,143 @@
+"""`gpu-container-receipt` — close the loop: measure a plan, write the receipt, recalibrate.
+    gpu-container-receipt --plan plan.json --bench bench.json \
+        --model-name Qwen3-30B-A3B --quant gguf-q4_k_m --calibration-dir ./calib -o receipt.json
+It pairs a llama-bench run (`-o json`, via --bench file or stdin) with the plan's forecast, emits a
+Receipt (realized efficiency, calibrated-forecast error, within-band), and — when --calibration-dir
+is given — appends a CalibrationPoint so the NEXT plan for this shape is calibrated. That write-back
+is the recalibration loop. The verifier is a real GPU run, a DIFFERENT mechanism than the planner's
+closed form (EXTERNAL_VERIFIER).
+Exit code (ANDON): 0 = measured cleared the floor and sat at/below the ceiling; 3 = measured fell
+below the >1 tok/s floor (the plan's ship was optimistic); 4 = measured EXCEEDED the ceiling (the
+bandwidth model itself is wrong — halt and fix assumptions, don't just recalibrate efficiency).
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from datetime import date
+from typing import List, Optional
+from ..errors import GpuContainerError, guard
+from ..profiler.schema import PlacementPlan
+from .calibration import CalibrationStore
+from .receipt import build_receipt, parse_llama_bench, plan_to_calibration_point
+from .receipt import _pick  # decode/prefill row selector
+def _read(path: Optional[str]) -> str:
+    """Return the full text of `path` (UTF-8), or of stdin when `path` is None or "-"."""
+    reads_stdin = path is None or path == "-"
+    if not reads_stdin:
+        with open(path, "r", encoding="utf-8") as handle:
+            return handle.read()
+    return sys.stdin.read()
+def _main(argv: Optional[List[str]] = None) -> int:
+    """Run `gpu-container-receipt`: pair a plan with a measured run, emit the receipt, optionally recalibrate.
+    Steps, in order: parse `argv`; load the plan via `PlacementPlan.from_json`; resolve the decode/prefill
+    rates (explicit flags win, otherwise the "tg"/"pp" rows picked from --bench); optionally load --trace and
+    --peaks (both best-effort: a load failure prints a WARN to stderr and the extra is omitted); build the
+    receipt; append a CalibrationPoint when BOTH --calibration-dir and --model-name are given; write the
+    receipt JSON to --out or stdout; echo every receipt note to stderr.
+    Returns 4 when the receipt has a truthy ceiling and the measured decode rate exceeds it, 3 when
+    `receipt.cleared_floor is False`, otherwise 0.
+    Raises GpuContainerError ("INPUT_NO_BENCH_ROWS") when --bench parses to no rows, and
+    ("INPUT_NO_DECODE_RATE") when no decode rate is available from either source.
+    """
+    # Best-effort switch of both output streams to UTF-8; a stream without reconfigure()
+    # (AttributeError) or one that rejects it (ValueError) is left untouched.
+    for _stream in (sys.stdout, sys.stderr):
+        try:
+            _stream.reconfigure(encoding="utf-8")
+        except (AttributeError, ValueError):
+            pass
+    ap = argparse.ArgumentParser(
+        prog="gpu-container-receipt",
+        description="Measure a plan against a llama-bench run -> receipt + recalibration write-back.",
+    )
+    ap.add_argument("--debug", action="store_true", help="show the full traceback on an unexpected error")
+    ap.add_argument("--plan", required=True, help="plan.json from gpu-container-plan")
+    ap.add_argument("--bench", help="llama-bench -o json output (file or '-' for stdin)")
+    ap.add_argument("--decode-tok-s", type=float, help="measured decode tok/s (instead of --bench)")
+    ap.add_argument("--prefill-tok-s", type=float, help="measured prefill tok/s (optional)")
+    ap.add_argument("--vram-used-mib", type=float, help="measured VRAM use (optional)")
+    ap.add_argument("--model-name", help="model name for the calibration point (e.g. Qwen3-30B-A3B)")
+    ap.add_argument("--quant", help="quant tag for the calibration point (e.g. gguf-q4_k_m)")
+    ap.add_argument("--calibration-dir", help="append a CalibrationPoint here (the recalibration write-back)")
+    ap.add_argument("--created", default=date.today().isoformat(), help="ISO date stamp (default: today)")
+    ap.add_argument("--rig", help="rig provenance for the calibration point")
+    ap.add_argument("--source", help="free-text provenance (which run)")
+    ap.add_argument("--trace", help="ActivationTrace JSON — fold the per-expert routing de-risk verdict into the receipt")
+    ap.add_argument("--coverage", type=float, default=0.90, help="routing-coverage target for --trace (default 0.90)")
+    ap.add_argument("--threshold", type=float, default=0.50, help="cache_helps threshold for --trace (default 0.50)")
+    ap.add_argument("--peaks", help="peak-metrics JSON from `gpu-container-watchdog run --peaks-out` — "
+                                    "fold the run's safety envelope (peak power/host-mem/VRAM) into the receipt")
+    ap.add_argument("-o", "--out", help="write the receipt JSON here (default: stdout)")
+    args = ap.parse_args(argv)
+    plan = PlacementPlan.from_json(_read(args.plan))
+    decode = args.decode_tok_s
+    prefill = args.prefill_tok_s
+    if args.bench is not None:
+        rows = parse_llama_bench(_read(args.bench))
+        if not rows:
+            raise GpuContainerError("INPUT_NO_BENCH_ROWS", "--bench produced no parseable llama-bench rows",
+                                    hint="pass `llama-bench -o json` output (a file or '-' for stdin)")
+        # Explicit --decode-tok-s / --prefill-tok-s take precedence over the rates picked from the bench rows.
+        decode = _pick(rows, "tg") if decode is None else decode
+        prefill = _pick(rows, "pp") if prefill is None else prefill
+    if decode is None:
+        raise GpuContainerError("INPUT_NO_DECODE_RATE", "need a measured decode rate",
+                                hint="give --bench output with a tg (token-generation) row, or --decode-tok-s")
+    # Optional per-expert routing de-risk: fold the concentration verdict into the receipt (--trace).
+    concentration = None
+    if args.trace:
+        # Imported lazily, only when --trace is actually used.
+        from .activation import analyze_concentration, load_trace
+        tr = load_trace(args.trace)
+        if tr is None:
+            print(f"WARN: --trace {args.trace} could not be loaded; omitting the routing de-risk.", file=sys.stderr)
+        else:
+            concentration = analyze_concentration(tr, coverage_target=args.coverage,
+                                                   cache_helps_threshold=args.threshold)
+    # Optional safety envelope: fold the supervised run's peak metrics into the receipt (--peaks).
+    peaks = None
+    if args.peaks:
+        try:
+            with open(args.peaks, "r", encoding="utf-8") as f:
+                peaks = json.load(f)
+        except (OSError, ValueError) as e:
+            # Unreadable or malformed peaks file: warn and continue with peaks=None rather than abort.
+            print(f"WARN: --peaks {args.peaks} could not be loaded ({e}); omitting the safety envelope.",
+                  file=sys.stderr)
+    receipt = build_receipt(plan, decode_tok_s=decode, prefill_tok_s=prefill,
+                            vram_used_mib=args.vram_used_mib, method=args.source or "llama-bench",
+                            concentration=concentration, peaks=peaks)
+    # The write-back: append this measurement to the calibration store so the next plan is calibrated.
+    if args.calibration_dir and args.model_name:
+        point = plan_to_calibration_point(
+            plan, measured_decode_tok_s=decode, model_name=args.model_name, quant=args.quant,
+            created=args.created, rig=args.rig, source=args.source or "gpu-container-receipt",
+        )
+        dest = CalibrationStore(args.calibration_dir).add(point)
+        receipt.notes.append(f"recalibration: appended a CalibrationPoint to {dest} "
+                             f"(efficiency {point.efficiency * 100:.0f}% at N={point.n_cpu_moe}).")
+    elif args.calibration_dir:
+        receipt.notes.append("note: --calibration-dir given but no --model-name; skipped the write-back.")
+    # Serialized AFTER the write-back so the notes appended above are part of the emitted JSON.
+    js = receipt.to_json()
+    if args.out:
+        with open(args.out, "w", encoding="utf-8") as f:
+            f.write(js + "\n")
+        print(f"wrote {args.out}", file=sys.stderr)
+    else:
+        print(js)
+    for note in receipt.notes:
+        print(note, file=sys.stderr)
+    # A falsy ceiling (None or 0) skips the ceiling check entirely.
+    if receipt.ceiling_decode_tok_s and decode > receipt.ceiling_decode_tok_s:
+        return 4  # ANDON: measured beat the ceiling — the bandwidth model is wrong
+    # `is False` on purpose: any other cleared_floor value (e.g. None) does not produce exit code 3.
+    if receipt.cleared_floor is False:
+        return 3  # below the >1 tok/s floor — the ship was optimistic
+    return 0
+def main(argv: Optional[List[str]] = None) -> int:
+    """Console entry point: hand `_main` and `argv` to `guard` and return whatever `guard` returns."""
+    exit_code = guard(_main, argv)
+    return exit_code
+if __name__ == "__main__":
+    sys.exit(main())

gpu_container/profiler/__init__.py ADDED Viewed

@@ -0,0 +1,24 @@
+"""The profiler: hardware + model profiling -> a single, JSON-serializable Profile.
+- `schema`   — the Profile contract every downstream component reads.
+- `hardware` — detect/measure the rig (GPU, platform, bandwidth, memory).
+- `model`    — analyze a model (dense vs MoE, KV growth, per-expert bytes).
+- `cli`      — `gpu-container-profile` -> writes profile.json.
+Design rule (from docker-knowledge wave-1): a measurement we have NOT taken is `None`,
+never a guessed number. The planner must treat `None` as "unknown — refuse to assume",
+because honest refusal depends on honest inputs.
+"""
+from .schema import (  # noqa: F401
+    SCHEMA_VERSION,
+    BandwidthInfo,
+    ExpertInfo,
+    GpuInfo,
+    HardwareProfile,
+    MemoryInfo,
+    ModelProfile,
+    PlacementPlan,
+    PlatformInfo,
+    Profile,
+    Receipt,
+)

gpu_container/profiler/baseline.py ADDED Viewed

@@ -0,0 +1,122 @@
+"""Close the loop: write a measured profile's readouts back into the docker-knowledge KB.
+The KB's `measurements` table is a key-value-with-provenance store (metric, value, unit,
+context, tool, source_file, wave_id, measured_date); `v_baseline` is the read view over it.
+`--emit-baseline` takes a profile.json produced INSIDE the container and records each
+measured number as a row, plus drops the full profile under `baselines/<stem>.json` so the
+`source_file` provenance points at a real artifact. Idempotent: re-emitting the same
+`source_file` replaces its rows rather than duplicating them.
+This runs on the HOST (where the KB lives), reading a profile that was measured in-container
+— measurement and persistence are decoupled on purpose.
+"""
+from __future__ import annotations
+import json
+import os
+import sqlite3
+from typing import List, Optional, Tuple
+from .schema import Profile
+# wave to associate measured baselines with (the hw-measurement methodology wave)
+# Default for emit_baseline()'s `wave_number`, which is looked up in the `waves` table to get the row id.
+_DEFAULT_WAVE_NUMBER = 2
+def _slug(s: str) -> str:
+    """Lower-case `s`, turn each non-alphanumeric character into "-", trim edge dashes; "rig" if nothing remains."""
+    dashed = "".join([ch if ch.isalnum() else "-" for ch in s.lower()])
+    trimmed = dashed.strip("-")
+    if trimmed:
+        return trimmed
+    return "rig"
+def _context(profile: Profile, override: Optional[str]) -> str:
+    """Pick the measurement-context label: a truthy `override` as-is, else one derived from the platform flags."""
+    if override:
+        return override
+    platform = profile.hardware.platform
+    label = "in-container" if platform.in_container else "host"
+    # WSL2 is appended as a second, space-separated word.
+    return f"{label} wsl2" if platform.wsl2 else label
+def _rows(profile: Profile) -> List[Tuple[str, float, str, str, str]]:
+    """Collect (metric, value, unit, tool, note) for each readout that is not None, value coerced to float."""
+    hardware = profile.hardware
+    bandwidth = hardware.bandwidth
+    gpu = hardware.gpu
+    memory = hardware.memory
+    details = bandwidth.details or {}
+    # A missing or falsy sub-dict degrades to {} so the note f-strings below can always call .get().
+    pcie = details.get("pcie", {}) or {}
+    nvme = details.get("nvme", {}) or {}
+    pcie_note = f"pinned cudaMemcpy, median of {pcie.get('iters')} @ {pcie.get('buffer_mib')} MiB ({pcie.get('convention','')})"
+    nvme_note = f"fio direct=1 {nvme.get('ioengine')} on {nvme.get('fs_type')} @ {nvme.get('mount')}, size {nvme.get('size_gib')}G"
+    pinnable_note = (memory.pinnable_method or "") + (" [capped=lower-bound]" if memory.pinnable_capped else "")
+    vram_tool = gpu.vram_source or "nvidia-smi"
+    # One candidate per readout, in emission order; entries whose value is None are dropped below.
+    candidates = [
+        ("pcie_h2d_gbps", bandwidth.pcie_h2d_gbps, "GB/s", "cudaMemcpy-bench", pcie_note),
+        ("pcie_d2h_gbps", bandwidth.pcie_d2h_gbps, "GB/s", "cudaMemcpy-bench", pcie_note),
+        ("nvme_seq_read_gbps", bandwidth.nvme_seq_read_gbps, "GB/s", "fio", nvme_note + " (optimistic ceiling)"),
+        ("nvme_rand_qd1_iops", bandwidth.nvme_rand_qd1_read_iops, "IOPS", "fio",
+         nvme_note + " (the honest offload metric)"),
+        ("nvme_rand_qd1_mbps", bandwidth.nvme_rand_qd1_read_mbps, "MB/s", "fio", nvme_note),
+        ("pinnable_ram_ceiling_gib", memory.pinnable_ceiling_gib, "GiB", "cudaHostAlloc-probe", pinnable_note),
+        ("vram_total", gpu.vram_total_mib, "MiB", vram_tool, "device total"),
+        ("vram_free", gpu.vram_free_mib, "MiB", vram_tool, "device free at profile time"),
+    ]
+    return [
+        (metric, float(value), unit, tool, note)
+        for metric, value, unit, tool, note in candidates
+        if value is not None
+    ]
+def emit_baseline(
+    profile: Profile,
+    db_path: str,
+    baselines_dir: Optional[str] = None,
+    context: Optional[str] = None,
+    source_stem: Optional[str] = None,
+    measured_date: Optional[str] = None,
+    wave_number: int = _DEFAULT_WAVE_NUMBER,
+) -> dict:
+    """Write the profile's measured rows into `measurements` and drop the artifact. Returns a summary.
+    Args:
+        profile: the profile whose non-None readouts are recorded.
+        db_path: path to the SQLite KB; it must already exist (a missing file is reported, never created).
+        baselines_dir: where the profile JSON artifact is written (default: `baselines/` next to the DB).
+        context: override for the context label (default: derived from the profile's platform flags).
+        source_stem: artifact file stem (default: `<measured_date>-<slug of the GPU name>`).
+        measured_date: date stamp for the rows (default: `profile.created`).
+        wave_number: `waves.wave_number` to attach the rows to; wave_id is NULL when no such wave exists.
+    Returns:
+        On failure, `{"error": <message>}` (DB file missing, or no measured readouts). On success, a dict
+        with `written`, `metrics`, `source_file`, `artifact`, `context`, `db` and `wave_number`.
+    SQLite and filesystem errors raised while writing are not caught here.
+    """
+    # sqlite3.connect() would silently create an empty DB, so a missing file is reported up front.
+    if not os.path.exists(db_path):
+        return {"error": f"findings.db not found at {db_path}"}
+    measured_date = measured_date or profile.created
+    ctx = _context(profile, context)
+    # A GPU name that was never detected (None) falls through to _slug's "rig" fallback instead of
+    # raising AttributeError on None.lower().
+    stem = source_stem or f"{measured_date}-{_slug(profile.hardware.gpu.name or '')}"
+    source_file = f"baselines/{stem}.json"
+    baselines_dir = baselines_dir or os.path.join(os.path.dirname(os.path.abspath(db_path)), "baselines")
+    rows = _rows(profile)
+    if not rows:
+        return {"error": "no measured (non-None) readouts in profile — nothing to emit; "
+                         "run the profiler in-container with benches enabled first"}
+    # 1) drop the profile artifact the source_file points at
+    os.makedirs(baselines_dir, exist_ok=True)
+    artifact_path = os.path.join(baselines_dir, f"{stem}.json")
+    with open(artifact_path, "w", encoding="utf-8") as f:
+        f.write(profile.to_json() + "\n")
+    # 2) write rows (idempotent by source_file)
+    conn = sqlite3.connect(db_path)
+    try:
+        cur = conn.cursor()
+        wid = cur.execute("SELECT id FROM waves WHERE wave_number=?", (wave_number,)).fetchone()
+        wave_id = wid[0] if wid else None
+        # Delete-then-insert before a single commit: re-emitting a source_file replaces its rows.
+        cur.execute("DELETE FROM measurements WHERE source_file=?", (source_file,))
+        cur.executemany(
+            "INSERT INTO measurements (metric, value, unit, context, tool, source_file, note, "
+            "wave_id, measured_date) VALUES (?,?,?,?,?,?,?,?,?)",
+            [(m, v, u, ctx, tool, source_file, note, wave_id, measured_date)
+             for (m, v, u, tool, note) in rows],
+        )
+        conn.commit()
+    finally:
+        conn.close()
+    return {
+        "written": len(rows),
+        "metrics": [r[0] for r in rows],
+        "source_file": source_file,
+        "artifact": artifact_path,
+        "context": ctx,
+        "db": db_path,
+        "wave_number": wave_number,
+    }

gpu_container/profiler/cli.py ADDED Viewed

@@ -0,0 +1,151 @@
+"""`gpu-container-profile` — profile this rig (and optionally a model) -> profile.json.
+The profile JSON is the contract the planner reads. Run it INSIDE the target container for
+an honest hardware vantage (docker-knowledge `hw-measurement`); the PCIe/NVMe/pinnable
+benchmarks need the CUDA runtime + fio + an ext4 bench volume that the container provides.
+Two modes:
+  * MEASURE (default): detect + benchmark the rig now. `--no-bench` skips the benchmarks
+    (identity detection only); `--bench-dir` points the NVMe test at a mounted ext4 volume.
+  * EMIT (`--from-profile X.json --emit-baseline`): take a profile measured in-container and
+    record its readouts into the docker-knowledge KB (runs on the host, where the KB lives).
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import sys
+from typing import List, Optional
+from ..errors import GpuContainerError, guard
+from . import baseline as baseline_mod
+from . import model as model_mod
+from .hardware import profile_hardware
+from .schema import SCHEMA_VERSION, Profile
+# Conventional KB location on this rig (overridable via --baseline-db / $GPU_CONTAINER_KB_DB).
+# Resolved once at import time: a non-empty $GPU_CONTAINER_KB_DB wins, otherwise the Windows path below.
+# The result is the default value of the --baseline-db option.
+_DEFAULT_KB_DB = os.environ.get("GPU_CONTAINER_KB_DB") or r"E:\AI\readouts\docker-knowledge\findings.db"
+def _today() -> str:
+    """Return today's date in ISO format (YYYY-MM-DD), as reported by `date.today()`."""
+    import datetime  # function-scope import; host clock is fine for an interactive CLI
+    return datetime.date.today().isoformat()
+def _build_notes(prof: Profile) -> List[str]:
+    """Derive the human-readable caveat notes for a profile: bandwidth gaps, UVM availability, pinnable ceiling."""
+    hardware = prof.hardware
+    collected: List[str] = []
+    bandwidth = hardware.bandwidth
+    bandwidth_gap = bandwidth.pcie_h2d_gbps is None or bandwidth.nvme_rand_qd1_read_iops is None
+    if bandwidth_gap:
+        collected.append(
+            "bandwidth partially/un-measured: the planner MUST treat None as unknown, "
+            "never zero or spec-sheet. See bandwidth.method/details for why."
+        )
+    # `is False` on purpose: a None value (not determined) adds no note.
+    if hardware.platform.uvm_oversubscription is False:
+        collected.append(
+            "UVM oversubscription unavailable on this platform -> explicit placement only "
+            "(docker-knowledge container-runtime)."
+        )
+    memory = hardware.memory
+    ceiling = memory.pinnable_ceiling_gib
+    if ceiling is None:
+        return collected
+    if ceiling < 1.0:
+        collected.append(
+            f"pinnable host-RAM ceiling measured at {ceiling} GiB — small (the historical WSL2 "
+            "collapse); tightly caps the warm-tier KV/prefetch staging budget."
+        )
+        return collected
+    # A capped probe only establishes a lower bound, hence ">=" plus the caveat text.
+    if memory.pinnable_capped:
+        bound, extra = ">=", " (probe safety-capped; true ceiling may be higher)"
+    else:
+        bound, extra = "~", ""
+    collected.append(
+        f"pinnable host-RAM ceiling MEASURED at {bound}{ceiling} GiB{extra} — ample warm-tier "
+        f"staging budget, well above the historical WSL2 ~300-500 MB cap (driver "
+        f"{hardware.gpu.driver_version} appears to lift it). Measured, not assumed."
+    )
+    return collected
+def _main(argv: Optional[List[str]] = None) -> int:
+    """Run `gpu-container-profile`: obtain a Profile, optionally emit it to the KB, then write it out.
+    The profile is either loaded from --from-profile or measured now (hardware always, plus the model
+    side when --model-config is given). With --emit-baseline its measured readouts are recorded in the
+    KB, stamped with the date the profile was measured: an explicit --date wins, then the profile's own
+    `created`, then today. The profile JSON finally goes to --out or stdout.
+    Returns 0 on success.
+    Raises GpuContainerError: "INPUT_BAD_PROFILE" / "INPUT_BAD_MODEL_CONFIG" when the respective file
+    cannot be read or parsed, "RUNTIME_EMIT_BASELINE_FAILED" when emit_baseline reports an error.
+    """
+    ap = argparse.ArgumentParser(
+        prog="gpu-container-profile",
+        description="Profile the rig (and optionally a model) into the placement-planner contract.",
+    )
+    ap.add_argument("--debug", action="store_true", help="show the full traceback on an unexpected error")
+    ap.add_argument("--model-config", help="path to a HuggingFace config.json to profile the model side")
+    ap.add_argument("--model-name", help="override the model name")
+    ap.add_argument("--quant", help="quant tag, e.g. gguf-q4_k_m")
+    ap.add_argument("--date", default=None, help="ISO date stamp (default: today)")
+    ap.add_argument("--no-bench", action="store_true",
+                    help="skip the PCIe/NVMe/pinnable benchmarks (identity detection only)")
+    ap.add_argument("--bench-dir", help="directory for the fio NVMe test (an ext4-backed mounted volume; "
+                                        "default $GPU_CONTAINER_BENCH_DIR or /bench)")
+    ap.add_argument("-o", "--out", help="write the profile JSON here (default: stdout)")
+    # emit / close-the-loop
+    ap.add_argument("--from-profile", help="load an existing profile.json instead of detecting (for --emit-baseline)")
+    ap.add_argument("--emit-baseline", action="store_true",
+                    help="write the profile's measured readouts into the docker-knowledge KB")
+    ap.add_argument("--baseline-db", default=_DEFAULT_KB_DB, help="path to docker-knowledge findings.db")
+    ap.add_argument("--baseline-context", help="override the measurement context label (e.g. the image tag)")
+    args = ap.parse_args(argv)
+    # Windows consoles default to cp1252; the profile JSON (ensure_ascii=False) can carry
+    # non-ASCII (e.g. accented model names), and stderr carries the emit-baseline summary with the
+    # user-supplied context/path text — make BOTH streams utf-8 so printing never crashes.
+    for _stream in (sys.stdout, sys.stderr):
+        try:
+            _stream.reconfigure(encoding="utf-8")
+        except (AttributeError, ValueError):
+            pass  # stream has no reconfigure() or rejects it: keep its current encoding
+    created = args.date or _today()
+    # --- obtain the profile (load or measure) -------------------------------------------
+    if args.from_profile:
+        try:
+            with open(args.from_profile, "r", encoding="utf-8") as f:
+                prof = Profile.from_json(f.read())
+        except (OSError, ValueError) as e:
+            raise GpuContainerError("INPUT_BAD_PROFILE", f"could not read {args.from_profile}",
+                                    hint="expected a profile.json from a prior `gpu-container-profile` run",
+                                    cause=str(e)) from e
+    else:
+        hw = profile_hardware(created, run_benches=not args.no_bench, bench_dir=args.bench_dir)
+        mp = None
+        if args.model_config:
+            try:
+                with open(args.model_config, "r", encoding="utf-8") as f:
+                    cfg = json.load(f)
+            except (OSError, ValueError) as e:
+                raise GpuContainerError("INPUT_BAD_MODEL_CONFIG", f"could not read {args.model_config}",
+                                        hint="expected a HuggingFace config.json", cause=str(e)) from e
+            mp = model_mod.analyze_config(cfg, name=args.model_name, quant=args.quant)
+        # Notes are derived from the assembled profile, so it is built with an empty list first.
+        prof = Profile(schema_version=SCHEMA_VERSION, created=created, hardware=hw, model=mp, notes=[])
+        prof.notes = _build_notes(prof)
+    # --- emit to the KB (close the loop) ------------------------------------------------
+    if args.emit_baseline:
+        # Stamp the rows with when the profile was MEASURED, not when it is emitted: a profile loaded
+        # via --from-profile keeps its own `created` unless --date overrides it. This also keeps the
+        # derived source_file stable, so re-emitting the same profile on a later day replaces its
+        # rows instead of adding a second set. For a freshly measured profile prof.created == created.
+        measured_date = args.date or prof.created or created
+        summary = baseline_mod.emit_baseline(
+            prof, db_path=args.baseline_db, context=args.baseline_context, measured_date=measured_date,
+        )
+        if "error" in summary:
+            raise GpuContainerError("RUNTIME_EMIT_BASELINE_FAILED", str(summary["error"]),
+                                    hint="check --baseline-db path and that the docker-knowledge KB is reachable")
+        print(f"emit-baseline: wrote {summary['written']} rows "
+              f"({', '.join(summary['metrics'])}) -> {summary['source_file']} "
+              f"[context: {summary['context']}] in {summary['db']}", file=sys.stderr)
+    # --- write/print the profile JSON ---------------------------------------------------
+    js = prof.to_json()
+    if args.out:
+        with open(args.out, "w", encoding="utf-8") as f:
+            f.write(js + "\n")
+        print(f"wrote {args.out}", file=sys.stderr)
+    else:
+        print(js)
+    return 0
+def main(argv: Optional[List[str]] = None) -> int:
+    """Console entry point: hand `_main` and `argv` to `guard` and return whatever `guard` returns."""
+    exit_code = guard(_main, argv)
+    return exit_code
+if __name__ == "__main__":
+    sys.exit(main())