gitm-labs 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitm/__init__.py +9 -0
- gitm/_paths.py +109 -0
- gitm/agents/__init__.py +12 -0
- gitm/agents/policy.py +48 -0
- gitm/api.py +37 -0
- gitm/bench/__init__.py +30 -0
- gitm/bench/__main__.py +10 -0
- gitm/bench/baseline.py +169 -0
- gitm/bench/cli.py +290 -0
- gitm/bench/edge_manifest.py +138 -0
- gitm/bench/manifest.py +183 -0
- gitm/bench/profile.py +299 -0
- gitm/bench/reproduce.py +120 -0
- gitm/bench/results.py +68 -0
- gitm/bench/runner.py +137 -0
- gitm/bench/schema.py +168 -0
- gitm/bench/templates/results.md.j2 +37 -0
- gitm/benchmarks/__init__.py +0 -0
- gitm/benchmarks/kitti/__init__.py +9 -0
- gitm/benchmarks/kitti/baseline.py +249 -0
- gitm/benchmarks/kitti/workunit.py +223 -0
- gitm/cli.py +120 -0
- gitm/doctor.py +29 -0
- gitm/kernels/__init__.py +13 -0
- gitm/kernels/library.py +24 -0
- gitm/kernels/library.yaml +345 -0
- gitm/kernels/spec.py +51 -0
- gitm/optimizer/__init__.py +32 -0
- gitm/optimizer/apply.py +206 -0
- gitm/optimizer/attribution.py +90 -0
- gitm/optimizer/dr.py +154 -0
- gitm/optimizer/invariants.py +45 -0
- gitm/optimizer/monitor.py +164 -0
- gitm/optimizer/multibasis.py +86 -0
- gitm/optimizer/qualification.py +77 -0
- gitm/optimizer/replay.py +59 -0
- gitm/optimizer/replay_validation.py +125 -0
- gitm/optimizer/report.py +110 -0
- gitm/optimizer/templates/report.md.j2 +41 -0
- gitm/planner/__init__.py +22 -0
- gitm/planner/graph.py +117 -0
- gitm/planner/roofline.py +96 -0
- gitm/routing/__init__.py +0 -0
- gitm/routing/scorer_v0.py +89 -0
- gitm/scheduler/__init__.py +18 -0
- gitm/scheduler/loop.py +205 -0
- gitm/telemetry/__init__.py +18 -0
- gitm/telemetry/backends/__init__.py +13 -0
- gitm/telemetry/backends/amd.py +40 -0
- gitm/telemetry/backends/base.py +27 -0
- gitm/telemetry/backends/discover.py +43 -0
- gitm/telemetry/backends/nvidia.py +141 -0
- gitm/telemetry/collector.py +85 -0
- gitm/telemetry/schema.py +78 -0
- gitm/telemetry/sinks/__init__.py +50 -0
- gitm/telemetry/sinks/jsonl.py +33 -0
- gitm/telemetry/sinks/otlp.py +54 -0
- gitm/telemetry/sinks/prometheus.py +48 -0
- gitm/telemetry/sinks/s3.py +52 -0
- gitm/tracer/__init__.py +20 -0
- gitm/tracer/_cupti/__init__.py +24 -0
- gitm/tracer/_cupti/build.py +193 -0
- gitm/tracer/_cupti/cupti_shim.c +294 -0
- gitm/tracer/_cupti_decode.py +123 -0
- gitm/tracer/capture.py +108 -0
- gitm/tracer/cupti.py +46 -0
- gitm/tracer/schema.py +80 -0
- gitm_labs-0.0.1.dist-info/METADATA +264 -0
- gitm_labs-0.0.1.dist-info/RECORD +71 -0
- gitm_labs-0.0.1.dist-info/WHEEL +4 -0
- gitm_labs-0.0.1.dist-info/entry_points.txt +2 -0
gitm/__init__.py
ADDED
gitm/_paths.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Resolve data locations.
|
|
2
|
+
|
|
3
|
+
GITM's canonical data store is **S3**. Datasets, plus the durable copy of run
|
|
4
|
+
outputs, traces, and telemetry, all live under an ``s3://`` root. Datasets are
|
|
5
|
+
far too large to hold on local disk wholesale (the AlphaFold2 DBs alone are
|
|
6
|
+
~2.2 TB), so the local filesystem is treated only as *bounded scratch*: the
|
|
7
|
+
active run's working set is staged in, used, and evicted. Nothing here ever
|
|
8
|
+
assumes a dataset lives on local disk.
|
|
9
|
+
|
|
10
|
+
Two roots:
|
|
11
|
+
|
|
12
|
+
* ``$GITM_S3_ROOT`` — ``s3://bucket/prefix``, the canonical store. Datasets at
|
|
13
|
+
``<s3_root>/datasets/<name>/``; durable run/trace/telemetry archives at
|
|
14
|
+
``<s3_root>/{runs,traces,telemetry}/``.
|
|
15
|
+
* ``$GITM_SCRATCH`` — a local *ephemeral* directory for the active run's
|
|
16
|
+
outputs and staged inputs (small: a run writes here, then the durable copy is
|
|
17
|
+
synced to S3). Defaults to ``~/.cache/gitm``. Never holds datasets at rest.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import os
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
DEFAULT_SCRATCH = "~/.cache/gitm"
|
|
27
|
+
|
|
28
|
+
# Local scratch subdirectories. Note: no ``datasets`` — datasets are never
|
|
29
|
+
# materialized wholesale on local disk; they are staged on demand into
|
|
30
|
+
# ``staging/`` from S3 for the duration of a run, then evicted.
|
|
31
|
+
_SCRATCH_SUBDIRS = ("runs", "traces", "telemetry", "staging")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def s3_root(override: str | None = None) -> str | None:
    """Return the canonical ``s3://`` root, or ``None`` if unconfigured.

    Resolution order: explicit ``override``, then ``$GITM_S3_ROOT``. The first
    candidate that is non-empty *after normalization* wins. Normalization
    strips surrounding whitespace and any trailing slashes, so the result
    never ends in ``/``.

    Returns ``None`` when no candidate yields a usable value, so callers can
    degrade gracefully (e.g. a local run with no archival). Callers that
    *require* S3 should use :func:`dataset_uri` or :func:`require_s3_root`,
    which raise with a clear message instead.
    """
    for candidate in (override, os.environ.get("GITM_S3_ROOT")):
        if not candidate:
            continue
        normalized = candidate.strip().rstrip("/")
        # A value made only of slashes/whitespace (e.g. "/") collapses to "".
        # Treat it as unset rather than returning an empty root that would
        # slip past ``is None`` checks downstream.
        if normalized:
            return normalized
    return None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def require_s3_root(override: str | None = None) -> str:
    """Resolve the S3 root via :func:`s3_root`, failing loudly when absent.

    Raises:
        RuntimeError: if :func:`s3_root` returns ``None`` for ``override``.
    """
    resolved = s3_root(override)
    if resolved is not None:
        return resolved
    raise RuntimeError(
        "No S3 root configured. GITM datasets live in S3 and are never "
        "stored on local disk. Set $GITM_S3_ROOT, e.g.\n"
        " export GITM_S3_ROOT=s3://gitm-data/prod"
    )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def dataset_uri(name: str, *, s3_root_override: str | None = None) -> str:
    """Build the canonical ``s3://`` URI of the dataset called ``name``.

    Leading and trailing slashes on ``name`` are dropped before it is appended
    under ``<s3_root>/datasets/``. For example
    ``dataset_uri("hft/hft_1b_seed42")`` ->
    ``s3://.../datasets/hft/hft_1b_seed42``.

    Raises whatever :func:`require_s3_root` raises when no root is configured.
    """
    root = require_s3_root(s3_root_override)
    relative = name.strip("/")
    return f"{root}/datasets/{relative}"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def durable_uri(kind: str, run_id: str, *, s3_root_override: str | None = None) -> str:
    """Build the canonical ``s3://`` archive URI for one run artifact.

    ``kind`` selects the durable destination and must be ``runs``, ``traces``
    or ``telemetry``. The result is ``<s3_root>/<kind>/<run_id>``.

    Raises:
        ValueError: if ``kind`` is not one of the three accepted values.
    """
    accepted = ("runs", "traces", "telemetry")
    if kind in accepted:
        base = require_s3_root(s3_root_override)
        return f"{base}/{kind}/{run_id}"
    raise ValueError(f"unknown durable artifact kind: {kind!r}")
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def scratch_root(override: str | None = None) -> Path:
    """Resolve, create, and return the local scratch directory.

    The first non-empty value of ``override``, ``$GITM_SCRATCH`` and
    ``DEFAULT_SCRATCH`` is used. ``~`` is expanded and the path is made
    absolute. The directory and every entry of ``_SCRATCH_SUBDIRS`` beneath it
    are created if missing.
    """
    chosen = override
    if not chosen:
        chosen = os.environ.get("GITM_SCRATCH")
    if not chosen:
        chosen = DEFAULT_SCRATCH

    base = Path(chosen).expanduser().resolve()
    # Create the root first, then each fixed subdirectory beneath it.
    for target in (base, *(base / name for name in _SCRATCH_SUBDIRS)):
        target.mkdir(parents=True, exist_ok=True)
    return base
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def traces_dir(override: str | None = None) -> Path:
    """Return the ``traces`` subdirectory of the resolved scratch root."""
    base = scratch_root(override)
    return base / "traces"
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def runs_dir(override: str | None = None) -> Path:
    """Return the ``runs`` subdirectory of the resolved scratch root."""
    base = scratch_root(override)
    return base / "runs"
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def telemetry_dir(override: str | None = None) -> Path:
    """Return the ``telemetry`` subdirectory of the resolved scratch root."""
    base = scratch_root(override)
    return base / "telemetry"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def staging_dir(override: str | None = None) -> Path:
    """Return the ``staging`` subdirectory of the resolved scratch root.

    This is the local landing zone for datasets staged in from S3 for the
    active run.
    """
    base = scratch_root(override)
    return base / "staging"
|
gitm/agents/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Autonomous decision policy — selects interventions, drives rollback.
|
|
2
|
+
|
|
3
|
+
The agent layer is intentionally thin: rank candidates by predicted delta
|
|
4
|
+
returned from counterfactual replay, pre-filter by safety gate, apply with
|
|
5
|
+
rollback, observe live delta, persist the chain into the provenance trail.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from gitm.agents.policy import Policy, RankedCandidate, select_interventions
|
|
11
|
+
|
|
12
|
+
__all__ = ["Policy", "RankedCandidate", "select_interventions"]
|
gitm/agents/policy.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Selection policy: pre-filter by safety, rank by predicted delta, return top-N."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Iterable
|
|
7
|
+
|
|
8
|
+
from gitm.kernels.spec import InterventionSpec
|
|
9
|
+
from gitm.optimizer.replay import predict_delta
|
|
10
|
+
from gitm.tracer.schema import Trace
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class RankedCandidate:
    """One library intervention with its ranking score and rejection status."""

    # The intervention this entry describes.
    spec: InterventionSpec
    # Score used for ranking; higher sorts earlier among accepted candidates.
    predicted_delta: float
    # ``None`` when the candidate passed the pre-filter, else the reason tag.
    rejected_reason: str | None = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class Policy:
    """Switches for the safety pre-filter applied before greedy ranking."""

    # When False, specs whose safety block requires a qualification commit
    # are rejected by ``select_interventions``.
    require_qualification_commit: bool = False
    # When True, specs in the ``high_risk`` safety tier are rejected.
    skip_high_risk: bool = False
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def select_interventions(
    trace: Trace,
    library: Iterable[InterventionSpec],
    policy: Policy,
    top_n: int = 5,
) -> list[RankedCandidate]:
    """Pre-filter ``library`` by ``policy``, score survivors, return the first ``top_n``.

    Every spec yields one :class:`RankedCandidate`. Rejected specs are not
    scored (their ``predicted_delta`` is ``0.0``) and carry a
    ``rejected_reason``. Accepted specs are scored with ``predict_delta``.

    Ordering: accepted before rejected, then descending ``predicted_delta``,
    then ``spec.name`` as a tie-break. Rejected candidates are kept in the
    list, so they can appear in the result when fewer than ``top_n`` specs
    were accepted.
    """

    def _rejection(spec: InterventionSpec) -> str | None:
        # The high-risk check comes first; the qualification check only runs
        # when the high-risk check did not reject the spec.
        if policy.skip_high_risk and spec.safety.tier == "high_risk":
            return "policy.skip_high_risk"
        if spec.safety.requires_qualification_commit and not policy.require_qualification_commit:
            return "safety.requires_qualification_commit"
        return None

    def _order(entry: RankedCandidate) -> tuple:
        return (entry.rejected_reason is not None, -entry.predicted_delta, entry.spec.name)

    ranked: list[RankedCandidate] = []
    for spec in library:
        why = _rejection(spec)
        score = 0.0 if why is not None else predict_delta(trace, spec)
        ranked.append(RankedCandidate(spec=spec, predicted_delta=score, rejected_reason=why))

    return sorted(ranked, key=_order)[:top_n]
|
gitm/api.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Public embedded API.
|
|
2
|
+
|
|
3
|
+
from gitm import optimize
|
|
4
|
+
optimize(engine, budget="24h", target=0.15)
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from gitm.scheduler import LoopConfig, run_loop
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def optimize(
    engine: Any | None = None,
    *,
    workload: str | None = None,
    budget: str = "24h",
    target: float = 0.15,
    scratch: str | None = None,
) -> dict[str, Any]:
    """Run the autonomous optimization loop and return its report.

    Supply ``engine`` (e.g. a running vLLM engine handle) for the embedded
    path, or ``workload`` (e.g. ``"vllm-decode"``) for the CLI path.
    ``budget`` and ``target`` follow the SKU contract: a verified floor of
    ``target`` fraction improvement within ``budget`` wall time, or a
    qualification-gate diagnostic explaining why the floor was not committed.

    All arguments are forwarded unchanged into a ``LoopConfig``; the return
    value is whatever ``run_loop`` returns for it.
    """
    settings = {
        "engine": engine,
        "workload": workload,
        "budget": budget,
        "target": target,
        "scratch": scratch,
    }
    return run_loop(LoopConfig(**settings))
|
gitm/bench/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Shared benchmark systems layer.
|
|
2
|
+
|
|
3
|
+
The benchmark *layer* is deliberately dumb and identical across domains (HFT,
|
|
4
|
+
biotech, edge/robotics) so the runtime layer — planner, deviation monitor,
|
|
5
|
+
causal attribution — does the real work against a heterogeneous workload mix
|
|
6
|
+
without per-benchmark plumbing. Everything domain-specific lives in a single
|
|
7
|
+
``bench.toml`` per benchmark; everything mechanical lives here and is reused.
|
|
8
|
+
|
|
9
|
+
What this package gives every benchmark pair:
|
|
10
|
+
|
|
11
|
+
* :mod:`gitm.bench.schema` — the canonical data shapes: ``BenchConfig`` (parsed
|
|
12
|
+
``bench.toml``), ``StallPhase`` (one row of the stall-breakdown table), and
|
|
13
|
+
``BaselineRun`` (the ``<name>_baseline_N.json`` contract).
|
|
14
|
+
* :mod:`gitm.bench.manifest` — streaming sha256 manifest build + verify, so any
|
|
15
|
+
holder of ``manifest.yaml`` can re-fetch byte-identical TB-scale datasets.
|
|
16
|
+
* :mod:`gitm.bench.baseline` — the two sign-off gates: three seeds agree within
|
|
17
|
+
2 % (``spread``) and GPU active % stays under the ceiling (``saturation``).
|
|
18
|
+
* :mod:`gitm.bench.profile` — the GITM profiling wrapper around nsys/rocprof +
|
|
19
|
+
py-spy/sar that produces the stall-breakdown table.
|
|
20
|
+
* :mod:`gitm.bench.edge_manifest` — the nuScenes+KITTI ``manifest.jsonl`` builder.
|
|
21
|
+
* :mod:`gitm.bench.results` — renders ``results.md``.
|
|
22
|
+
|
|
23
|
+
Driven from each ``benchmarks/<name>/Makefile`` via ``python -m gitm.bench``.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
from gitm.bench.schema import BaselineRun, BenchConfig, StallPhase
|
|
29
|
+
|
|
30
|
+
__all__ = ["BaselineRun", "BenchConfig", "StallPhase"]
|
gitm/bench/__main__.py
ADDED
gitm/bench/baseline.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""Baseline aggregation and the two sign-off gates.
|
|
2
|
+
|
|
3
|
+
A baseline is *locked* when three convergent runs (one per seed) agree on the
|
|
4
|
+
top-line metric within 2 % — the recorded baseline is their mean. Two gates
|
|
5
|
+
decide sign-off, both encoded here so every benchmark is judged identically:
|
|
6
|
+
|
|
7
|
+
* **spread gate** — ``max-min`` over ``mean`` of the three metric values must
|
|
8
|
+
be under ``spread_tolerance`` (default 2 %). Convergence is what makes the
|
|
9
|
+
number trustworthy as an optimization reference.
|
|
10
|
+
* **saturation gate** — wall-clock-weighted GPU active % must stay under
|
|
11
|
+
``gpu_active_ceiling`` (default 85 %). A saturated benchmark has no residual
|
|
12
|
+
headroom for the runtime to find, so it trips the same-day swap rule.
|
|
13
|
+
|
|
14
|
+
A third, optional check compares the recorded mean against ``baseline_target``
|
|
15
|
+
(e.g. HFT ≥ 25 M events/s) in the configured direction.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
import statistics
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
from gitm.bench.schema import BaselineRun, BenchConfig
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def load_runs(paths: list[str | Path]) -> list[BaselineRun]:
    """Parse each baseline-run JSON file in ``paths`` into a ``BaselineRun``.

    Files are decoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so a run file containing non-ASCII text loads the same
    way on every host.

    Returns the parsed runs in the order of ``paths``; an empty ``paths``
    gives an empty list. Errors from reading a file or from
    ``BaselineRun.model_validate_json`` propagate to the caller.
    """
    return [
        BaselineRun.model_validate_json(Path(p).read_text(encoding="utf-8"))
        for p in paths
    ]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class GateResult:
    """Outcome of one sign-off gate."""

    # Gate identifier, e.g. "count", "spread", "saturation", "target".
    name: str
    # True when the gate's condition held.
    passed: bool
    # Human-readable explanation of the comparison that was made.
    detail: str
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class BaselineSummary:
    """Aggregated baseline statistics plus the gate verdicts for one benchmark."""

    benchmark: str
    metric: str
    n: int  # number of runs aggregated
    mean: float
    stddev: float
    spread: float  # (max - min) / mean
    gpu_active_overall: float  # worst (max) across runs
    recorded: float  # the number we publish = mean
    gates: list[GateResult] = field(default_factory=list)
    seeds: list[int] = field(default_factory=list)

    @property
    def passed(self) -> bool:
        """True when no gate failed (also true when there are no gates)."""
        for gate in self.gates:
            if not gate.passed:
                return False
        return True

    def to_dict(self) -> dict:
        """Return a plain-dict view of the summary with gates flattened to dicts."""
        payload: dict = {}
        # Insertion order fixes the key order of the resulting dict.
        payload["benchmark"] = self.benchmark
        payload["metric"] = self.metric
        payload["n"] = self.n
        payload["seeds"] = self.seeds
        payload["recorded"] = self.recorded
        payload["mean"] = self.mean
        payload["stddev"] = self.stddev
        payload["spread"] = self.spread
        payload["gpu_active_overall"] = self.gpu_active_overall
        payload["passed"] = self.passed
        payload["gates"] = [
            dict(name=gate.name, passed=gate.passed, detail=gate.detail)
            for gate in self.gates
        ]
        return payload
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def aggregate(runs: list[BaselineRun], config: BenchConfig) -> BaselineSummary:
    """Aggregate baseline runs and evaluate the sign-off gates.

    Does not assume exactly three runs: fewer is a gate failure, more is fine,
    so a pair can iterate on two and still get a meaningful spread reading.

    Gates, in order:

    * ``count`` passes when the runs cover at least three *distinct* seeds.
      Repeats of one seed do not count as independent convergent runs.
    * ``spread`` passes when ``(max - min) / |mean|`` of the metric values is
      ``<= config.spread_tolerance``. The magnitude of the mean is used so a
      negative-valued metric cannot yield a negative spread that passes
      trivially. A zero mean gives an infinite spread.
    * ``saturation`` passes when the highest per-run ``gpu_active_overall()``
      is strictly below ``config.gpu_active_ceiling``.
    * ``target`` is added only when ``config.baseline_target`` is set. It
      compares the mean with ``>=`` when ``config.target_direction`` is
      ``"ge"`` and with ``<=`` otherwise.

    Raises:
        ValueError: if ``runs`` is empty, or a run's ``benchmark`` / ``metric``
            differs from ``config.name`` / ``config.metric``.
    """
    if not runs:
        raise ValueError("no baseline runs to aggregate")

    for r in runs:
        if r.benchmark != config.name:
            raise ValueError(
                f"run benchmark {r.benchmark!r} != config {config.name!r}"
            )
        if r.metric != config.metric:
            raise ValueError(f"run metric {r.metric!r} != config {config.metric!r}")

    values = [r.metric_value for r in runs]
    mean = statistics.fmean(values)
    stddev = statistics.pstdev(values) if len(values) > 1 else 0.0
    # Normalize by |mean|: dividing by a negative mean would flip the sign of
    # the spread and let any amount of disagreement pass the tolerance check.
    spread = (max(values) - min(values)) / abs(mean) if mean else float("inf")
    gpu_overall = max(r.gpu_active_overall() for r in runs)
    seeds = sorted(r.seed for r in runs)
    distinct_seeds = len(set(seeds))

    gates: list[GateResult] = []

    # Gate 1: three convergent seeds (distinct seeds, not just three files).
    n_ok = distinct_seeds >= 3
    spread_ok = spread <= config.spread_tolerance
    gates.append(
        GateResult(
            "count",
            n_ok,
            f"{len(runs)} run(s), {distinct_seeds} distinct seed(s); "
            "need >= 3 convergent seeds",
        )
    )
    gates.append(
        GateResult(
            "spread",
            spread_ok,
            f"spread {spread:.2%} vs tolerance {config.spread_tolerance:.2%}",
        )
    )

    # Gate 2: saturation / swap rule.
    sat_ok = gpu_overall < config.gpu_active_ceiling
    gates.append(
        GateResult(
            "saturation",
            sat_ok,
            f"GPU active {gpu_overall:.1%} vs ceiling {config.gpu_active_ceiling:.0%}"
            + ("" if sat_ok else " — trips swap rule, shard same day"),
        )
    )

    # Gate 3 (optional): metric vs target.
    if config.baseline_target is not None:
        if config.target_direction == "ge":
            tgt_ok = mean >= config.baseline_target
            cmp = ">="
        else:
            tgt_ok = mean <= config.baseline_target
            cmp = "<="
        gates.append(
            GateResult(
                "target",
                tgt_ok,
                f"mean {mean:.4g} {cmp} target {config.baseline_target:.4g}",
            )
        )

    return BaselineSummary(
        benchmark=config.name,
        metric=config.metric,
        n=len(runs),
        mean=mean,
        stddev=stddev,
        spread=spread,
        gpu_active_overall=gpu_overall,
        recorded=mean,
        gates=gates,
        seeds=seeds,
    )
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def aggregate_files(paths: list[str | Path], config: BenchConfig) -> BaselineSummary:
    """Load the run files at ``paths`` and aggregate them against ``config``."""
    loaded = load_runs(paths)
    return aggregate(loaded, config)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def write_summary(summary: BaselineSummary, out: str | Path) -> Path:
    """Write ``summary.to_dict()`` as 2-space-indented JSON to ``out``.

    The file ends with a single trailing newline. Returns ``out`` as a
    ``Path``.
    """
    destination = Path(out)
    rendered = json.dumps(summary.to_dict(), indent=2)
    destination.write_text(f"{rendered}\n")
    return destination
|