invarlock 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +1 -1
- invarlock/_data/runtime/tiers.yaml +61 -0
- invarlock/adapters/hf_loading.py +97 -0
- invarlock/calibration/__init__.py +6 -0
- invarlock/calibration/spectral_null.py +301 -0
- invarlock/calibration/variance_ve.py +154 -0
- invarlock/cli/app.py +15 -0
- invarlock/cli/commands/calibrate.py +576 -0
- invarlock/cli/commands/doctor.py +9 -3
- invarlock/cli/commands/explain_gates.py +53 -9
- invarlock/cli/commands/plugins.py +12 -2
- invarlock/cli/commands/run.py +181 -79
- invarlock/cli/commands/verify.py +40 -0
- invarlock/cli/config.py +11 -1
- invarlock/cli/determinism.py +252 -0
- invarlock/core/auto_tuning.py +215 -17
- invarlock/core/bootstrap.py +137 -5
- invarlock/core/registry.py +9 -4
- invarlock/core/runner.py +305 -35
- invarlock/eval/bench.py +467 -141
- invarlock/eval/bench_regression.py +12 -0
- invarlock/eval/bootstrap.py +3 -1
- invarlock/eval/data.py +29 -7
- invarlock/eval/primary_metric.py +20 -5
- invarlock/guards/rmt.py +536 -46
- invarlock/guards/spectral.py +217 -10
- invarlock/guards/variance.py +124 -42
- invarlock/reporting/certificate.py +476 -45
- invarlock/reporting/certificate_schema.py +4 -1
- invarlock/reporting/guards_analysis.py +108 -10
- invarlock/reporting/normalizer.py +24 -1
- invarlock/reporting/policy_utils.py +97 -15
- invarlock/reporting/primary_metric_utils.py +17 -0
- invarlock/reporting/validate.py +10 -10
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/METADATA +12 -10
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/RECORD +40 -33
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/WHEEL +0 -0
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/top_level.txt +0 -0
invarlock/cli/commands/calibrate.py
ADDED

@@ -0,0 +1,576 @@

"""Calibration sweep harnesses (null + VE).

These commands run repeatable sweeps and emit stable artifacts for release notes:
- JSON (machine)
- CSV (spreadsheet)
- Markdown (human)
"""

from __future__ import annotations

import csv
import json
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any

import typer
import yaml
from rich.console import Console

from invarlock.calibration.spectral_null import summarize_null_sweep_reports
from invarlock.calibration.variance_ve import summarize_ve_sweep_reports
from invarlock.guards.tier_config import get_tier_guard_config

console = Console()

calibrate_app = typer.Typer(
    name="calibrate",
    help="Run calibration sweeps and emit reports (JSON/CSV/Markdown).",
    no_args_is_help=True,
)

@dataclass(frozen=True)
class _SweepSpec:
    tier: str
    seed: int
    windows: int | None = None

def _load_yaml(path: Path) -> dict[str, Any]:
    data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    if not isinstance(data, dict):
        raise typer.BadParameter(f"Config must be a mapping: {path}")
    return data


def _dump_json(path: Path, payload: dict[str, Any]) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(
        json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8"
    )


def _dump_markdown(path: Path, text: str) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(text.strip() + "\n", encoding="utf-8")


def _dump_csv(path: Path, rows: list[dict[str, Any]]) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    if not rows:
        path.write_text("", encoding="utf-8")
        return
    fields: list[str] = sorted({k for r in rows for k in r.keys()})
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fields)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)

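Worth noting: `_dump_csv` builds its header from the union of keys across all rows, so runs that emit different per-family columns still land in one well-formed CSV. A minimal standalone sketch of that behavior (the rows here are hypothetical):

```python
import csv
import io

# Hypothetical rows with heterogeneous keys, as per-family columns produce.
rows = [
    {"tier": "balanced", "max_z_qkv": 1.2},
    {"tier": "balanced", "candidate_mlp": 3},
]
fields = sorted({k for r in rows for k in r})  # union of keys, stable order

buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=fields)
writer.writeheader()
writer.writerows(rows)  # missing keys are written as empty cells, not errors
print(buf.getvalue())
# candidate_mlp,max_z_qkv,tier
# ,1.2,balanced
# 3,,balanced
```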
def _materialize_sweep_specs(
    *,
    tiers: list[str] | None,
    seeds: list[int] | None,
    n_seeds: int,
    seed_start: int,
    windows: list[int] | None = None,
) -> list[_SweepSpec]:
    tier_list = [t.strip().lower() for t in (tiers or []) if str(t).strip()]
    if not tier_list:
        tier_list = ["balanced", "conservative", "aggressive"]

    seed_list = [int(s) for s in (seeds or [])]
    if not seed_list:
        seed_list = [int(seed_start) + i for i in range(int(n_seeds))]

    out: list[_SweepSpec] = []
    if windows:
        for tier in tier_list:
            for win in windows:
                for seed in seed_list:
                    out.append(_SweepSpec(tier=tier, seed=seed, windows=int(win)))
    else:
        for tier in tier_list:
            for seed in seed_list:
                out.append(_SweepSpec(tier=tier, seed=seed))
    return out

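`_materialize_sweep_specs` expands the full tier × window × seed cross product, so run counts grow multiplicatively. A small usage sketch of the function defined above:

```python
# Sketch: the cross-product expansion performed by _materialize_sweep_specs.
specs = _materialize_sweep_specs(
    tiers=["balanced"], seeds=None, n_seeds=2, seed_start=42, windows=[6, 8]
)
# -> [_SweepSpec(tier="balanced", seed=42, windows=6),
#     _SweepSpec(tier="balanced", seed=43, windows=6),
#     _SweepSpec(tier="balanced", seed=42, windows=8),
#     _SweepSpec(tier="balanced", seed=43, windows=8)]
# Three tiers x four windows x ten seeds (the defaults) is 120 runs.
```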
def _write_tiers_recommendation(
    out_path: Path,
    *,
    recommendations: dict[str, dict[str, Any]],
) -> None:
    """Write a tiers.yaml-shaped patch file (only keys we recommend)."""
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(
        yaml.safe_dump(recommendations, sort_keys=False),
        encoding="utf-8",
    )

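The patch file mirrors only the subtree of `tiers.yaml` being recommended. A sketch of the shape the null sweep emits; the family name and numbers here are placeholders, not real recommendations:

```python
# Illustrative payload; real caps/alpha come from summarize_null_sweep_reports,
# and "qkv" is a hypothetical family name used only for this example.
tiers_patch = {
    "balanced": {
        "spectral_guard": {
            "family_caps": {"qkv": 1.35},
            "multiple_testing": {"alpha": 0.000123},
        }
    }
}
_write_tiers_recommendation(
    Path("tiers_patch_spectral_null.yaml"), recommendations=tiers_patch
)
```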
@calibrate_app.command(
    name="null-sweep",
    help="Run a null (no-op edit) sweep and calibrate spectral κ/alpha empirically.",
)
def null_sweep(
    config: Path = typer.Option(
        Path("configs/calibration/null_sweep_ci.yaml"),
        "--config",
        exists=True,
        dir_okay=False,
        readable=True,
        help="Base null-sweep YAML (noop edit).",
    ),
    out: Path = typer.Option(
        Path("reports/calibration/null_sweep"),
        "--out",
        help="Output directory for calibration artifacts.",
    ),
    tiers: list[str] = typer.Option(
        None,
        "--tier",
        help="Tier(s) to evaluate (repeatable). Defaults to all tiers.",
    ),
    seed: list[int] = typer.Option(
        None,
        "--seed",
        help="Seed(s) to run (repeatable). Overrides --n-seeds/--seed-start.",
    ),
    n_seeds: int = typer.Option(10, "--n-seeds", min=1, help="Number of seeds to run."),
    seed_start: int = typer.Option(42, "--seed-start", help="Starting seed."),
    profile: str = typer.Option("ci", "--profile", help="Run profile (ci|release)."),
    device: str | None = typer.Option(None, "--device", help="Device override."),
    safety_margin: float = typer.Option(
        0.05, "--safety-margin", help="Safety margin applied to κ recommendations."
    ),
    target_any_warning_rate: float = typer.Option(
        0.01,
        "--target-any-warning-rate",
        help="Target run-level spectral warning rate under the null.",
    ),
) -> None:
    # Keep import light: only pull run machinery when invoked.
    from .run import run_command

    base = _load_yaml(config)
    specs = _materialize_sweep_specs(
        tiers=tiers, seeds=seed, n_seeds=n_seeds, seed_start=seed_start
    )
    specs = sorted(specs, key=lambda s: (s.tier, s.seed))

    run_rows: list[dict[str, Any]] = []
    reports_by_tier: dict[str, list[dict[str, Any]]] = defaultdict(list)

    run_root = out / "runs"
    cfg_root = out / "configs"
    cfg_root.mkdir(parents=True, exist_ok=True)

    for spec in specs:
        cfg = json.loads(json.dumps(base))  # safe deep copy without yaml anchors
        cfg.setdefault("dataset", {})["seed"] = int(spec.seed)
        cfg.setdefault("auto", {})["tier"] = spec.tier

        # Per-run config + output roots to avoid timestamp collisions.
        run_out = run_root / spec.tier / f"seed_{spec.seed}"
        cfg.setdefault("output", {})["dir"] = str(run_out)
        cfg_path = cfg_root / f"null_{spec.tier}_{spec.seed}.yaml"
        cfg_path.write_text(yaml.safe_dump(cfg, sort_keys=False), encoding="utf-8")

        report_path = run_command(
            config=str(cfg_path),
            device=device,
            profile=profile,
            out=str(run_out),
            tier=spec.tier,
        )
        if not isinstance(report_path, str):
            continue
        report = json.loads(Path(report_path).read_text(encoding="utf-8"))
        reports_by_tier[spec.tier].append(report)

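The `json.loads(json.dumps(base))` round trip above is a cheap deep copy that also severs any aliasing a YAML anchor can introduce, which `copy.deepcopy` would faithfully preserve. A standalone illustration:

```python
import copy
import json

base = {"dataset": {"seed": 0}}
base["auto"] = base["dataset"]  # aliasing, as YAML anchors (&/*) can produce

deep = copy.deepcopy(base)
deep["dataset"]["seed"] = 1
print(deep["auto"]["seed"])     # 1 -- deepcopy preserved the shared object

clone = json.loads(json.dumps(base))
clone["dataset"]["seed"] = 2
print(clone["auto"]["seed"])    # 0 -- the JSON round trip severed the alias
```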
        spectral = None
        for g in report.get("guards", []):
            if isinstance(g, dict) and g.get("name") == "spectral":
                spectral = g
                break
        metrics = spectral.get("metrics", {}) if isinstance(spectral, dict) else {}
        selection = (
            metrics.get("multiple_testing_selection")
            if isinstance(metrics.get("multiple_testing_selection"), dict)
            else {}
        )
        fam_z_summary = (
            metrics.get("family_z_summary")
            if isinstance(metrics.get("family_z_summary"), dict)
            else {}
        )
        candidate_counts = (
            selection.get("family_violation_counts")
            if isinstance(selection.get("family_violation_counts"), dict)
            else {}
        )
        selected_families = selection.get("families_selected")
        selected_set = (
            {str(x) for x in selected_families}
            if isinstance(selected_families, list)
            else set()
        )
        selected_by_family: dict[str, int] = defaultdict(int)
        violations = (
            spectral.get("violations", []) if isinstance(spectral, dict) else []
        )
        if isinstance(violations, list):
            for v in violations:
                if isinstance(v, dict) and v.get("family") is not None:
                    selected_by_family[str(v.get("family"))] += 1
        caps_applied = metrics.get("caps_applied")
        try:
            caps_applied = int(caps_applied) if caps_applied is not None else 0
        except Exception:
            caps_applied = 0
        row: dict[str, Any] = {
            "tier": spec.tier,
            "seed": spec.seed,
            "caps_applied": caps_applied,
            "caps_exceeded": bool(metrics.get("caps_exceeded", False)),
            "selected_families": ",".join(sorted(selected_set)),
        }
        for fam, vals in fam_z_summary.items():
            if not isinstance(vals, dict):
                continue
            max_z = vals.get("max")
            try:
                if max_z is not None and max_z == max_z:
                    row[f"max_z_{fam}"] = float(max_z)
            except Exception:
                continue
        for fam, count in candidate_counts.items():
            try:
                row[f"candidate_{fam}"] = int(count)
            except Exception:
                continue
        for fam, count in selected_by_family.items():
            row[f"selected_{fam}"] = int(count)
        run_rows.append(row)

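The extraction above implies a per-run report of roughly the following shape (a sketch inferred from the accessors, not the actual schema; "qkv" is a placeholder family name). It also relies on `max_z == max_z` as the classic NaN filter, since NaN is the only float that compares unequal to itself:

```python
# Hypothetical report fragment matching the accessors used above.
report = {
    "guards": [
        {
            "name": "spectral",
            "metrics": {
                "family_z_summary": {"qkv": {"max": 2.1}},
                "multiple_testing_selection": {
                    "family_violation_counts": {"qkv": 1},
                    "families_selected": ["qkv"],
                },
                "caps_applied": 0,
                "caps_exceeded": False,
            },
            "violations": [{"family": "qkv"}],
        }
    ]
}

nan = float("nan")
print(nan == nan)  # False -- why `max_z == max_z` screens out NaN values
```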
    summaries: dict[str, Any] = {}
    tiers_patch: dict[str, dict[str, Any]] = {}
    for tier_name, tier_reports in sorted(reports_by_tier.items()):
        summary = summarize_null_sweep_reports(
            tier_reports,
            tier=tier_name,
            safety_margin=safety_margin,
            target_any_warning_rate=target_any_warning_rate,
        )
        summaries[tier_name] = summary
        rec = summary.get("recommendations", {}) if isinstance(summary, dict) else {}
        spectral_patch = {
            "spectral_guard": {
                "family_caps": (rec.get("family_caps") or {}),
                "multiple_testing": (rec.get("multiple_testing") or {}),
            }
        }
        tiers_patch[tier_name] = spectral_patch

    stamp = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
    payload = {
        "kind": "spectral_null_sweep",
        "generated_at": stamp,
        "config": {
            "base_config": str(config),
            "profile": str(profile),
            "tiers": sorted(reports_by_tier.keys()),
            "n_runs": int(sum(len(v) for v in reports_by_tier.values())),
        },
        "summaries": summaries,
    }

    _dump_json(out / "null_sweep_report.json", payload)
    _dump_csv(out / "null_sweep_runs.csv", run_rows)
    _write_tiers_recommendation(
        out / "tiers_patch_spectral_null.yaml", recommendations=tiers_patch
    )

    md_lines = [
        "# Spectral null-sweep calibration",
        "",
        f"- Generated: `{stamp}`",
        f"- Base config: `{config}`",
        "",
        "## Recommendations (tiers.yaml patch)",
        f"- `{out / 'tiers_patch_spectral_null.yaml'}`",
        "",
        "## Summary",
        "",
        "| Tier | Runs | Any-warning rate | α (recommended) |",
        "|---|---:|---:|---:|",
    ]
    for tier_name, summary in sorted(summaries.items()):
        obs = summary.get("observed", {}) if isinstance(summary, dict) else {}
        rec = summary.get("recommendations", {}) if isinstance(summary, dict) else {}
        mt = rec.get("multiple_testing", {}) if isinstance(rec, dict) else {}
        md_lines.append(
            f"| {tier_name} | {summary.get('n_runs', 0)} | {obs.get('any_warning_rate', 0.0):.3f} | {float(mt.get('alpha', 0.0)):.6f} |"
        )
    _dump_markdown(out / "null_sweep_summary.md", "\n".join(md_lines))

    console.print(f"[green]✅ Wrote null sweep artifacts under {out}[/green]")

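Either command can also be exercised without a shell via Typer's test runner; a sketch, assuming the default base config exists and the `ci` profile is runnable in the environment:

```python
from typer.testing import CliRunner

from invarlock.cli.commands.calibrate import calibrate_app

runner = CliRunner()
# Equivalent to: invarlock calibrate null-sweep --tier balanced --n-seeds 2
result = runner.invoke(
    calibrate_app, ["null-sweep", "--tier", "balanced", "--n-seeds", "2"]
)
print(result.exit_code)  # 0 on success; artifacts land under reports/calibration/
```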
@calibrate_app.command(
    name="ve-sweep",
    help="Run VE predictive-gate sweeps and recommend min_effect_lognll per tier.",
)
def ve_sweep(
    config: Path = typer.Option(
        Path("configs/calibration/rmt_ve_sweep_ci.yaml"),
        "--config",
        exists=True,
        dir_okay=False,
        readable=True,
        help="Base VE sweep YAML (quant_rtn edit).",
    ),
    out: Path = typer.Option(
        Path("reports/calibration/ve_sweep"),
        "--out",
        help="Output directory for calibration artifacts.",
    ),
    tiers: list[str] = typer.Option(
        None,
        "--tier",
        help="Tier(s) to evaluate (repeatable). Defaults to all tiers.",
    ),
    seed: list[int] = typer.Option(
        None,
        "--seed",
        help="Seed(s) to run (repeatable). Overrides --n-seeds/--seed-start.",
    ),
    n_seeds: int = typer.Option(10, "--n-seeds", min=1, help="Number of seeds to run."),
    seed_start: int = typer.Option(42, "--seed-start", help="Starting seed."),
    window: list[int] = typer.Option(
        None,
        "--window",
        help="Variance calibration window counts (repeatable). Defaults to 6, 8, 12, 16.",
    ),
    target_enable_rate: float = typer.Option(
        0.05,
        "--target-enable-rate",
        help="Target expected VE enable rate (predictive-gate lower bound).",
    ),
    profile: str = typer.Option("ci", "--profile", help="Run profile (ci|release)."),
    device: str | None = typer.Option(None, "--device", help="Device override."),
    safety_margin: float = typer.Option(
        0.0,
        "--safety-margin",
        help="Safety margin applied to min_effect recommendations.",
    ),
) -> None:
    # Keep import light: only pull run machinery when invoked.
    from .run import run_command

    base = _load_yaml(config)
    windows = [int(w) for w in (window or [])] or [6, 8, 12, 16]
    specs = _materialize_sweep_specs(
        tiers=tiers,
        seeds=seed,
        n_seeds=n_seeds,
        seed_start=seed_start,
        windows=windows,
    )
    specs = sorted(specs, key=lambda s: (s.tier, int(s.windows or 0), s.seed))

    run_rows: list[dict[str, Any]] = []
    reports_by_tier: dict[str, list[dict[str, Any]]] = defaultdict(list)
    reports_by_tier_window: dict[tuple[str, int], list[dict[str, Any]]] = defaultdict(
        list
    )

    run_root = out / "runs"
    cfg_root = out / "configs"
    cfg_root.mkdir(parents=True, exist_ok=True)

    for spec in specs:
        win = int(spec.windows or 0)
        cfg = json.loads(json.dumps(base))  # safe deep copy without yaml anchors
        cfg.setdefault("dataset", {})["seed"] = int(spec.seed)
        cfg.setdefault("auto", {})["tier"] = spec.tier
        # Keep edit deterministic when it supports a seed knob.
        plan = cfg.setdefault("edit", {}).setdefault("plan", {})
        if isinstance(plan, dict) and "seed" in plan:
            plan["seed"] = int(spec.seed)

        # Override variance calibration windows and ensure min_coverage is feasible.
        gv = cfg.setdefault("guards", {}).setdefault("variance", {})
        if not isinstance(gv, dict):
            gv = {}
            cfg["guards"]["variance"] = gv
        calib = gv.setdefault("calibration", {})
        if not isinstance(calib, dict):
            calib = {}
            gv["calibration"] = calib
        calib["windows"] = int(win)
        calib["seed"] = int(spec.seed)
        calib["min_coverage"] = int(max(1, min(win, win - 2)))

        run_out = run_root / spec.tier / f"windows_{win}" / f"seed_{spec.seed}"
        cfg.setdefault("output", {})["dir"] = str(run_out)
        cfg_path = cfg_root / f"ve_{spec.tier}_w{win}_{spec.seed}.yaml"
        cfg_path.write_text(yaml.safe_dump(cfg, sort_keys=False), encoding="utf-8")

        report_path = run_command(
            config=str(cfg_path),
            device=device,
            profile=profile,
            out=str(run_out),
            tier=spec.tier,
        )
        if not isinstance(report_path, str):
            continue
        report = json.loads(Path(report_path).read_text(encoding="utf-8"))
        reports_by_tier[spec.tier].append(report)
        reports_by_tier_window[(spec.tier, win)].append(report)

        variance = None
        for g in report.get("guards", []):
            if isinstance(g, dict) and g.get("name") == "variance":
                variance = g
                break
        metrics = variance.get("metrics", {}) if isinstance(variance, dict) else {}
        pg = (
            metrics.get("predictive_gate", {})
            if isinstance(metrics.get("predictive_gate"), dict)
            else {}
        )
        delta_ci = pg.get("delta_ci")
        try:
            ci_width = (
                float(delta_ci[1]) - float(delta_ci[0])
                if isinstance(delta_ci, tuple | list) and len(delta_ci) == 2
                else None
            )
        except Exception:
            ci_width = None
        run_rows.append(
            {
                "tier": spec.tier,
                "seed": spec.seed,
                "windows": win,
                "predictive_evaluated": bool(pg.get("evaluated", False)),
                "predictive_mean_delta": pg.get("mean_delta"),
                "predictive_delta_ci_lo": (
                    delta_ci[0]
                    if isinstance(delta_ci, tuple | list) and len(delta_ci) == 2
                    else None
                ),
                "predictive_delta_ci_hi": (
                    delta_ci[1]
                    if isinstance(delta_ci, tuple | list) and len(delta_ci) == 2
                    else None
                ),
                "predictive_ci_width": ci_width,
            }
        )

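One subtlety in the loop above: for any positive `win`, `min(win, win - 2)` is just `win - 2`, so the `min_coverage` override reduces to `max(1, win - 2)`, meaning up to two calibration windows may drop out before coverage becomes infeasible:

```python
# min_coverage values implied by the default window sweep.
for win in [6, 8, 12, 16]:
    print(win, int(max(1, min(win, win - 2))))  # 6->4, 8->6, 12->10, 16->14
```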
    # Per-tier recommendation using all runs (across window values).
    summaries: dict[str, Any] = {}
    tiers_patch: dict[str, dict[str, Any]] = {}
    for tier_name, tier_reports in sorted(reports_by_tier.items()):
        var_cfg = get_tier_guard_config(tier_name, "variance_guard")
        one_sided = bool(var_cfg.get("predictive_one_sided", True))
        summary = summarize_ve_sweep_reports(
            tier_reports,
            tier=tier_name,
            target_enable_rate=target_enable_rate,
            safety_margin=safety_margin,
            predictive_one_sided=one_sided,
        )
        summaries[tier_name] = summary
        rec = summary.get("recommendations", {}) if isinstance(summary, dict) else {}
        tiers_patch[tier_name] = {
            "variance_guard": {"min_effect_lognll": rec.get("min_effect_lognll")}
        }

    # Power curve: mean CI width per (tier, windows).
    power_curve: list[dict[str, Any]] = []
    for (tier_name, win), items in sorted(reports_by_tier_window.items()):
        widths: list[float] = []
        for rep in items:
            g = None
            for gg in rep.get("guards", []):
                if isinstance(gg, dict) and gg.get("name") == "variance":
                    g = gg
                    break
            metrics = g.get("metrics", {}) if isinstance(g, dict) else {}
            pg = (
                metrics.get("predictive_gate", {})
                if isinstance(metrics.get("predictive_gate"), dict)
                else {}
            )
            delta_ci = pg.get("delta_ci")
            if isinstance(delta_ci, tuple | list) and len(delta_ci) == 2:
                try:
                    widths.append(float(delta_ci[1]) - float(delta_ci[0]))
                except Exception:
                    continue
        power_curve.append(
            {
                "tier": tier_name,
                "windows": int(win),
                "runs": int(len(items)),
                "mean_ci_width": (sum(widths) / len(widths)) if widths else None,
            }
        )

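The resulting `ve_power_curve.csv` lets one read off how fast the predictive-gate CI tightens as windows grow. If widths shrank like 1/√windows, going from 6 to 16 windows would cut them by about √(6/16) ≈ 0.61; comparing the observed ratio against that suggests whether adding windows still buys resolution. A hypothetical slice (numbers invented for illustration):

```python
import math

# Hypothetical power-curve rows, in the shape emitted above.
power_curve = [
    {"tier": "balanced", "windows": 6, "runs": 10, "mean_ci_width": 0.042},
    {"tier": "balanced", "windows": 16, "runs": 10, "mean_ci_width": 0.025},
]
observed = power_curve[1]["mean_ci_width"] / power_curve[0]["mean_ci_width"]
expected = math.sqrt(6 / 16)  # ~0.61 under a 1/sqrt(windows) scaling
print(f"{observed:.2f} vs {expected:.2f}")  # 0.60 vs 0.61
```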
    stamp = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
    payload = {
        "kind": "variance_ve_sweep",
        "generated_at": stamp,
        "config": {
            "base_config": str(config),
            "profile": str(profile),
            "tiers": sorted(reports_by_tier.keys()),
            "windows": windows,
            "n_runs": int(sum(len(v) for v in reports_by_tier.values())),
        },
        "summaries": summaries,
        "power_curve": power_curve,
    }

    _dump_json(out / "ve_sweep_report.json", payload)
    _dump_csv(out / "ve_sweep_runs.csv", run_rows)
    _dump_csv(out / "ve_power_curve.csv", power_curve)
    _write_tiers_recommendation(
        out / "tiers_patch_variance_ve.yaml", recommendations=tiers_patch
    )

    md_lines = [
        "# Variance (DD-VE) sweep calibration",
        "",
        f"- Generated: `{stamp}`",
        f"- Base config: `{config}`",
        "",
        "## Recommendations (tiers.yaml patch)",
        f"- `{out / 'tiers_patch_variance_ve.yaml'}`",
        "",
        "## Per-tier recommendation",
        "",
        "| Tier | Runs | Recommended min_effect_lognll | Expected enable rate |",
        "|---|---:|---:|---:|",
    ]
    for tier_name, summary in sorted(summaries.items()):
        rec = summary.get("recommendations", {}) if isinstance(summary, dict) else {}
        md_lines.append(
            f"| {tier_name} | {summary.get('n_runs', 0)} | {float(rec.get('min_effect_lognll', 0.0)):.6f} | {float(rec.get('expected_enable_rate', 0.0)):.3f} |"
        )
    _dump_markdown(out / "ve_sweep_summary.md", "\n".join(md_lines))

    console.print(f"[green]✅ Wrote VE sweep artifacts under {out}[/green]")


__all__ = ["calibrate_app"]
invarlock/cli/commands/doctor.py
CHANGED

@@ -794,11 +794,17 @@ def doctor_command(
     try:
         import math as _math

-        from invarlock.core.auto_tuning import
+        from invarlock.core.auto_tuning import get_tier_policies

         use_tier = (tier or "balanced").lower()
-
-
+        tier_policies = get_tier_policies()
+        tier_defaults = tier_policies.get(
+            use_tier, tier_policies.get("balanced", {})
+        )
+        metrics_policy = (
+            tier_defaults.get("metrics", {})
+            if isinstance(tier_defaults, dict)
+            else {}
         )
         pm_policy = (
             metrics_policy.get("pm_ratio", {})
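This hunk and the explain-gates changes below share one defensive idiom: look up the requested tier, fall back to `balanced`, and type-check every nested mapping before use. A minimal standalone sketch of the pattern, with hypothetical policy data:

```python
# Hypothetical tier policies illustrating the lookup-with-fallback idiom.
tier_policies = {"balanced": {"metrics": {"pm_ratio": {"min_tokens": 20000}}}}

def metrics_for(tier: str) -> dict:
    defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
    metrics = defaults.get("metrics", {}) if isinstance(defaults, dict) else {}
    return metrics if isinstance(metrics, dict) else {}

print(metrics_for("aggressive"))  # falls back to the balanced policy
```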
invarlock/cli/commands/explain_gates.py
CHANGED

@@ -6,7 +6,7 @@ from pathlib import Path
 import typer
 from rich.console import Console

-from invarlock.core.auto_tuning import
+from invarlock.core.auto_tuning import get_tier_policies
 from invarlock.reporting.certificate import make_certificate

 console = Console()
@@ -49,15 +49,38 @@ def explain_gates_command(
         "aggressive": 1.20,
         "none": 1.10,
     }
-
+    resolved_policy = (
+        cert.get("resolved_policy", {})
+        if isinstance(cert.get("resolved_policy"), dict)
+        else {}
+    )
     metrics_policy = (
-
+        resolved_policy.get("metrics", {})
+        if isinstance(resolved_policy.get("metrics"), dict)
+        else {}
     )
+    if not metrics_policy:
+        tier_policies = get_tier_policies()
+        tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
+        metrics_policy = (
+            tier_defaults.get("metrics", {}) if isinstance(tier_defaults, dict) else {}
+        )
+    if not isinstance(metrics_policy, dict):
+        metrics_policy = {}
     pm_policy = (
-        metrics_policy.get("pm_ratio", {})
+        metrics_policy.get("pm_ratio", {})
+        if isinstance(metrics_policy.get("pm_ratio"), dict)
+        else {}
     )
     hysteresis_ratio = float(pm_policy.get("hysteresis_ratio", 0.0))
     min_tokens = int(pm_policy.get("min_tokens", 0))
+    try:
+        limit_base = float(
+            pm_policy.get("ratio_limit_base", tier_thresholds.get(tier, 1.10))
+            or tier_thresholds.get(tier, 1.10)
+        )
+    except Exception:
+        limit_base = tier_thresholds.get(tier, 1.10)
     limit_with_hyst = limit_base + max(0.0, hysteresis_ratio)
     tokens_ok = True
     telem = cert.get("telemetry", {}) if isinstance(cert.get("telemetry"), dict) else {}
@@ -70,9 +93,16 @@ def explain_gates_command(
     tokens_ok = True

     # Primary-metric ratio gate explanation (ppl-like kinds shown as ratios)
-
-
-
+    ratio = None
+    ratio_ci = None
+    if isinstance(cert.get("primary_metric"), dict):
+        pm = cert.get("primary_metric", {})
+        ratio = pm.get("ratio_vs_baseline")
+        ratio_ci = pm.get("display_ci")
+    elif isinstance(cert.get("ppl"), dict):  # legacy
+        ppl = cert.get("ppl", {})
+        ratio = ppl.get("ratio_vs_baseline")
+        ratio_ci = ppl.get("ratio_ci")
     hysteresis_applied = bool(validation.get("hysteresis_applied"))
     status = "PASS" if bool(validation.get("primary_metric_acceptable")) else "FAIL"
     console.print("[bold]Gate: Primary Metric vs Baseline[/bold]")
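The gate now reads the ratio from the new `primary_metric` block and falls back to the legacy `ppl` block. A sketch of the two certificate shapes it accepts (field names taken from the accessors above; values invented):

```python
# New-style certificate fragment.
cert_new = {"primary_metric": {"ratio_vs_baseline": 1.03, "display_ci": [0.99, 1.07]}}
# Legacy certificate fragment, still honored for older reports.
cert_legacy = {"ppl": {"ratio_vs_baseline": 1.03, "ratio_ci": [0.99, 1.07]}}
```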
@@ -109,8 +139,22 @@ def explain_gates_command(
         pass

     # Drift gate explanation
-    drift =
-    drift_ci =
+    drift = None
+    drift_ci = None
+    if isinstance(cert.get("primary_metric"), dict):
+        pm = cert.get("primary_metric", {})
+        preview = pm.get("preview")
+        final = pm.get("final")
+        if isinstance(preview, int | float) and isinstance(final, int | float):
+            try:
+                if float(preview) != 0.0:
+                    drift = float(final) / float(preview)
+            except Exception:
+                drift = None
+    if isinstance(cert.get("ppl"), dict):  # legacy
+        ppl = cert.get("ppl", {})
+        drift = ppl.get("preview_final_ratio", drift)
+        drift_ci = ppl.get("drift_ci")
     drift_status = (
         "PASS" if bool(validation.get("preview_final_drift_acceptable")) else "FAIL"
     )
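Drift here is simply the final value of the primary metric divided by its preview value; a worked example with invented numbers:

```python
# Invented numbers: preview-vs-final drift of a ppl-like primary metric.
preview, final = 12.40, 12.71
drift = final / preview if preview != 0.0 else None
print(f"{drift:.4f}")  # 1.0250 -- a 2.5% upward drift between preview and final
```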