invarlock 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. invarlock/__init__.py +1 -1
  2. invarlock/_data/runtime/tiers.yaml +61 -0
  3. invarlock/adapters/hf_loading.py +97 -0
  4. invarlock/calibration/__init__.py +6 -0
  5. invarlock/calibration/spectral_null.py +301 -0
  6. invarlock/calibration/variance_ve.py +154 -0
  7. invarlock/cli/app.py +15 -0
  8. invarlock/cli/commands/calibrate.py +576 -0
  9. invarlock/cli/commands/doctor.py +9 -3
  10. invarlock/cli/commands/explain_gates.py +53 -9
  11. invarlock/cli/commands/plugins.py +12 -2
  12. invarlock/cli/commands/run.py +181 -79
  13. invarlock/cli/commands/verify.py +40 -0
  14. invarlock/cli/config.py +11 -1
  15. invarlock/cli/determinism.py +252 -0
  16. invarlock/core/auto_tuning.py +215 -17
  17. invarlock/core/bootstrap.py +137 -5
  18. invarlock/core/registry.py +9 -4
  19. invarlock/core/runner.py +305 -35
  20. invarlock/eval/bench.py +467 -141
  21. invarlock/eval/bench_regression.py +12 -0
  22. invarlock/eval/bootstrap.py +3 -1
  23. invarlock/eval/data.py +29 -7
  24. invarlock/eval/primary_metric.py +20 -5
  25. invarlock/guards/rmt.py +536 -46
  26. invarlock/guards/spectral.py +217 -10
  27. invarlock/guards/variance.py +124 -42
  28. invarlock/reporting/certificate.py +476 -45
  29. invarlock/reporting/certificate_schema.py +4 -1
  30. invarlock/reporting/guards_analysis.py +108 -10
  31. invarlock/reporting/normalizer.py +24 -1
  32. invarlock/reporting/policy_utils.py +97 -15
  33. invarlock/reporting/primary_metric_utils.py +17 -0
  34. invarlock/reporting/validate.py +10 -10
  35. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/METADATA +12 -10
  36. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/RECORD +40 -33
  37. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/WHEEL +0 -0
  38. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/entry_points.txt +0 -0
  39. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/licenses/LICENSE +0 -0
  40. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/top_level.txt +0 -0
"""Calibration sweep harnesses (null + VE).

These commands run repeatable sweeps and emit stable artifacts for release notes:
- JSON (machine)
- CSV (spreadsheet)
- Markdown (human)
"""

from __future__ import annotations

import csv
import json
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any

import typer
import yaml
from rich.console import Console

from invarlock.calibration.spectral_null import summarize_null_sweep_reports
from invarlock.calibration.variance_ve import summarize_ve_sweep_reports
from invarlock.guards.tier_config import get_tier_guard_config

# Shared Rich console used for all user-facing CLI output in this module.
console = Console()

# Typer sub-application exposing the calibration commands
# (`null-sweep`, `ve-sweep`); showing help when invoked bare.
calibrate_app = typer.Typer(
    name="calibrate",
    help="Run calibration sweeps and emit reports (JSON/CSV/Markdown).",
    no_args_is_help=True,
)
36
@dataclass(frozen=True)
class _SweepSpec:
    """One (tier, seed[, windows]) combination to execute in a sweep."""

    # Tier name, lowercased (e.g. "balanced", "conservative", "aggressive").
    tier: str
    # RNG seed applied to the dataset (and, where supported, the edit plan).
    seed: int
    # Variance-calibration window count; None for sweeps that don't vary windows.
    windows: int | None = None
41
+
42
+
43
def _load_yaml(path: Path) -> dict[str, Any]:
    """Parse *path* as YAML and return its top-level mapping.

    Empty (or otherwise falsy) documents yield ``{}``; any other non-mapping
    top level is rejected with a CLI-friendly error.
    """
    loaded = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    if isinstance(loaded, dict):
        return loaded
    raise typer.BadParameter(f"Config must be a mapping: {path}")
48
+
49
+
50
+ def _dump_json(path: Path, payload: dict[str, Any]) -> None:
51
+ path.parent.mkdir(parents=True, exist_ok=True)
52
+ path.write_text(
53
+ json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8"
54
+ )
55
+
56
+
57
+ def _dump_markdown(path: Path, text: str) -> None:
58
+ path.parent.mkdir(parents=True, exist_ok=True)
59
+ path.write_text(text.strip() + "\n", encoding="utf-8")
60
+
61
+
62
+ def _dump_csv(path: Path, rows: list[dict[str, Any]]) -> None:
63
+ path.parent.mkdir(parents=True, exist_ok=True)
64
+ if not rows:
65
+ path.write_text("", encoding="utf-8")
66
+ return
67
+ fields: list[str] = sorted({k for r in rows for k in r.keys()})
68
+ with path.open("w", encoding="utf-8", newline="") as handle:
69
+ writer = csv.DictWriter(handle, fieldnames=fields)
70
+ writer.writeheader()
71
+ for row in rows:
72
+ writer.writerow(row)
73
+
74
+
75
def _materialize_sweep_specs(
    *,
    tiers: list[str] | None,
    seeds: list[int] | None,
    n_seeds: int,
    seed_start: int,
    windows: list[int] | None = None,
) -> list[_SweepSpec]:
    """Expand CLI options into the full cross product of sweep specs.

    Defaults: all three tiers when *tiers* is empty, and ``n_seeds``
    consecutive seeds starting at ``seed_start`` when *seeds* is empty.
    When *windows* is provided, each (tier, window, seed) triple yields
    one spec; otherwise specs carry no window value.
    """
    selected_tiers = [t.strip().lower() for t in (tiers or []) if str(t).strip()]
    if not selected_tiers:
        selected_tiers = ["balanced", "conservative", "aggressive"]

    selected_seeds = [int(s) for s in (seeds or [])]
    if not selected_seeds:
        first = int(seed_start)
        selected_seeds = [first + offset for offset in range(int(n_seeds))]

    # Iteration order (tier, then window, then seed) is deliberate: it keeps
    # output deterministic and groups per-tier runs together.
    if windows:
        return [
            _SweepSpec(tier=tier, seed=seed, windows=int(win))
            for tier in selected_tiers
            for win in windows
            for seed in selected_seeds
        ]
    return [
        _SweepSpec(tier=tier, seed=seed)
        for tier in selected_tiers
        for seed in selected_seeds
    ]
102
+
103
+
104
def _write_tiers_recommendation(
    out_path: Path,
    *,
    recommendations: dict[str, dict[str, Any]],
) -> None:
    """Write a tiers.yaml-shaped patch file (only keys we recommend).

    Key order is preserved as-built (``sort_keys=False``) so the patch reads
    in the same order the recommendations were produced.
    """
    serialized = yaml.safe_dump(recommendations, sort_keys=False)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(serialized, encoding="utf-8")
115
+
116
+
117
@calibrate_app.command(
    name="null-sweep",
    help="Run a null (no-op edit) sweep and calibrate spectral κ/alpha empirically.",
)
def null_sweep(
    config: Path = typer.Option(
        Path("configs/calibration/null_sweep_ci.yaml"),
        "--config",
        exists=True,
        dir_okay=False,
        readable=True,
        help="Base null-sweep YAML (noop edit).",
    ),
    out: Path = typer.Option(
        Path("reports/calibration/null_sweep"),
        "--out",
        help="Output directory for calibration artifacts.",
    ),
    tiers: list[str] = typer.Option(
        None,
        "--tier",
        help="Tier(s) to evaluate (repeatable). Defaults to all tiers.",
    ),
    seed: list[int] = typer.Option(
        None,
        "--seed",
        help="Seed(s) to run (repeatable). Overrides --n-seeds/--seed-start.",
    ),
    n_seeds: int = typer.Option(10, "--n-seeds", min=1, help="Number of seeds to run."),
    seed_start: int = typer.Option(42, "--seed-start", help="Starting seed."),
    profile: str = typer.Option("ci", "--profile", help="Run profile (ci|release)."),
    device: str | None = typer.Option(None, "--device", help="Device override."),
    safety_margin: float = typer.Option(
        0.05, "--safety-margin", help="Safety margin applied to κ recommendations."
    ),
    target_any_warning_rate: float = typer.Option(
        0.01,
        "--target-any-warning-rate",
        help="Target run-level spectral warning rate under the null.",
    ),
) -> None:
    """Run the spectral null sweep and write calibration artifacts under *out*.

    For every (tier, seed) combination this executes a full run via
    ``run_command``, collects the spectral guard's metrics from each run
    report, then emits:

    - ``null_sweep_report.json`` — per-tier summaries + metadata
    - ``null_sweep_runs.csv``    — one row per run (z-stats, caps, selections)
    - ``tiers_patch_spectral_null.yaml`` — recommended tiers.yaml patch
    - ``null_sweep_summary.md``  — human-readable summary table

    Runs whose report path cannot be resolved are skipped silently
    (best-effort sweep semantics).
    """
    # Keep import light: only pull run machinery when invoked.
    from .run import run_command
    from datetime import timezone  # local: datetime.utcnow() is deprecated (3.12+)

    base = _load_yaml(config)
    specs = _materialize_sweep_specs(
        tiers=tiers, seeds=seed, n_seeds=n_seeds, seed_start=seed_start
    )
    specs = sorted(specs, key=lambda s: (s.tier, s.seed))

    run_rows: list[dict[str, Any]] = []
    reports_by_tier: dict[str, list[dict[str, Any]]] = defaultdict(list)

    run_root = out / "runs"
    cfg_root = out / "configs"
    cfg_root.mkdir(parents=True, exist_ok=True)

    for spec in specs:
        cfg = json.loads(json.dumps(base))  # safe deep copy without yaml anchors
        cfg.setdefault("dataset", {})["seed"] = int(spec.seed)
        cfg.setdefault("auto", {})["tier"] = spec.tier

        # Per-run config + output roots to avoid timestamp collisions.
        run_out = run_root / spec.tier / f"seed_{spec.seed}"
        cfg.setdefault("output", {})["dir"] = str(run_out)
        cfg_path = cfg_root / f"null_{spec.tier}_{spec.seed}.yaml"
        cfg_path.write_text(yaml.safe_dump(cfg, sort_keys=False), encoding="utf-8")

        report_path = run_command(
            config=str(cfg_path),
            device=device,
            profile=profile,
            out=str(run_out),
            tier=spec.tier,
        )
        if not isinstance(report_path, str):
            # Run did not produce a locatable report; skip it (best-effort).
            continue
        report = json.loads(Path(report_path).read_text(encoding="utf-8"))
        reports_by_tier[spec.tier].append(report)

        # Locate the spectral guard entry; everything below degrades to empty
        # containers when the guard or a metric is missing/mis-typed.
        spectral = None
        for g in report.get("guards", []):
            if isinstance(g, dict) and g.get("name") == "spectral":
                spectral = g
                break
        metrics = spectral.get("metrics", {}) if isinstance(spectral, dict) else {}
        selection = (
            metrics.get("multiple_testing_selection")
            if isinstance(metrics.get("multiple_testing_selection"), dict)
            else {}
        )
        fam_z_summary = (
            metrics.get("family_z_summary")
            if isinstance(metrics.get("family_z_summary"), dict)
            else {}
        )
        candidate_counts = (
            selection.get("family_violation_counts")
            if isinstance(selection.get("family_violation_counts"), dict)
            else {}
        )
        selected_families = selection.get("families_selected")
        selected_set = (
            {str(x) for x in selected_families}
            if isinstance(selected_families, list)
            else set()
        )
        # Count actual violations per family (post-selection).
        selected_by_family: dict[str, int] = defaultdict(int)
        violations = (
            spectral.get("violations", []) if isinstance(spectral, dict) else []
        )
        if isinstance(violations, list):
            for v in violations:
                if isinstance(v, dict) and v.get("family") is not None:
                    selected_by_family[str(v.get("family"))] += 1
        caps_applied = metrics.get("caps_applied")
        try:
            caps_applied = int(caps_applied) if caps_applied is not None else 0
        except Exception:
            caps_applied = 0
        row: dict[str, Any] = {
            "tier": spec.tier,
            "seed": spec.seed,
            "caps_applied": caps_applied,
            "caps_exceeded": bool(metrics.get("caps_exceeded", False)),
            "selected_families": ",".join(sorted(selected_set)),
        }
        for fam, vals in fam_z_summary.items():
            if not isinstance(vals, dict):
                continue
            max_z = vals.get("max")
            try:
                # `max_z == max_z` filters NaN without importing math here.
                if max_z is not None and max_z == max_z:
                    row[f"max_z_{fam}"] = float(max_z)
            except Exception:
                continue
        for fam, count in candidate_counts.items():
            try:
                row[f"candidate_{fam}"] = int(count)
            except Exception:
                continue
        for fam, count in selected_by_family.items():
            row[f"selected_{fam}"] = int(count)
        run_rows.append(row)

    # Summarize per tier and build the tiers.yaml-shaped recommendation patch.
    summaries: dict[str, Any] = {}
    tiers_patch: dict[str, dict[str, Any]] = {}
    for tier_name, tier_reports in sorted(reports_by_tier.items()):
        summary = summarize_null_sweep_reports(
            tier_reports,
            tier=tier_name,
            safety_margin=safety_margin,
            target_any_warning_rate=target_any_warning_rate,
        )
        summaries[tier_name] = summary
        rec = summary.get("recommendations", {}) if isinstance(summary, dict) else {}
        spectral_patch = {
            "spectral_guard": {
                "family_caps": (rec.get("family_caps") or {}),
                "multiple_testing": (rec.get("multiple_testing") or {}),
            }
        }
        tiers_patch[tier_name] = spectral_patch

    # Timezone-aware replacement for the deprecated datetime.utcnow().
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    payload = {
        "kind": "spectral_null_sweep",
        "generated_at": stamp,
        "config": {
            "base_config": str(config),
            "profile": str(profile),
            "tiers": sorted(reports_by_tier.keys()),
            "n_runs": int(sum(len(v) for v in reports_by_tier.values())),
        },
        "summaries": summaries,
    }

    _dump_json(out / "null_sweep_report.json", payload)
    _dump_csv(out / "null_sweep_runs.csv", run_rows)
    _write_tiers_recommendation(
        out / "tiers_patch_spectral_null.yaml", recommendations=tiers_patch
    )

    md_lines = [
        "# Spectral null-sweep calibration",
        "",
        f"- Generated: `{stamp}`",
        f"- Base config: `{config}`",
        "",
        "## Recommendations (tiers.yaml patch)",
        f"- `{out / 'tiers_patch_spectral_null.yaml'}`",
        "",
        "## Summary",
        "",
        "| Tier | Runs | Any-warning rate | α (recommended) |",
        "|---|---:|---:|---:|",
    ]
    for tier_name, summary in sorted(summaries.items()):
        obs = summary.get("observed", {}) if isinstance(summary, dict) else {}
        rec = summary.get("recommendations", {}) if isinstance(summary, dict) else {}
        mt = rec.get("multiple_testing", {}) if isinstance(rec, dict) else {}
        # `or 0.0` guards against explicit None values, which would make
        # float()/format raise TypeError and abort report generation.
        md_lines.append(
            f"| {tier_name} | {summary.get('n_runs', 0)} | {float(obs.get('any_warning_rate') or 0.0):.3f} | {float(mt.get('alpha') or 0.0):.6f} |"
        )
    _dump_markdown(out / "null_sweep_summary.md", "\n".join(md_lines))

    console.print(f"[green]✅ Wrote null sweep artifacts under {out}[/green]")
324
+
325
+
326
@calibrate_app.command(
    name="ve-sweep",
    help="Run VE predictive-gate sweeps and recommend min_effect_lognll per tier.",
)
def ve_sweep(
    config: Path = typer.Option(
        Path("configs/calibration/rmt_ve_sweep_ci.yaml"),
        "--config",
        exists=True,
        dir_okay=False,
        readable=True,
        help="Base VE sweep YAML (quant_rtn edit).",
    ),
    out: Path = typer.Option(
        Path("reports/calibration/ve_sweep"),
        "--out",
        help="Output directory for calibration artifacts.",
    ),
    tiers: list[str] = typer.Option(
        None,
        "--tier",
        help="Tier(s) to evaluate (repeatable). Defaults to all tiers.",
    ),
    seed: list[int] = typer.Option(
        None,
        "--seed",
        help="Seed(s) to run (repeatable). Overrides --n-seeds/--seed-start.",
    ),
    n_seeds: int = typer.Option(10, "--n-seeds", min=1, help="Number of seeds to run."),
    seed_start: int = typer.Option(42, "--seed-start", help="Starting seed."),
    window: list[int] = typer.Option(
        None,
        "--window",
        help="Variance calibration window counts (repeatable). Defaults to 6, 8, 12, 16.",
    ),
    target_enable_rate: float = typer.Option(
        0.05,
        "--target-enable-rate",
        help="Target expected VE enable rate (predictive-gate lower bound).",
    ),
    profile: str = typer.Option("ci", "--profile", help="Run profile (ci|release)."),
    device: str | None = typer.Option(None, "--device", help="Device override."),
    safety_margin: float = typer.Option(
        0.0,
        "--safety-margin",
        help="Safety margin applied to min_effect recommendations.",
    ),
) -> None:
    """Run the VE predictive-gate sweep and write calibration artifacts.

    For every (tier, window, seed) combination this executes a full run via
    ``run_command``, collects the variance guard's predictive-gate metrics,
    then emits:

    - ``ve_sweep_report.json``  — per-tier summaries + power curve
    - ``ve_sweep_runs.csv``     — one row per run (delta CI, enablement)
    - ``ve_power_curve.csv``    — mean CI width per (tier, windows)
    - ``tiers_patch_variance_ve.yaml`` — recommended tiers.yaml patch
    - ``ve_sweep_summary.md``   — human-readable summary table

    Runs whose report path cannot be resolved are skipped silently
    (best-effort sweep semantics).
    """
    # Keep import light: only pull run machinery when invoked.
    from .run import run_command
    from datetime import timezone  # local: datetime.utcnow() is deprecated (3.12+)

    base = _load_yaml(config)
    windows = [int(w) for w in (window or [])] or [6, 8, 12, 16]
    specs = _materialize_sweep_specs(
        tiers=tiers,
        seeds=seed,
        n_seeds=n_seeds,
        seed_start=seed_start,
        windows=windows,
    )
    specs = sorted(specs, key=lambda s: (s.tier, int(s.windows or 0), s.seed))

    run_rows: list[dict[str, Any]] = []
    reports_by_tier: dict[str, list[dict[str, Any]]] = defaultdict(list)
    reports_by_tier_window: dict[tuple[str, int], list[dict[str, Any]]] = defaultdict(
        list
    )

    run_root = out / "runs"
    cfg_root = out / "configs"
    cfg_root.mkdir(parents=True, exist_ok=True)

    for spec in specs:
        win = int(spec.windows or 0)
        cfg = json.loads(json.dumps(base))  # safe deep copy without yaml anchors
        cfg.setdefault("dataset", {})["seed"] = int(spec.seed)
        cfg.setdefault("auto", {})["tier"] = spec.tier
        # Keep edit deterministic when it supports a seed knob.
        plan = cfg.setdefault("edit", {}).setdefault("plan", {})
        if isinstance(plan, dict) and "seed" in plan:
            plan["seed"] = int(spec.seed)

        # Override variance calibration windows and ensure min_coverage is feasible.
        gv = cfg.setdefault("guards", {}).setdefault("variance", {})
        if not isinstance(gv, dict):
            gv = {}
            cfg["guards"]["variance"] = gv
        calib = gv.setdefault("calibration", {})
        if not isinstance(calib, dict):
            calib = {}
            gv["calibration"] = calib
        calib["windows"] = int(win)
        calib["seed"] = int(spec.seed)
        # Leave two windows of slack but never drop below 1.
        # (Equivalent to the former max(1, min(win, win - 2)): for ints,
        # min(win, win - 2) is always win - 2.)
        calib["min_coverage"] = max(1, win - 2)

        run_out = run_root / spec.tier / f"windows_{win}" / f"seed_{spec.seed}"
        cfg.setdefault("output", {})["dir"] = str(run_out)
        cfg_path = cfg_root / f"ve_{spec.tier}_w{win}_{spec.seed}.yaml"
        cfg_path.write_text(yaml.safe_dump(cfg, sort_keys=False), encoding="utf-8")

        report_path = run_command(
            config=str(cfg_path),
            device=device,
            profile=profile,
            out=str(run_out),
            tier=spec.tier,
        )
        if not isinstance(report_path, str):
            # Run did not produce a locatable report; skip it (best-effort).
            continue
        report = json.loads(Path(report_path).read_text(encoding="utf-8"))
        reports_by_tier[spec.tier].append(report)
        reports_by_tier_window[(spec.tier, win)].append(report)

        # Locate the variance guard entry; everything below degrades to empty
        # containers when the guard or a metric is missing/mis-typed.
        variance = None
        for g in report.get("guards", []):
            if isinstance(g, dict) and g.get("name") == "variance":
                variance = g
                break
        metrics = variance.get("metrics", {}) if isinstance(variance, dict) else {}
        pg = (
            metrics.get("predictive_gate", {})
            if isinstance(metrics.get("predictive_gate"), dict)
            else {}
        )
        delta_ci = pg.get("delta_ci")
        try:
            ci_width = (
                float(delta_ci[1]) - float(delta_ci[0])
                if isinstance(delta_ci, tuple | list) and len(delta_ci) == 2
                else None
            )
        except Exception:
            ci_width = None
        run_rows.append(
            {
                "tier": spec.tier,
                "seed": spec.seed,
                "windows": win,
                "predictive_evaluated": bool(pg.get("evaluated", False)),
                "predictive_mean_delta": pg.get("mean_delta"),
                "predictive_delta_ci_lo": (
                    delta_ci[0]
                    if isinstance(delta_ci, tuple | list) and len(delta_ci) == 2
                    else None
                ),
                "predictive_delta_ci_hi": (
                    delta_ci[1]
                    if isinstance(delta_ci, tuple | list) and len(delta_ci) == 2
                    else None
                ),
                "predictive_ci_width": ci_width,
            }
        )

    # Per-tier recommendation using all runs (across window values).
    summaries: dict[str, Any] = {}
    tiers_patch: dict[str, dict[str, Any]] = {}
    for tier_name, tier_reports in sorted(reports_by_tier.items()):
        var_cfg = get_tier_guard_config(tier_name, "variance_guard")
        one_sided = bool(var_cfg.get("predictive_one_sided", True))
        summary = summarize_ve_sweep_reports(
            tier_reports,
            tier=tier_name,
            target_enable_rate=target_enable_rate,
            safety_margin=safety_margin,
            predictive_one_sided=one_sided,
        )
        summaries[tier_name] = summary
        rec = summary.get("recommendations", {}) if isinstance(summary, dict) else {}
        tiers_patch[tier_name] = {
            "variance_guard": {"min_effect_lognll": rec.get("min_effect_lognll")}
        }

    # Power curve: mean CI width per (tier, windows).
    power_curve: list[dict[str, Any]] = []
    for (tier_name, win), items in sorted(reports_by_tier_window.items()):
        widths: list[float] = []
        for rep in items:
            g = None
            for gg in rep.get("guards", []):
                if isinstance(gg, dict) and gg.get("name") == "variance":
                    g = gg
                    break
            metrics = g.get("metrics", {}) if isinstance(g, dict) else {}
            pg = (
                metrics.get("predictive_gate", {})
                if isinstance(metrics.get("predictive_gate"), dict)
                else {}
            )
            delta_ci = pg.get("delta_ci")
            if isinstance(delta_ci, tuple | list) and len(delta_ci) == 2:
                try:
                    widths.append(float(delta_ci[1]) - float(delta_ci[0]))
                except Exception:
                    continue
        power_curve.append(
            {
                "tier": tier_name,
                "windows": int(win),
                "runs": int(len(items)),
                "mean_ci_width": (sum(widths) / len(widths)) if widths else None,
            }
        )

    # Timezone-aware replacement for the deprecated datetime.utcnow().
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    payload = {
        "kind": "variance_ve_sweep",
        "generated_at": stamp,
        "config": {
            "base_config": str(config),
            "profile": str(profile),
            "tiers": sorted(reports_by_tier.keys()),
            "windows": windows,
            "n_runs": int(sum(len(v) for v in reports_by_tier.values())),
        },
        "summaries": summaries,
        "power_curve": power_curve,
    }

    _dump_json(out / "ve_sweep_report.json", payload)
    _dump_csv(out / "ve_sweep_runs.csv", run_rows)
    _dump_csv(out / "ve_power_curve.csv", power_curve)
    _write_tiers_recommendation(
        out / "tiers_patch_variance_ve.yaml", recommendations=tiers_patch
    )

    md_lines = [
        "# Variance (DD-VE) sweep calibration",
        "",
        f"- Generated: `{stamp}`",
        f"- Base config: `{config}`",
        "",
        "## Recommendations (tiers.yaml patch)",
        f"- `{out / 'tiers_patch_variance_ve.yaml'}`",
        "",
        "## Per-tier recommendation",
        "",
        "| Tier | Runs | Recommended min_effect_lognll | Expected enable rate |",
        "|---|---:|---:|---:|",
    ]
    for tier_name, summary in sorted(summaries.items()):
        rec = summary.get("recommendations", {}) if isinstance(summary, dict) else {}
        # `or 0.0` guards against explicit None values, which would make
        # float()/format raise TypeError and abort report generation.
        md_lines.append(
            f"| {tier_name} | {summary.get('n_runs', 0)} | {float(rec.get('min_effect_lognll') or 0.0):.6f} | {float(rec.get('expected_enable_rate') or 0.0):.3f} |"
        )
    _dump_markdown(out / "ve_sweep_summary.md", "\n".join(md_lines))

    console.print(f"[green]✅ Wrote VE sweep artifacts under {out}[/green]")
574
+
575
+
576
# Public API: only the Typer sub-app is exported from this module.
__all__ = ["calibrate_app"]
@@ -794,11 +794,17 @@ def doctor_command(
794
794
  try:
795
795
  import math as _math
796
796
 
797
- from invarlock.core.auto_tuning import TIER_POLICIES
797
+ from invarlock.core.auto_tuning import get_tier_policies
798
798
 
799
799
  use_tier = (tier or "balanced").lower()
800
- metrics_policy = TIER_POLICIES.get(use_tier, {}).get(
801
- "metrics", {}
800
+ tier_policies = get_tier_policies()
801
+ tier_defaults = tier_policies.get(
802
+ use_tier, tier_policies.get("balanced", {})
803
+ )
804
+ metrics_policy = (
805
+ tier_defaults.get("metrics", {})
806
+ if isinstance(tier_defaults, dict)
807
+ else {}
802
808
  )
803
809
  pm_policy = (
804
810
  metrics_policy.get("pm_ratio", {})
@@ -6,7 +6,7 @@ from pathlib import Path
6
6
  import typer
7
7
  from rich.console import Console
8
8
 
9
- from invarlock.core.auto_tuning import TIER_POLICIES
9
+ from invarlock.core.auto_tuning import get_tier_policies
10
10
  from invarlock.reporting.certificate import make_certificate
11
11
 
12
12
  console = Console()
@@ -49,15 +49,38 @@ def explain_gates_command(
49
49
  "aggressive": 1.20,
50
50
  "none": 1.10,
51
51
  }
52
- limit_base = tier_thresholds.get(tier, 1.10)
52
+ resolved_policy = (
53
+ cert.get("resolved_policy", {})
54
+ if isinstance(cert.get("resolved_policy"), dict)
55
+ else {}
56
+ )
53
57
  metrics_policy = (
54
- TIER_POLICIES.get(tier, {}).get("metrics", {}) if isinstance(tier, str) else {}
58
+ resolved_policy.get("metrics", {})
59
+ if isinstance(resolved_policy.get("metrics"), dict)
60
+ else {}
55
61
  )
62
+ if not metrics_policy:
63
+ tier_policies = get_tier_policies()
64
+ tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
65
+ metrics_policy = (
66
+ tier_defaults.get("metrics", {}) if isinstance(tier_defaults, dict) else {}
67
+ )
68
+ if not isinstance(metrics_policy, dict):
69
+ metrics_policy = {}
56
70
  pm_policy = (
57
- metrics_policy.get("pm_ratio", {}) if isinstance(metrics_policy, dict) else {}
71
+ metrics_policy.get("pm_ratio", {})
72
+ if isinstance(metrics_policy.get("pm_ratio"), dict)
73
+ else {}
58
74
  )
59
75
  hysteresis_ratio = float(pm_policy.get("hysteresis_ratio", 0.0))
60
76
  min_tokens = int(pm_policy.get("min_tokens", 0))
77
+ try:
78
+ limit_base = float(
79
+ pm_policy.get("ratio_limit_base", tier_thresholds.get(tier, 1.10))
80
+ or tier_thresholds.get(tier, 1.10)
81
+ )
82
+ except Exception:
83
+ limit_base = tier_thresholds.get(tier, 1.10)
61
84
  limit_with_hyst = limit_base + max(0.0, hysteresis_ratio)
62
85
  tokens_ok = True
63
86
  telem = cert.get("telemetry", {}) if isinstance(cert.get("telemetry"), dict) else {}
@@ -70,9 +93,16 @@ def explain_gates_command(
70
93
  tokens_ok = True
71
94
 
72
95
  # Primary-metric ratio gate explanation (ppl-like kinds shown as ratios)
73
- ppl = cert.get("ppl", {}) if isinstance(cert.get("ppl"), dict) else {}
74
- ratio = ppl.get("ratio_vs_baseline")
75
- ratio_ci = ppl.get("ratio_ci")
96
+ ratio = None
97
+ ratio_ci = None
98
+ if isinstance(cert.get("primary_metric"), dict):
99
+ pm = cert.get("primary_metric", {})
100
+ ratio = pm.get("ratio_vs_baseline")
101
+ ratio_ci = pm.get("display_ci")
102
+ elif isinstance(cert.get("ppl"), dict): # legacy
103
+ ppl = cert.get("ppl", {})
104
+ ratio = ppl.get("ratio_vs_baseline")
105
+ ratio_ci = ppl.get("ratio_ci")
76
106
  hysteresis_applied = bool(validation.get("hysteresis_applied"))
77
107
  status = "PASS" if bool(validation.get("primary_metric_acceptable")) else "FAIL"
78
108
  console.print("[bold]Gate: Primary Metric vs Baseline[/bold]")
@@ -109,8 +139,22 @@ def explain_gates_command(
109
139
  pass
110
140
 
111
141
  # Drift gate explanation
112
- drift = ppl.get("preview_final_ratio")
113
- drift_ci = ppl.get("drift_ci")
142
+ drift = None
143
+ drift_ci = None
144
+ if isinstance(cert.get("primary_metric"), dict):
145
+ pm = cert.get("primary_metric", {})
146
+ preview = pm.get("preview")
147
+ final = pm.get("final")
148
+ if isinstance(preview, int | float) and isinstance(final, int | float):
149
+ try:
150
+ if float(preview) != 0.0:
151
+ drift = float(final) / float(preview)
152
+ except Exception:
153
+ drift = None
154
+ if isinstance(cert.get("ppl"), dict): # legacy
155
+ ppl = cert.get("ppl", {})
156
+ drift = ppl.get("preview_final_ratio", drift)
157
+ drift_ci = ppl.get("drift_ci")
114
158
  drift_status = (
115
159
  "PASS" if bool(validation.get("preview_final_drift_acceptable")) else "FAIL"
116
160
  )