invarlock 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. invarlock/__init__.py +1 -1
  2. invarlock/_data/runtime/tiers.yaml +61 -0
  3. invarlock/adapters/hf_loading.py +97 -0
  4. invarlock/calibration/__init__.py +6 -0
  5. invarlock/calibration/spectral_null.py +301 -0
  6. invarlock/calibration/variance_ve.py +154 -0
  7. invarlock/cli/app.py +15 -0
  8. invarlock/cli/commands/calibrate.py +576 -0
  9. invarlock/cli/commands/doctor.py +9 -3
  10. invarlock/cli/commands/explain_gates.py +53 -9
  11. invarlock/cli/commands/plugins.py +12 -2
  12. invarlock/cli/commands/run.py +181 -79
  13. invarlock/cli/commands/verify.py +40 -0
  14. invarlock/cli/config.py +11 -1
  15. invarlock/cli/determinism.py +252 -0
  16. invarlock/core/auto_tuning.py +215 -17
  17. invarlock/core/bootstrap.py +137 -5
  18. invarlock/core/registry.py +9 -4
  19. invarlock/core/runner.py +305 -35
  20. invarlock/eval/bench.py +467 -141
  21. invarlock/eval/bench_regression.py +12 -0
  22. invarlock/eval/bootstrap.py +3 -1
  23. invarlock/eval/data.py +29 -7
  24. invarlock/eval/primary_metric.py +20 -5
  25. invarlock/guards/rmt.py +536 -46
  26. invarlock/guards/spectral.py +217 -10
  27. invarlock/guards/variance.py +124 -42
  28. invarlock/reporting/certificate.py +476 -45
  29. invarlock/reporting/certificate_schema.py +4 -1
  30. invarlock/reporting/guards_analysis.py +108 -10
  31. invarlock/reporting/normalizer.py +24 -1
  32. invarlock/reporting/policy_utils.py +97 -15
  33. invarlock/reporting/primary_metric_utils.py +17 -0
  34. invarlock/reporting/validate.py +10 -10
  35. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/METADATA +12 -10
  36. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/RECORD +40 -33
  37. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/WHEEL +0 -0
  38. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/entry_points.txt +0 -0
  39. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/licenses/LICENSE +0 -0
  40. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/top_level.txt +0 -0
@@ -29,7 +29,7 @@ try: # pragma: no cover - exercised in integration
29
29
  except Exception: # pragma: no cover
30
30
  jsonschema = None # type: ignore
31
31
 
32
- from invarlock.core.auto_tuning import TIER_POLICIES
32
+ from invarlock.core.auto_tuning import get_tier_policies
33
33
  from invarlock.core.bootstrap import (
34
34
  compute_paired_delta_log_ci,
35
35
  logspace_to_ratio_ci,
@@ -538,6 +538,175 @@ def _enforce_ratio_ci_alignment(
538
538
  )
539
539
 
540
540
 
541
+ def _enforce_display_ci_alignment(
542
+ ratio_ci_source: str,
543
+ primary_metric: Any,
544
+ logloss_delta_ci: Any,
545
+ window_plan_profile: str | None,
546
+ ) -> None:
547
+ """Ensure display_ci matches exp(ci) for ppl-like metrics when paired."""
548
+ if ratio_ci_source != "paired_baseline":
549
+ return
550
+ if not isinstance(primary_metric, dict) or not primary_metric:
551
+ return
552
+ try:
553
+ kind = str(primary_metric.get("kind", "")).lower()
554
+ except Exception:
555
+ return
556
+ if not kind.startswith("ppl"):
557
+ return
558
+
559
+ def _finite_bounds(bounds: Any) -> bool:
560
+ return (
561
+ isinstance(bounds, tuple | list)
562
+ and len(bounds) == 2
563
+ and all(isinstance(v, int | float) and math.isfinite(v) for v in bounds)
564
+ )
565
+
566
+ ci = primary_metric.get("ci")
567
+ if not _finite_bounds(ci):
568
+ if _finite_bounds(logloss_delta_ci):
569
+ primary_metric["ci"] = (
570
+ float(logloss_delta_ci[0]),
571
+ float(logloss_delta_ci[1]),
572
+ )
573
+ ci = primary_metric["ci"]
574
+ else:
575
+ profile = (window_plan_profile or "dev").lower()
576
+ if profile in {"ci", "release"}:
577
+ raise ValueError(
578
+ "primary_metric.ci missing for ppl-like metric under paired baseline."
579
+ )
580
+ return
581
+
582
+ expected = tuple(math.exp(float(bound)) for bound in ci)
583
+ display_ci = primary_metric.get("display_ci")
584
+ if not _finite_bounds(display_ci):
585
+ profile = (window_plan_profile or "dev").lower()
586
+ if profile in {"ci", "release"}:
587
+ raise ValueError(
588
+ "primary_metric.display_ci missing for ppl-like metric under paired baseline."
589
+ )
590
+ primary_metric["display_ci"] = [expected[0], expected[1]]
591
+ return
592
+
593
+ for observed, exp_val in zip(display_ci, expected, strict=False):
594
+ tolerance = 5e-4 * max(1.0, abs(exp_val))
595
+ if abs(float(observed) - float(exp_val)) > tolerance:
596
+ profile = (window_plan_profile or "dev").lower()
597
+ if profile in {"ci", "release"}:
598
+ raise ValueError(
599
+ "primary_metric.display_ci mismatch: bounds do not match exp(ci)."
600
+ )
601
+ primary_metric["display_ci"] = [expected[0], expected[1]]
602
+ break
603
+
604
+
605
+ def _enforce_pairing_and_coverage(
606
+ stats: dict[str, Any] | None,
607
+ window_plan_profile: str | None,
608
+ tier: str | None,
609
+ ) -> None:
610
+ """Enforce pairing and coverage contracts for CI/Release profiles."""
611
+ profile = (window_plan_profile or "dev").lower()
612
+ if profile not in {"ci", "release"}:
613
+ return
614
+ if not isinstance(stats, dict):
615
+ raise ValueError("Missing dataset window stats for CI/Release enforcement.")
616
+
617
+ match_fraction = stats.get("window_match_fraction")
618
+ overlap_fraction = stats.get("window_overlap_fraction")
619
+ if not (
620
+ isinstance(match_fraction, (int | float))
621
+ and math.isfinite(float(match_fraction))
622
+ ):
623
+ raise ValueError("CI/Release requires window_match_fraction.")
624
+ if float(match_fraction) < 0.999999:
625
+ raise ValueError(
626
+ f"CI/Release requires perfect pairing (window_match_fraction={float(match_fraction):.6f})."
627
+ )
628
+
629
+ if not (
630
+ isinstance(overlap_fraction, (int | float))
631
+ and math.isfinite(float(overlap_fraction))
632
+ ):
633
+ raise ValueError("CI/Release requires window_overlap_fraction.")
634
+ if float(overlap_fraction) > 1e-9:
635
+ raise ValueError(
636
+ f"CI/Release requires non-overlapping windows (window_overlap_fraction={float(overlap_fraction):.6f})."
637
+ )
638
+
639
+ def _coerce_count(value: Any) -> int | None:
640
+ if value is None or isinstance(value, bool):
641
+ return None
642
+ try:
643
+ val = float(value)
644
+ except (TypeError, ValueError):
645
+ return None
646
+ if not math.isfinite(val) or val < 0:
647
+ return None
648
+ if abs(val - round(val)) > 1e-9:
649
+ return None
650
+ return int(round(val))
651
+
652
+ actual_preview = _coerce_count(stats.get("actual_preview"))
653
+ actual_final = _coerce_count(stats.get("actual_final"))
654
+ if actual_preview is None or actual_final is None:
655
+ coverage = stats.get("coverage")
656
+ if isinstance(coverage, dict):
657
+ if actual_preview is None:
658
+ actual_preview = _coerce_count(coverage.get("preview", {}).get("used"))
659
+ if actual_final is None:
660
+ actual_final = _coerce_count(coverage.get("final", {}).get("used"))
661
+
662
+ if actual_preview is None or actual_final is None:
663
+ raise ValueError("CI/Release requires preview/final window counts.")
664
+ if actual_preview != actual_final:
665
+ raise ValueError(
666
+ f"CI/Release requires matching preview/final counts "
667
+ f"(preview={actual_preview}, final={actual_final})."
668
+ )
669
+
670
+ from invarlock.core.runner import BOOTSTRAP_COVERAGE_REQUIREMENTS
671
+
672
+ tier_key = str(tier or "balanced").lower()
673
+ floors = BOOTSTRAP_COVERAGE_REQUIREMENTS.get(
674
+ tier_key, BOOTSTRAP_COVERAGE_REQUIREMENTS["balanced"]
675
+ )
676
+ preview_floor = int(floors.get("preview", 0))
677
+ final_floor = int(floors.get("final", 0))
678
+ replicates_floor = int(floors.get("replicates", 0))
679
+
680
+ coverage = stats.get("coverage")
681
+ if not isinstance(coverage, dict):
682
+ raise ValueError("CI/Release requires bootstrap coverage stats.")
683
+
684
+ preview_used = _coerce_count(coverage.get("preview", {}).get("used"))
685
+ final_used = _coerce_count(coverage.get("final", {}).get("used"))
686
+ replicates_used = _coerce_count(coverage.get("replicates", {}).get("used"))
687
+
688
+ if replicates_used is None:
689
+ bootstrap = stats.get("bootstrap")
690
+ if isinstance(bootstrap, dict):
691
+ replicates_used = _coerce_count(
692
+ bootstrap.get("replicates", bootstrap.get("n"))
693
+ )
694
+
695
+ if preview_used is None or final_used is None or replicates_used is None:
696
+ raise ValueError("CI/Release requires preview/final/replicates coverage stats.")
697
+
698
+ if preview_used < preview_floor or final_used < final_floor:
699
+ raise ValueError(
700
+ "CI/Release requires preview/final coverage at or above tier floors "
701
+ f"(preview={preview_used}/{preview_floor}, final={final_used}/{final_floor})."
702
+ )
703
+ if replicates_used < replicates_floor:
704
+ raise ValueError(
705
+ "CI/Release requires bootstrap replicates at or above tier floors "
706
+ f"(replicates={replicates_used}/{replicates_floor})."
707
+ )
708
+
709
+
541
710
  def _fallback_paired_windows(
542
711
  paired_windows: int, coverage_summary: dict[str, Any]
543
712
  ) -> int:
@@ -598,6 +767,18 @@ def make_certificate(
598
767
  except Exception: # pragma: no cover
599
768
  pass
600
769
 
770
+ # Determinism preset (CI/Release provenance) when present.
771
+ try:
772
+ det = (
773
+ report.get("meta", {}).get("determinism")
774
+ if isinstance(report.get("meta"), dict)
775
+ else None
776
+ )
777
+ if isinstance(det, dict) and det:
778
+ meta["determinism"] = det
779
+ except Exception: # pragma: no cover
780
+ pass
781
+
601
782
  tokenizer_hash_meta = report["meta"].get("tokenizer_hash")
602
783
  if not tokenizer_hash_meta:
603
784
  dataset_section = report.get("data", {})
@@ -627,6 +808,13 @@ def make_certificate(
627
808
 
628
809
  # Extract dataset configuration and compute hashes
629
810
  dataset_info = _extract_dataset_info(report)
811
+ try:
812
+ if isinstance(dataset_info, dict):
813
+ windows = dataset_info.get("windows")
814
+ if isinstance(windows, dict):
815
+ windows.setdefault("stats", {})
816
+ except Exception: # pragma: no cover
817
+ pass
630
818
 
631
819
  # Baseline reference (PM-only). Derive a primary_metric snapshot from baseline windows.
632
820
  # Prefer explicit baseline primary_metric when provided; otherwise compute from windows
@@ -741,15 +929,17 @@ def make_certificate(
741
929
  tier = str(auto_cfg.get("tier")).lower()
742
930
  except Exception: # pragma: no cover
743
931
  pass
932
+ tier_policies = get_tier_policies()
933
+ tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
744
934
  metrics_policy = (
745
- TIER_POLICIES.get(tier, {}).get("metrics", {})
746
- if isinstance(tier, str)
747
- else {}
935
+ tier_defaults.get("metrics", {}) if isinstance(tier_defaults, dict) else {}
748
936
  )
749
- ppl_policy = (
750
- metrics_policy.get("ppl", {}) if isinstance(metrics_policy, dict) else {}
937
+ pm_policy = (
938
+ metrics_policy.get("pm_ratio", {})
939
+ if isinstance(metrics_policy, dict)
940
+ else {}
751
941
  )
752
- min_tokens = int(ppl_policy.get("min_tokens", 0))
942
+ min_tokens = int(pm_policy.get("min_tokens", 0))
753
943
  if (
754
944
  isinstance(total_tokens, int)
755
945
  and min_tokens > 0
@@ -786,6 +976,47 @@ def make_certificate(
786
976
  if paired:
787
977
  paired_run, paired_base = paired
788
978
  paired_windows = len(paired_run)
979
+ paired_weights: list[float] | None = None
980
+ try:
981
+ run_ids = (
982
+ run_windows.get("window_ids") if isinstance(run_windows, dict) else None
983
+ )
984
+ run_w = (
985
+ run_windows.get("token_counts")
986
+ if isinstance(run_windows, dict)
987
+ else None
988
+ )
989
+ base_ids = (
990
+ baseline_windows.get("window_ids")
991
+ if isinstance(baseline_windows, dict)
992
+ else None
993
+ )
994
+ if (
995
+ isinstance(run_ids, list)
996
+ and isinstance(run_w, list)
997
+ and isinstance(base_ids, list)
998
+ ):
999
+ base_set = {
1000
+ int(b_id) for b_id in base_ids if isinstance(b_id, int | float)
1001
+ }
1002
+ weights: list[float] = []
1003
+ for r_id, w in zip(run_ids, run_w, strict=False):
1004
+ if not isinstance(r_id, int | float):
1005
+ continue
1006
+ key = int(r_id)
1007
+ if key not in base_set:
1008
+ continue
1009
+ try:
1010
+ wv = float(w)
1011
+ except Exception:
1012
+ continue
1013
+ if not math.isfinite(wv):
1014
+ continue
1015
+ weights.append(float(max(wv, 0.0)))
1016
+ if weights:
1017
+ paired_weights = weights
1018
+ except Exception: # pragma: no cover
1019
+ paired_weights = None
789
1020
  method = str(metrics_bootstrap.get("method", "percentile")).lower()
790
1021
  replicates = int(
791
1022
  metrics_bootstrap.get(
@@ -813,6 +1044,7 @@ def make_certificate(
813
1044
  delta_ci = compute_paired_delta_log_ci(
814
1045
  paired_run,
815
1046
  paired_base,
1047
+ weights=paired_weights,
816
1048
  method=ci_method,
817
1049
  replicates=replicates,
818
1050
  alpha=alpha,
@@ -1053,6 +1285,115 @@ def make_certificate(
1053
1285
  if key in metrics_stats_source:
1054
1286
  ppl_analysis["stats"][key] = metrics_stats_source[key]
1055
1287
 
1288
+ # Derive requested/actual window counts for auditability when runners do not
1289
+ # emit a metrics.stats block (normalization may also drop it).
1290
+ try:
1291
+ stats_obj = ppl_analysis.get("stats", {})
1292
+ if isinstance(stats_obj, dict):
1293
+
1294
+ def _as_count(value: Any) -> int | None:
1295
+ if value is None or isinstance(value, bool):
1296
+ return None
1297
+ if isinstance(value, int):
1298
+ return int(value) if value >= 0 else None
1299
+ if isinstance(value, float) and math.isfinite(value):
1300
+ if abs(value - round(value)) > 1e-9 or value < 0:
1301
+ return None
1302
+ return int(round(value))
1303
+ return None
1304
+
1305
+ data_cfg = report.get("data", {}) if isinstance(report, dict) else {}
1306
+ data_cfg = data_cfg if isinstance(data_cfg, dict) else {}
1307
+ windows_cfg = (
1308
+ dataset_info.get("windows", {})
1309
+ if isinstance(dataset_info, dict)
1310
+ else {}
1311
+ )
1312
+ windows_cfg = windows_cfg if isinstance(windows_cfg, dict) else {}
1313
+
1314
+ req_prev = _as_count(stats_obj.get("requested_preview"))
1315
+ if req_prev is None:
1316
+ req_prev = _as_count(data_cfg.get("preview_n"))
1317
+ if req_prev is None:
1318
+ req_prev = _as_count(windows_cfg.get("preview"))
1319
+
1320
+ req_fin = _as_count(stats_obj.get("requested_final"))
1321
+ if req_fin is None:
1322
+ req_fin = _as_count(data_cfg.get("final_n"))
1323
+ if req_fin is None:
1324
+ req_fin = _as_count(windows_cfg.get("final"))
1325
+
1326
+ eval_windows = (
1327
+ report.get("evaluation_windows", {}) if isinstance(report, dict) else {}
1328
+ )
1329
+ eval_windows = eval_windows if isinstance(eval_windows, dict) else {}
1330
+
1331
+ def _len_ids(section: Any) -> int | None:
1332
+ if not isinstance(section, dict):
1333
+ return None
1334
+ ids = section.get("window_ids")
1335
+ if isinstance(ids, list):
1336
+ return int(len(ids))
1337
+ return None
1338
+
1339
+ act_prev = _as_count(stats_obj.get("actual_preview"))
1340
+ if act_prev is None:
1341
+ act_prev = _len_ids(eval_windows.get("preview"))
1342
+ if act_prev is None:
1343
+ cov_prev = (
1344
+ coverage_summary.get("preview")
1345
+ if isinstance(coverage_summary, dict)
1346
+ else None
1347
+ )
1348
+ if isinstance(cov_prev, dict):
1349
+ act_prev = _as_count(cov_prev.get("used"))
1350
+ if act_prev is None:
1351
+ act_prev = req_prev
1352
+
1353
+ act_fin = _as_count(stats_obj.get("actual_final"))
1354
+ if act_fin is None:
1355
+ act_fin = _len_ids(eval_windows.get("final"))
1356
+ if act_fin is None:
1357
+ cov_fin = (
1358
+ coverage_summary.get("final")
1359
+ if isinstance(coverage_summary, dict)
1360
+ else None
1361
+ )
1362
+ if isinstance(cov_fin, dict):
1363
+ act_fin = _as_count(cov_fin.get("used"))
1364
+ elif isinstance(coverage_summary, dict):
1365
+ act_fin = _as_count(coverage_summary.get("used"))
1366
+ if act_fin is None:
1367
+ act_fin = req_fin
1368
+
1369
+ if req_prev is not None:
1370
+ stats_obj["requested_preview"] = req_prev
1371
+ if req_fin is not None:
1372
+ stats_obj["requested_final"] = req_fin
1373
+ if act_prev is not None:
1374
+ stats_obj["actual_preview"] = act_prev
1375
+ if act_fin is not None:
1376
+ stats_obj["actual_final"] = act_fin
1377
+
1378
+ if "coverage_ok" not in stats_obj:
1379
+ if (
1380
+ isinstance(req_prev, int)
1381
+ and isinstance(req_fin, int)
1382
+ and isinstance(act_prev, int)
1383
+ and isinstance(act_fin, int)
1384
+ ):
1385
+ stats_obj["coverage_ok"] = (act_prev >= req_prev) and (
1386
+ act_fin >= req_fin
1387
+ )
1388
+ except Exception: # pragma: no cover
1389
+ pass
1390
+
1391
+ _enforce_pairing_and_coverage(
1392
+ ppl_analysis.get("stats", {}),
1393
+ window_plan_profile,
1394
+ auto.get("tier", "balanced"),
1395
+ )
1396
+
1056
1397
  if isinstance(window_plan_ctx, dict):
1057
1398
  ppl_analysis["window_plan"] = window_plan_ctx
1058
1399
 
@@ -1102,17 +1443,62 @@ def make_certificate(
1102
1443
  if variance_policy_digest:
1103
1444
  policies["variance"]["policy_digest"] = variance_policy_digest
1104
1445
 
1446
+ # Resolve tier/profile policy (canonical) and merge observed guard policies.
1447
+ profile = None
1448
+ explicit_overrides = None
1449
+ try:
1450
+ ctx = report.get("context") if isinstance(report, dict) else None
1451
+ if isinstance(ctx, dict) and ctx.get("profile"):
1452
+ profile = str(ctx.get("profile"))
1453
+ except Exception:
1454
+ profile = None
1455
+ try:
1456
+ window_plan = (
1457
+ report.get("metrics", {}).get("window_plan")
1458
+ if isinstance(report.get("metrics"), dict)
1459
+ else None
1460
+ )
1461
+ if (
1462
+ profile is None
1463
+ and isinstance(window_plan, dict)
1464
+ and window_plan.get("profile")
1465
+ ):
1466
+ profile = str(window_plan.get("profile"))
1467
+ except Exception:
1468
+ profile = None
1469
+ try:
1470
+ meta_cfg = (
1471
+ report.get("meta", {}).get("config")
1472
+ if isinstance(report.get("meta"), dict)
1473
+ else None
1474
+ )
1475
+ if isinstance(meta_cfg, dict) and isinstance(meta_cfg.get("guards"), dict):
1476
+ explicit_overrides = meta_cfg.get("guards")
1477
+ if explicit_overrides is None and isinstance(report.get("config"), dict):
1478
+ cfg2 = report.get("config")
1479
+ if isinstance(cfg2.get("guards"), dict):
1480
+ explicit_overrides = cfg2.get("guards")
1481
+ except Exception:
1482
+ explicit_overrides = None
1483
+
1105
1484
  resolved_policy = _build_resolved_policies(
1106
- auto.get("tier", "balanced"), spectral, rmt, variance
1485
+ auto.get("tier", "balanced"),
1486
+ spectral,
1487
+ rmt,
1488
+ variance,
1489
+ profile=profile,
1490
+ explicit_overrides=explicit_overrides,
1491
+ )
1492
+ overrides_list = _extract_policy_overrides(report)
1493
+ resolved_digest = _compute_policy_digest(
1494
+ {"resolved_policy": resolved_policy, "overrides": overrides_list}
1107
1495
  )
1108
- resolved_digest = _compute_policy_digest(resolved_policy)
1109
- policy_digest_value = variance_policy_digest or resolved_digest
1110
1496
  policy_provenance = {
1111
1497
  "tier": auto.get("tier", "balanced"),
1112
- "overrides": _extract_policy_overrides(report),
1113
- "policy_digest": policy_digest_value,
1498
+ "overrides": overrides_list,
1499
+ "policy_digest": resolved_digest,
1114
1500
  }
1115
- auto["policy_digest"] = policy_digest_value
1501
+ auto["policy_digest"] = resolved_digest
1116
1502
 
1117
1503
  for guard_name in ("spectral", "rmt", "variance"):
1118
1504
  if guard_name in resolved_policy:
@@ -1473,16 +1859,17 @@ def make_certificate(
1473
1859
  or (baseline_hash != thresholds_hash)
1474
1860
  )
1475
1861
 
1476
- # Hysteresis knobs snapshot
1477
- try:
1478
- metrics_policy = TIER_POLICIES.get(cur_tier, {}).get("metrics", {})
1479
- except Exception: # pragma: no cover
1862
+ # Hysteresis knobs snapshot (policy-resolved)
1863
+ metrics_policy = (
1864
+ resolved_policy.get("metrics", {}) if isinstance(resolved_policy, dict) else {}
1865
+ )
1866
+ if not isinstance(metrics_policy, dict):
1480
1867
  metrics_policy = {}
1481
1868
  ppl_hys = 0.0
1482
1869
  acc_hys = 0.0
1483
1870
  try:
1484
1871
  ppl_hys = float(
1485
- (metrics_policy.get("ppl") or {}).get("hysteresis_ratio", 0.0) or 0.0
1872
+ (metrics_policy.get("pm_ratio") or {}).get("hysteresis_ratio", 0.0) or 0.0
1486
1873
  )
1487
1874
  acc_hys = float(
1488
1875
  (metrics_policy.get("accuracy") or {}).get("hysteresis_delta_pp", 0.0)
@@ -1725,6 +2112,12 @@ def make_certificate(
1725
2112
  from .primary_metric_utils import attach_primary_metric as _attach_pm
1726
2113
 
1727
2114
  _attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
2115
+ _enforce_display_ci_alignment(
2116
+ ratio_ci_source,
2117
+ certificate.get("primary_metric"),
2118
+ logloss_delta_ci,
2119
+ window_plan_profile,
2120
+ )
1728
2121
 
1729
2122
  # Ensure primary_metric has display_ci populated for schema invariants
1730
2123
  try:
@@ -2204,11 +2597,24 @@ def _format_epsilon_map(epsilon_map: Any) -> dict[str, float]:
2204
2597
 
2205
2598
 
2206
2599
  def _build_resolved_policies(
2207
- tier: str, spectral: dict[str, Any], rmt: dict[str, Any], variance: dict[str, Any]
2600
+ tier: str,
2601
+ spectral: dict[str, Any],
2602
+ rmt: dict[str, Any],
2603
+ variance: dict[str, Any],
2604
+ *,
2605
+ profile: str | None = None,
2606
+ explicit_overrides: dict[str, dict[str, Any]] | None = None,
2208
2607
  ) -> dict[str, Any]:
2209
2608
  from .policy_utils import _build_resolved_policies as _impl
2210
2609
 
2211
- return _impl(tier, spectral, rmt, variance)
2610
+ return _impl(
2611
+ tier,
2612
+ spectral,
2613
+ rmt,
2614
+ variance,
2615
+ profile=profile,
2616
+ explicit_overrides=explicit_overrides,
2617
+ )
2212
2618
 
2213
2619
 
2214
2620
  def _compute_policy_digest(policy: dict[str, Any]) -> str:
@@ -2279,6 +2685,23 @@ def _prepare_guard_overhead_section(
2279
2685
  "threshold_percent": threshold * 100,
2280
2686
  "source": str(payload.get("source", "report")),
2281
2687
  }
2688
+ try:
2689
+ mode = payload.get("mode")
2690
+ if mode is None:
2691
+ mode = payload.get("guard_overhead_mode")
2692
+ if isinstance(mode, str) and mode.strip():
2693
+ sanitized["mode"] = mode.strip()
2694
+ except Exception:
2695
+ pass
2696
+ try:
2697
+ skipped = bool(payload.get("skipped", False))
2698
+ if skipped:
2699
+ sanitized["skipped"] = True
2700
+ reason = payload.get("skip_reason")
2701
+ if isinstance(reason, str) and reason.strip():
2702
+ sanitized["skip_reason"] = reason.strip()
2703
+ except Exception:
2704
+ pass
2282
2705
 
2283
2706
  # Prefer structured reports and reuse the validator when available
2284
2707
  bare_report = payload.pop("bare_report", None)
@@ -2292,8 +2715,8 @@ def _prepare_guard_overhead_section(
2292
2715
  {
2293
2716
  "overhead_ratio": metrics.get("overhead_ratio"),
2294
2717
  "overhead_percent": metrics.get("overhead_percent"),
2295
- "bare_final": metrics.get("bare_final"),
2296
- "guarded_final": metrics.get("guarded_final"),
2718
+ "bare_ppl": metrics.get("bare_ppl"),
2719
+ "guarded_ppl": metrics.get("guarded_ppl"),
2297
2720
  "messages": list(result.messages),
2298
2721
  "warnings": list(result.warnings),
2299
2722
  "errors": list(result.errors),
@@ -2305,12 +2728,8 @@ def _prepare_guard_overhead_section(
2305
2728
  return sanitized, bool(result.passed)
2306
2729
 
2307
2730
  # Fall back to direct ratio computation when reports are not provided
2308
- bare_ppl = _coerce_float(payload.get("bare_final")) or _coerce_float(
2309
- payload.get("bare_ppl")
2310
- )
2311
- guarded_ppl = _coerce_float(payload.get("guarded_final")) or _coerce_float(
2312
- payload.get("guarded_ppl")
2313
- )
2731
+ bare_ppl = _coerce_float(payload.get("bare_ppl"))
2732
+ guarded_ppl = _coerce_float(payload.get("guarded_ppl"))
2314
2733
  ratio = _coerce_float(payload.get("overhead_ratio"))
2315
2734
 
2316
2735
  if ratio is None and bare_ppl is not None and guarded_ppl is not None:
@@ -2449,6 +2868,12 @@ def _propagate_pairing_stats(
2449
2868
  coverage = pa_stats.get("coverage")
2450
2869
  if isinstance(coverage, dict) and coverage:
2451
2870
  stats["coverage"] = coverage
2871
+ bootstrap = pa_stats.get("bootstrap")
2872
+ if isinstance(bootstrap, dict) and bootstrap:
2873
+ stats["bootstrap"] = bootstrap
2874
+ paired_delta_summary = pa_stats.get("paired_delta_summary")
2875
+ if isinstance(paired_delta_summary, dict) and paired_delta_summary:
2876
+ stats["paired_delta_summary"] = paired_delta_summary
2452
2877
  wmf = pa_stats.get("window_match_fraction")
2453
2878
  if wmf is not None:
2454
2879
  stats["window_match_fraction"] = wmf
@@ -2674,12 +3099,31 @@ def _compute_validation_flags(
2674
3099
  }
2675
3100
  if _tiny_relax:
2676
3101
  tier = "aggressive"
3102
+
2677
3103
  tier_thresholds = {
2678
3104
  "conservative": 1.05,
2679
3105
  "balanced": 1.10,
2680
3106
  "aggressive": 1.20,
2681
3107
  "none": 1.10,
2682
3108
  }
3109
+ tier_policies = get_tier_policies()
3110
+ tier_policy = tier_policies.get(tier, tier_policies.get("balanced", {}))
3111
+ metrics_policy = (
3112
+ tier_policy.get("metrics", {}) if isinstance(tier_policy, dict) else {}
3113
+ )
3114
+ pm_policy = (
3115
+ metrics_policy.get("pm_ratio", {}) if isinstance(metrics_policy, dict) else {}
3116
+ )
3117
+ ratio_limit_base = pm_policy.get("ratio_limit_base")
3118
+ try:
3119
+ if ratio_limit_base is not None:
3120
+ ratio_limit_base = float(ratio_limit_base)
3121
+ except Exception:
3122
+ ratio_limit_base = None
3123
+ if not isinstance(ratio_limit_base, (int | float)) or not math.isfinite(
3124
+ float(ratio_limit_base)
3125
+ ):
3126
+ ratio_limit_base = float(tier_thresholds.get(tier, 1.10))
2683
3127
  acceptance = pm_acceptance_range if isinstance(pm_acceptance_range, dict) else {}
2684
3128
  ratio_min_bound = None
2685
3129
  ratio_max_bound = None
@@ -2697,7 +3141,7 @@ def _compute_validation_flags(
2697
3141
  ratio_limit = (
2698
3142
  ratio_max_bound
2699
3143
  if isinstance(ratio_max_bound, (int | float)) and math.isfinite(ratio_max_bound)
2700
- else tier_thresholds.get(tier, 1.10)
3144
+ else float(ratio_limit_base)
2701
3145
  )
2702
3146
  if isinstance(target_ratio, int | float) and target_ratio > 0:
2703
3147
  ratio_limit = min(ratio_limit, float(target_ratio))
@@ -2726,13 +3170,6 @@ def _compute_validation_flags(
2726
3170
  except Exception: # pragma: no cover
2727
3171
  pass
2728
3172
  # Hysteresis and sample-size floors from tier policies
2729
- tier_policy = TIER_POLICIES.get(tier, {}) if isinstance(tier, str) else {}
2730
- metrics_policy = (
2731
- tier_policy.get("metrics", {}) if isinstance(tier_policy, dict) else {}
2732
- )
2733
- pm_policy = (
2734
- metrics_policy.get("pm_ratio", {}) if isinstance(metrics_policy, dict) else {}
2735
- )
2736
3173
  hysteresis_ratio = float(pm_policy.get("hysteresis_ratio", 0.0))
2737
3174
  min_tokens = int(pm_policy.get("min_tokens", 0))
2738
3175
  # Evaluate sample-size sufficiency
@@ -2804,7 +3241,9 @@ def _compute_validation_flags(
2804
3241
  summary = spectral.get("summary", {}) if isinstance(spectral, dict) else {}
2805
3242
  max_caps = spectral.get("max_caps") or summary.get("max_caps")
2806
3243
  if max_caps is None:
2807
- default_spectral = TIER_POLICIES.get(tier, {}).get("spectral", {})
3244
+ default_spectral = (
3245
+ tier_policy.get("spectral", {}) if isinstance(tier_policy, dict) else {}
3246
+ )
2808
3247
  max_caps = default_spectral.get("max_caps", 5)
2809
3248
  spectral_stable = spectral.get("caps_applied", 0) <= int(max_caps)
2810
3249
  if spectral.get("caps_exceeded"):
@@ -2871,14 +3310,6 @@ def _compute_validation_flags(
2871
3310
  flags["primary_metric_acceptable"] = bool(ok)
2872
3311
  elif kind in {"accuracy", "vqa_accuracy"}:
2873
3312
  # Read thresholds from tier policy if available
2874
- tier_policy = (
2875
- TIER_POLICIES.get(tier, {}) if isinstance(tier, str) else {}
2876
- )
2877
- metrics_policy = (
2878
- tier_policy.get("metrics", {})
2879
- if isinstance(tier_policy, dict)
2880
- else {}
2881
- )
2882
3313
  acc_policy = (
2883
3314
  metrics_policy.get("accuracy", {})
2884
3315
  if isinstance(metrics_policy, dict)