invarlock 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -500,7 +500,7 @@ class SpectralGuard(Guard):
                 if self.ignore_preview_inflation and phase == "after_edit":
                     continue

-                if z_score > kappa_cap:
+                if abs(z_score) > kappa_cap:
                     violations.append(
                         {
                             "type": "family_z_cap",
@@ -403,29 +403,36 @@ def _predictive_gate_outcome(
     ):
         return False, "ci_unavailable"

+    lower = float(delta_ci[0])
     upper = float(delta_ci[1])
     min_effect = float(min_effect or 0.0)

+    # CI must clear zero (and the min-effect band when provided).
     if one_sided:
-        # One-sided improvement (ΔlogNLL < 0): certify a minimum effect by
-        # requiring the *upper* bound of the (two-sided) CI to clear -min_effect.
         if upper >= 0.0:
             return False, "ci_contains_zero"
         if mean_delta >= 0.0:
             return False, "mean_not_negative"
-        gain_lower_bound = -upper  # worst-case gain under CI
-        if gain_lower_bound < min_effect:
+        if upper > -min_effect:
+            return False, "gain_below_threshold"
+        if mean_delta > -min_effect:
             return False, "gain_below_threshold"
         return True, "ci_gain_met"

-    # Two-sided improvement: CI must be strictly below zero.
-    if upper >= 0.0:
+    # Two-sided: detect regressions outside the +min_effect band, but only
+    # enable VE for negative improvements.
+    if lower <= 0.0 <= upper:
         return False, "ci_contains_zero"
-
-    gain_lower_bound = -upper  # Convert ΔlogNLL CI to gain CI lower bound.
-    if gain_lower_bound < min_effect:
+    if lower > 0.0:
+        if lower >= min_effect and mean_delta >= min_effect:
+            return False, "regression_detected"
+        return False, "mean_not_negative"
+    if upper > -min_effect:
+        return False, "gain_below_threshold"
+    if mean_delta >= 0.0:
+        return False, "mean_not_negative"
+    if mean_delta > -min_effect:
         return False, "gain_below_threshold"
-
     return True, "ci_gain_met"


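The rewritten gate distinguishes three positions of the paired ΔlogNLL confidence interval: straddling zero, entirely positive (a regression), and entirely negative (a candidate gain that must also clear the min_effect band). A standalone sketch of the two-sided branch, re-derived from the hunk above for illustration (not the packaged function itself):

def gate(lower: float, upper: float, mean_delta: float, min_effect: float = 0.0):
    """Illustrative reimplementation of the two-sided branch above."""
    if lower <= 0.0 <= upper:
        return False, "ci_contains_zero"
    if lower > 0.0:  # CI entirely positive: the edit made log-loss worse
        if lower >= min_effect and mean_delta >= min_effect:
            return False, "regression_detected"
        return False, "mean_not_negative"
    if upper > -min_effect:  # CI negative but too close to zero
        return False, "gain_below_threshold"
    if mean_delta >= 0.0:
        return False, "mean_not_negative"
    if mean_delta > -min_effect:
        return False, "gain_below_threshold"
    return True, "ci_gain_met"

print(gate(-0.05, -0.02, -0.03, min_effect=0.01))  # (True, "ci_gain_met")
print(gate(0.02, 0.05, 0.03, min_effect=0.01))     # (False, "regression_detected")
print(gate(-0.02, 0.01, -0.005))                   # (False, "ci_contains_zero")
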
@@ -1441,12 +1448,17 @@ class VarianceGuard(Guard):

         device = next(model.parameters()).device
         torch.manual_seed(calib_seed)
-        ppl_no_ve_samples, loss_no_ve_samples = self._compute_ppl_for_batches(
-            model, calibration_batches, device
+        (
+            ppl_no_ve_samples,
+            loss_no_ve_samples,
+            token_counts,
+        ) = self._compute_ppl_for_batches(
+            model, calibration_batches, device, return_counts=True
         )
         coverage = min(len(calibration_batches), len(ppl_no_ve_samples))
         ppl_with_ve_samples: list[float] = []
         loss_with_ve_samples: list[float] = []
+        token_counts_with: list[int] = []
         ratio_ci: tuple[float, float] | None = None

         enable_success = False
@@ -1462,10 +1474,12 @@ class VarianceGuard(Guard):
         try:
             torch.manual_seed(calib_seed)
             if enable_success:
-                ppl_with_ve_samples, loss_with_ve_samples = (
-                    self._compute_ppl_for_batches(
-                        model, calibration_batches, device
-                    )
+                (
+                    ppl_with_ve_samples,
+                    loss_with_ve_samples,
+                    token_counts_with,
+                ) = self._compute_ppl_for_batches(
+                    model, calibration_batches, device, return_counts=True
                 )
         finally:
             if enable_success:
@@ -1478,6 +1492,8 @@ class VarianceGuard(Guard):
                 coverage,
                 len(ppl_with_ve_samples) if ppl_with_ve_samples else coverage,
                 len(loss_with_ve_samples) if loss_with_ve_samples else coverage,
+                len(token_counts) if token_counts else coverage,
+                len(token_counts_with) if token_counts_with else coverage,
             )
             self._calibration_stats.update(
                 {
@@ -1546,6 +1562,7 @@ class VarianceGuard(Guard):
         loss_no_ve_samples = loss_no_ve_samples[:coverage]
         ppl_with_ve_samples = ppl_with_ve_samples[:coverage]
         loss_with_ve_samples = loss_with_ve_samples[:coverage]
+        token_counts = token_counts[:coverage]

         ratios = [
             with_val / no_val
@@ -1602,6 +1619,7 @@ class VarianceGuard(Guard):
             delta_ci = compute_paired_delta_log_ci(
                 loss_with_ve_samples,
                 loss_no_ve_samples,
+                weights=token_counts,
                 method="bca",
                 replicates=500,
                 alpha=self._policy.get("alpha", 0.05),
@@ -1617,18 +1635,31 @@ class VarianceGuard(Guard):
             )

             predictive_state["evaluated"] = True
-            mean_delta = float(
-                np.mean(
-                    [
-                        with_loss - no_loss
-                        for with_loss, no_loss in zip(
-                            loss_with_ve_samples,
-                            loss_no_ve_samples,
-                            strict=False,
-                        )
-                    ]
+            if token_counts:
+                sw = 0.0
+                swx = 0.0
+                for with_loss, no_loss, weight in zip(
+                    loss_with_ve_samples,
+                    loss_no_ve_samples,
+                    token_counts,
+                    strict=False,
+                ):
+                    sw += float(weight)
+                    swx += float(weight) * (with_loss - no_loss)
+                mean_delta = float(swx / sw) if sw > 0 else float("nan")
+            else:
+                mean_delta = float(
+                    np.mean(
+                        [
+                            with_loss - no_loss
+                            for with_loss, no_loss in zip(
+                                loss_with_ve_samples,
+                                loss_no_ve_samples,
+                                strict=False,
+                            )
+                        ]
+                    )
                 )
-            )
             predictive_state["mean_delta"] = mean_delta

             if delta_ci is not None and all(
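
The sw/swx accumulation is a token-weighted mean of the per-batch loss deltas, replacing the unweighted np.mean. A sketch showing its equivalence to numpy's weighted average (the sample losses and counts are illustrative):

import numpy as np

loss_with_ve = [2.10, 1.95, 2.30]
loss_no_ve = [2.00, 2.00, 2.20]
token_counts = [512, 1024, 256]

deltas = [w - n for w, n in zip(loss_with_ve, loss_no_ve)]
# Equivalent to the loop above: sum(weight * delta) / sum(weight).
mean_delta = float(np.average(deltas, weights=token_counts))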
@@ -1875,12 +1906,19 @@ class VarianceGuard(Guard):
         model: nn.Module,
         batches: list[Any],
         device: torch.device,
-    ) -> tuple[list[float], list[float]]:
+        *,
+        return_counts: bool = False,
+    ) -> tuple[list[float], list[float]] | tuple[list[float], list[float], list[int]]:
         """Compute per-batch perplexity and log-loss values for deterministic calibration."""
         ppl_values: list[float] = []
         loss_values: list[float] = []
+        token_counts: list[int] = []
         if not batches:
-            return ppl_values, loss_values
+            return (
+                (ppl_values, loss_values, token_counts)
+                if return_counts
+                else (ppl_values, loss_values)
+            )

         model_was_training = model.training
         model.eval()
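
The keyword-only return_counts flag lets new callers receive a third list of per-batch token counts while existing two-tuple call sites keep working. A usage sketch of the two call shapes (guard, model, batches, and device are placeholders, not package fixtures):

# Existing call sites keep the two-tuple shape:
ppls, losses = guard._compute_ppl_for_batches(model, batches, device)

# New call sites opt into token counts for weighting:
ppls, losses, token_counts = guard._compute_ppl_for_batches(
    model, batches, device, return_counts=True
)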
@@ -1919,12 +1957,29 @@ class VarianceGuard(Guard):
                 if math.isfinite(ppl):
                     ppl_values.append(ppl)
                     loss_values.append(loss)
+                    if return_counts:
+                        count = None
+                        try:
+                            if labels is not None and isinstance(
+                                labels, torch.Tensor
+                            ):
+                                count = int((labels != -100).sum().item())
+                        except Exception:
+                            count = None
+                        if count is None:
+                            try:
+                                count = int(inputs.numel())
+                            except Exception:
+                                count = 0
+                        token_counts.append(int(max(count, 0)))
             except Exception:
                 continue

         if model_was_training:
             model.train()

+        if return_counts:
+            return ppl_values, loss_values, token_counts
         return ppl_values, loss_values

     def _bootstrap_mean_ci(
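
Per-batch counts prefer the number of supervised positions, i.e. label entries that are not the -100 ignore index used by PyTorch's cross-entropy loss, and fall back to the raw element count of the inputs. A minimal sketch of that counting rule:

import torch

labels = torch.tensor([[-100, -100, 5, 9], [-100, 3, 7, -100]])
count = int((labels != -100).sum().item())  # 4 supervised tokens

inputs = torch.zeros(2, 4, dtype=torch.long)
fallback = int(inputs.numel())  # 8, used when labels are unavailable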
@@ -2111,12 +2166,17 @@ class VarianceGuard(Guard):
         if calibration_batches:
             device = next(model.parameters()).device
             torch.manual_seed(calib_seed)
-            ppl_no_ve_samples, loss_no_ve_samples = self._compute_ppl_for_batches(
-                model, calibration_batches, device
+            (
+                ppl_no_ve_samples,
+                loss_no_ve_samples,
+                token_counts,
+            ) = self._compute_ppl_for_batches(
+                model, calibration_batches, device, return_counts=True
             )
             coverage = min(len(calibration_batches), len(ppl_no_ve_samples))
             ppl_with_ve_samples: list[float] = []
             loss_with_ve_samples: list[float] = []
+            token_counts_with: list[int] = []
             ratio_ci: tuple[float, float] | None = None

             enable_success = False
@@ -2135,8 +2195,9 @@ class VarianceGuard(Guard):
                     (
                         ppl_with_ve_samples,
                         loss_with_ve_samples,
+                        token_counts_with,
                     ) = self._compute_ppl_for_batches(
-                        model, calibration_batches, device
+                        model, calibration_batches, device, return_counts=True
                     )
             finally:
                 if enable_success:
@@ -2149,6 +2210,8 @@ class VarianceGuard(Guard):
                     coverage,
                     len(ppl_with_ve_samples) if ppl_with_ve_samples else coverage,
                     len(loss_with_ve_samples) if loss_with_ve_samples else coverage,
+                    len(token_counts) if token_counts else coverage,
+                    len(token_counts_with) if token_counts_with else coverage,
                 )
                 self._calibration_stats.update(
                     {"coverage": coverage, "status": "insufficient"}
@@ -2181,6 +2244,8 @@ class VarianceGuard(Guard):
             loss_no_ve_samples = loss_no_ve_samples[:coverage]
             ppl_with_ve_samples = ppl_with_ve_samples[:coverage]
             loss_with_ve_samples = loss_with_ve_samples[:coverage]
+            token_counts = token_counts[:coverage]
+            token_counts_with = token_counts_with[:coverage]

             ratios = [
                 with_val / no_val
@@ -2219,6 +2284,7 @@ class VarianceGuard(Guard):
             delta_ci = compute_paired_delta_log_ci(
                 loss_with_ve_samples,
                 loss_no_ve_samples,
+                weights=token_counts,
                 method="bca",
                 replicates=500,
                 alpha=self._policy.get("alpha", 0.05),
@@ -2234,18 +2300,31 @@ class VarianceGuard(Guard):
             )

             predictive_state["evaluated"] = True
-            mean_delta = float(
-                np.mean(
-                    [
-                        with_loss - no_loss
-                        for with_loss, no_loss in zip(
-                            loss_with_ve_samples,
-                            loss_no_ve_samples,
-                            strict=False,
-                        )
-                    ]
+            if token_counts:
+                sw = 0.0
+                swx = 0.0
+                for with_loss, no_loss, weight in zip(
+                    loss_with_ve_samples,
+                    loss_no_ve_samples,
+                    token_counts,
+                    strict=False,
+                ):
+                    sw += float(weight)
+                    swx += float(weight) * (with_loss - no_loss)
+                mean_delta = float(swx / sw) if sw > 0 else float("nan")
+            else:
+                mean_delta = float(
+                    np.mean(
+                        [
+                            with_loss - no_loss
+                            for with_loss, no_loss in zip(
+                                loss_with_ve_samples,
+                                loss_no_ve_samples,
+                                strict=False,
+                            )
+                        ]
+                    )
                 )
-            )
             predictive_state["mean_delta"] = mean_delta

             if delta_ci is not None and all(
@@ -538,6 +538,188 @@ def _enforce_ratio_ci_alignment(
         )


+def _enforce_display_ci_alignment(
+    ratio_ci_source: str,
+    primary_metric: Any,
+    logloss_delta_ci: Any,
+    window_plan_profile: str | None,
+) -> None:
+    """Ensure display_ci matches exp(ci) for ppl-like metrics when paired."""
+    if ratio_ci_source != "paired_baseline":
+        return
+    if not isinstance(primary_metric, dict) or not primary_metric:
+        return
+    try:
+        kind = str(primary_metric.get("kind", "")).lower()
+    except Exception:
+        return
+    if not kind.startswith("ppl"):
+        return
+
+    def _finite_bounds(bounds: Any) -> bool:
+        return (
+            isinstance(bounds, tuple | list)
+            and len(bounds) == 2
+            and all(isinstance(v, int | float) and math.isfinite(v) for v in bounds)
+        )
+
+    ci = primary_metric.get("ci")
+    if not _finite_bounds(ci):
+        if _finite_bounds(logloss_delta_ci):
+            primary_metric["ci"] = (
+                float(logloss_delta_ci[0]),
+                float(logloss_delta_ci[1]),
+            )
+            ci = primary_metric["ci"]
+        else:
+            profile = (window_plan_profile or "dev").lower()
+            if profile in {"ci", "release"}:
+                raise ValueError(
+                    "primary_metric.ci missing for ppl-like metric under paired baseline."
+                )
+            return
+
+    expected = tuple(math.exp(float(bound)) for bound in ci)
+    display_ci = primary_metric.get("display_ci")
+    if not _finite_bounds(display_ci):
+        profile = (window_plan_profile or "dev").lower()
+        if profile in {"ci", "release"}:
+            raise ValueError(
+                "primary_metric.display_ci missing for ppl-like metric under paired baseline."
+            )
+        primary_metric["display_ci"] = [expected[0], expected[1]]
+        return
+
+    for observed, exp_val in zip(display_ci, expected, strict=False):
+        tolerance = 5e-4 * max(1.0, abs(exp_val))
+        if abs(float(observed) - float(exp_val)) > tolerance:
+            profile = (window_plan_profile or "dev").lower()
+            if profile in {"ci", "release"}:
+                raise ValueError(
+                    "primary_metric.display_ci mismatch: bounds do not match exp(ci)."
+                )
+            primary_metric["display_ci"] = [expected[0], expected[1]]
+            break
+
+
+def _enforce_pairing_and_coverage(
+    stats: dict[str, Any] | None,
+    window_plan_profile: str | None,
+    tier: str | None,
+) -> None:
+    """Enforce pairing and coverage contracts for CI/Release profiles."""
+    profile = (window_plan_profile or "dev").lower()
+    if profile not in {"ci", "release"}:
+        return
+    if not isinstance(stats, dict):
+        raise ValueError("Missing dataset window stats for CI/Release enforcement.")
+
+    pairing_reason = stats.get("window_pairing_reason")
+    if pairing_reason is not None:
+        raise ValueError(
+            "CI/Release requires paired baseline evidence "
+            f"(window_pairing_reason={pairing_reason!r})."
+        )
+
+    match_fraction = stats.get("window_match_fraction")
+    overlap_fraction = stats.get("window_overlap_fraction")
+    if not (
+        isinstance(match_fraction, (int | float))
+        and math.isfinite(float(match_fraction))
+    ):
+        raise ValueError("CI/Release requires window_match_fraction.")
+    if float(match_fraction) < 0.999999:
+        raise ValueError(
+            f"CI/Release requires perfect pairing (window_match_fraction={float(match_fraction):.6f})."
+        )
+
+    if not (
+        isinstance(overlap_fraction, (int | float))
+        and math.isfinite(float(overlap_fraction))
+    ):
+        raise ValueError("CI/Release requires window_overlap_fraction.")
+    if float(overlap_fraction) > 1e-9:
+        raise ValueError(
+            f"CI/Release requires non-overlapping windows (window_overlap_fraction={float(overlap_fraction):.6f})."
+        )
+
+    def _coerce_count(value: Any) -> int | None:
+        if value is None or isinstance(value, bool):
+            return None
+        try:
+            val = float(value)
+        except (TypeError, ValueError):
+            return None
+        if not math.isfinite(val) or val < 0:
+            return None
+        if abs(val - round(val)) > 1e-9:
+            return None
+        return int(round(val))
+
+    paired_windows = _coerce_count(stats.get("paired_windows"))
+    if paired_windows is None:
+        raise ValueError("CI/Release requires paired_windows metric.")
+    if paired_windows == 0:
+        raise ValueError("CI/Release requires paired_windows > 0.")
+
+    actual_preview = _coerce_count(stats.get("actual_preview"))
+    actual_final = _coerce_count(stats.get("actual_final"))
+    if actual_preview is None or actual_final is None:
+        coverage = stats.get("coverage")
+        if isinstance(coverage, dict):
+            if actual_preview is None:
+                actual_preview = _coerce_count(coverage.get("preview", {}).get("used"))
+            if actual_final is None:
+                actual_final = _coerce_count(coverage.get("final", {}).get("used"))
+
+    if actual_preview is None or actual_final is None:
+        raise ValueError("CI/Release requires preview/final window counts.")
+    if actual_preview != actual_final:
+        raise ValueError(
+            f"CI/Release requires matching preview/final counts "
+            f"(preview={actual_preview}, final={actual_final})."
+        )
+
+    from invarlock.core.runner import BOOTSTRAP_COVERAGE_REQUIREMENTS
+
+    tier_key = str(tier or "balanced").lower()
+    floors = BOOTSTRAP_COVERAGE_REQUIREMENTS.get(
+        tier_key, BOOTSTRAP_COVERAGE_REQUIREMENTS["balanced"]
+    )
+    preview_floor = int(floors.get("preview", 0))
+    final_floor = int(floors.get("final", 0))
+    replicates_floor = int(floors.get("replicates", 0))
+
+    coverage = stats.get("coverage")
+    if not isinstance(coverage, dict):
+        raise ValueError("CI/Release requires bootstrap coverage stats.")
+
+    preview_used = _coerce_count(coverage.get("preview", {}).get("used"))
+    final_used = _coerce_count(coverage.get("final", {}).get("used"))
+    replicates_used = _coerce_count(coverage.get("replicates", {}).get("used"))
+
+    if replicates_used is None:
+        bootstrap = stats.get("bootstrap")
+        if isinstance(bootstrap, dict):
+            replicates_used = _coerce_count(
+                bootstrap.get("replicates", bootstrap.get("n"))
+            )
+
+    if preview_used is None or final_used is None or replicates_used is None:
+        raise ValueError("CI/Release requires preview/final/replicates coverage stats.")
+
+    if preview_used < preview_floor or final_used < final_floor:
+        raise ValueError(
+            "CI/Release requires preview/final coverage at or above tier floors "
+            f"(preview={preview_used}/{preview_floor}, final={final_used}/{final_floor})."
+        )
+    if replicates_used < replicates_floor:
+        raise ValueError(
+            "CI/Release requires bootstrap replicates at or above tier floors "
+            f"(replicates={replicates_used}/{replicates_floor})."
+        )
+
+
 def _fallback_paired_windows(
     paired_windows: int, coverage_summary: dict[str, Any]
 ) -> int:
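
_enforce_display_ci_alignment leans on the identity that a paired ΔlogNLL confidence interval maps to a perplexity-ratio interval through exponentiation, since ppl = exp(mean log-loss). A sketch of the mapping and the relative tolerance used above (the CI bounds are illustrative):

import math

ci = (-0.051, -0.012)  # paired delta-log CI (illustrative)
display_ci = tuple(math.exp(b) for b in ci)  # ~(0.9503, 0.9881) as a ppl ratio

def within_tolerance(observed: float, expected: float) -> bool:
    # Mirrors the 5e-4 relative tolerance in the check above.
    return abs(observed - expected) <= 5e-4 * max(1.0, abs(expected))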
@@ -807,6 +989,47 @@ def make_certificate(
     if paired:
         paired_run, paired_base = paired
         paired_windows = len(paired_run)
+        paired_weights: list[float] | None = None
+        try:
+            run_ids = (
+                run_windows.get("window_ids") if isinstance(run_windows, dict) else None
+            )
+            run_w = (
+                run_windows.get("token_counts")
+                if isinstance(run_windows, dict)
+                else None
+            )
+            base_ids = (
+                baseline_windows.get("window_ids")
+                if isinstance(baseline_windows, dict)
+                else None
+            )
+            if (
+                isinstance(run_ids, list)
+                and isinstance(run_w, list)
+                and isinstance(base_ids, list)
+            ):
+                base_set = {
+                    int(b_id) for b_id in base_ids if isinstance(b_id, int | float)
+                }
+                weights: list[float] = []
+                for r_id, w in zip(run_ids, run_w, strict=False):
+                    if not isinstance(r_id, int | float):
+                        continue
+                    key = int(r_id)
+                    if key not in base_set:
+                        continue
+                    try:
+                        wv = float(w)
+                    except Exception:
+                        continue
+                    if not math.isfinite(wv):
+                        continue
+                    weights.append(float(max(wv, 0.0)))
+                if weights:
+                    paired_weights = weights
+        except Exception:  # pragma: no cover
+            paired_weights = None
         method = str(metrics_bootstrap.get("method", "percentile")).lower()
         replicates = int(
             metrics_bootstrap.get(
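
The paired-weights block intersects the run's window ids with the baseline's and keeps the run's token counts for the windows that survive pairing. A condensed sketch of the matching (the ids and counts are illustrative):

run_ids = [0, 1, 2, 3]
run_token_counts = [512, 480, 505, 490]
base_ids = [0, 2, 3]

base_set = {int(b) for b in base_ids}
paired_weights = [
    float(w) for r, w in zip(run_ids, run_token_counts) if int(r) in base_set
]
# [512.0, 505.0, 490.0]; window 1 has no baseline partner and is dropped.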
@@ -834,6 +1057,7 @@ def make_certificate(
         delta_ci = compute_paired_delta_log_ci(
             paired_run,
             paired_base,
+            weights=paired_weights,
             method=ci_method,
             replicates=replicates,
             alpha=alpha,
@@ -977,6 +1201,20 @@ def make_certificate(
     _enforce_ratio_ci_alignment(ratio_ci_source, ratio_ci, logloss_delta_ci)

     paired_windows = _fallback_paired_windows(paired_windows, coverage_summary)
+    # Prefer runner-reported paired window count when available (signal used for
+    # CI/Release enforcement); fall back to evidence-based pairing or coverage
+    # heuristics when the metric is missing.
+    try:
+        paired_windows_signal = (
+            report.get("metrics", {}).get("paired_windows")
+            if isinstance(report.get("metrics"), dict)
+            else None
+        )
+    except Exception:  # pragma: no cover
+        paired_windows_signal = None
+    paired_windows_signal_int = _coerce_int(paired_windows_signal)
+    if paired_windows_signal_int is not None and paired_windows_signal_int >= 0:
+        paired_windows = paired_windows_signal_int

     # Primary-metric stats for gating/summary (PM-only)
     try:
@@ -1156,13 +1394,13 @@ def make_certificate(
                 act_fin = req_fin

             if req_prev is not None:
-                stats_obj.setdefault("requested_preview", req_prev)
+                stats_obj["requested_preview"] = req_prev
             if req_fin is not None:
-                stats_obj.setdefault("requested_final", req_fin)
+                stats_obj["requested_final"] = req_fin
             if act_prev is not None:
-                stats_obj.setdefault("actual_preview", act_prev)
+                stats_obj["actual_preview"] = act_prev
             if act_fin is not None:
-                stats_obj.setdefault("actual_final", act_fin)
+                stats_obj["actual_final"] = act_fin

             if "coverage_ok" not in stats_obj:
                 if (
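
Switching from setdefault to plain assignment means the freshly derived counts now overwrite any values already present in stats_obj instead of being discarded when the key exists. A two-line illustration of the difference:

stats_obj = {"actual_preview": 10}          # stale value from an earlier pass
stats_obj.setdefault("actual_preview", 12)  # old behavior: 10 stays
stats_obj["actual_preview"] = 12            # new behavior: overwritten with 12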
@@ -1177,6 +1415,12 @@ def make_certificate(
         except Exception:  # pragma: no cover
             pass

+    _enforce_pairing_and_coverage(
+        ppl_analysis.get("stats", {}),
+        window_plan_profile,
+        auto.get("tier", "balanced"),
+    )
+
     if isinstance(window_plan_ctx, dict):
         ppl_analysis["window_plan"] = window_plan_ctx

@@ -1895,6 +2139,12 @@ def make_certificate(
     from .primary_metric_utils import attach_primary_metric as _attach_pm

     _attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
+    _enforce_display_ci_alignment(
+        ratio_ci_source,
+        certificate.get("primary_metric"),
+        logloss_delta_ci,
+        window_plan_profile,
+    )

     # Ensure primary_metric has display_ci populated for schema invariants
     try:
@@ -2492,8 +2742,8 @@ def _prepare_guard_overhead_section(
             {
                 "overhead_ratio": metrics.get("overhead_ratio"),
                 "overhead_percent": metrics.get("overhead_percent"),
-                "bare_final": metrics.get("bare_final"),
-                "guarded_final": metrics.get("guarded_final"),
+                "bare_ppl": metrics.get("bare_ppl"),
+                "guarded_ppl": metrics.get("guarded_ppl"),
                 "messages": list(result.messages),
                 "warnings": list(result.warnings),
                 "errors": list(result.errors),
@@ -2505,12 +2755,8 @@ def _prepare_guard_overhead_section(
         return sanitized, bool(result.passed)

     # Fall back to direct ratio computation when reports are not provided
-    bare_ppl = _coerce_float(payload.get("bare_final")) or _coerce_float(
-        payload.get("bare_ppl")
-    )
-    guarded_ppl = _coerce_float(payload.get("guarded_final")) or _coerce_float(
-        payload.get("guarded_ppl")
-    )
+    bare_ppl = _coerce_float(payload.get("bare_ppl"))
+    guarded_ppl = _coerce_float(payload.get("guarded_ppl"))
     ratio = _coerce_float(payload.get("overhead_ratio"))

     if ratio is None and bare_ppl is not None and guarded_ppl is not None:
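
With the legacy bare_final/guarded_final aliases removed, the fallback path reads only the canonical bare_ppl/guarded_ppl keys and derives the ratio when it is absent. A sketch of that derivation, assuming the overhead ratio is guarded over bare (the payload values are illustrative):

payload = {"bare_ppl": 12.40, "guarded_ppl": 12.65}

bare_ppl = payload.get("bare_ppl")
guarded_ppl = payload.get("guarded_ppl")
overhead_ratio = guarded_ppl / bare_ppl if bare_ppl else None  # ~1.0202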
@@ -194,6 +194,9 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
         "window_pairing_reason",
         "window_pairing_preview",
         "window_pairing_final",
+        "window_plan",
+        "window_capacity",
+        "stats",
         "total_tokens",
         "preview_total_tokens",
         "final_total_tokens",
@@ -579,9 +579,7 @@ def _extract_policy_overrides(report: RunReport) -> list[str]:


 def _compute_policy_digest(policy: dict[str, Any]) -> str:
-    canonical = json.dumps(
-        policy, sort_keys=True, default=str, separators=(",", ":"), ensure_ascii=True
-    )
+    canonical = json.dumps(policy, sort_keys=True, default=str)
     return hashlib.sha256(canonical.encode()).hexdigest()[:16]

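
Dropping the compact separators and ensure_ascii flags changes the canonical JSON string, so 0.3.4 yields a different policy digest than 0.3.2 for the same policy. A sketch of the two canonical forms (the policy dict is illustrative):

import hashlib
import json

policy = {"alpha": 0.05, "tier": "balanced"}

old_canonical = json.dumps(
    policy, sort_keys=True, default=str, separators=(",", ":"), ensure_ascii=True
)
new_canonical = json.dumps(policy, sort_keys=True, default=str)

print(old_canonical)  # {"alpha":0.05,"tier":"balanced"}
print(new_canonical)  # {"alpha": 0.05, "tier": "balanced"}
# Different bytes, hence different sha256-based digests.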