invarlock 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -500,7 +500,7 @@ class SpectralGuard(Guard):
             if self.ignore_preview_inflation and phase == "after_edit":
                 continue

-            if z_score > kappa_cap:
+            if abs(z_score) > kappa_cap:
                 violations.append(
                     {
                         "type": "family_z_cap",
@@ -403,29 +403,36 @@ def _predictive_gate_outcome(
     ):
         return False, "ci_unavailable"

+    lower = float(delta_ci[0])
     upper = float(delta_ci[1])
     min_effect = float(min_effect or 0.0)

+    # CI must clear zero (and the min-effect band when provided).
     if one_sided:
-        # One-sided improvement (ΔlogNLL < 0): certify a minimum effect by
-        # requiring the *upper* bound of the (two-sided) CI to clear -min_effect.
         if upper >= 0.0:
             return False, "ci_contains_zero"
         if mean_delta >= 0.0:
             return False, "mean_not_negative"
-        gain_lower_bound = -upper  # worst-case gain under CI
-        if gain_lower_bound < min_effect:
+        if upper > -min_effect:
+            return False, "gain_below_threshold"
+        if mean_delta > -min_effect:
             return False, "gain_below_threshold"
         return True, "ci_gain_met"

-    # Two-sided improvement: CI must be strictly below zero.
-    if upper >= 0.0:
+    # Two-sided: detect regressions outside the +min_effect band, but only
+    # enable VE for negative improvements.
+    if lower <= 0.0 <= upper:
         return False, "ci_contains_zero"
-
-    gain_lower_bound = -upper  # Convert ΔlogNLL CI to gain CI lower bound.
-    if gain_lower_bound < min_effect:
+    if lower > 0.0:
+        if lower >= min_effect and mean_delta >= min_effect:
+            return False, "regression_detected"
+        return False, "mean_not_negative"
+    if upper > -min_effect:
+        return False, "gain_below_threshold"
+    if mean_delta >= 0.0:
+        return False, "mean_not_negative"
+    if mean_delta > -min_effect:
         return False, "gain_below_threshold"
-
     return True, "ci_gain_met"

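The rewritten gate works on both CI bounds instead of deriving a `gain_lower_bound` from the upper bound alone, which lets it distinguish a regression (whole CI above zero) from an inconclusive result. A condensed, self-contained sketch of the new two-sided branch, assuming the diff's convention that deltas are ΔlogNLL (improvements negative) and the `(passed, reason)` return shape shown above; this stand-alone function is an illustration, not the shipped code:

```python
# Condensed sketch of the two-sided branch; variable names follow the hunk.
def gate(lower: float, upper: float, mean_delta: float, min_effect: float):
    if lower <= 0.0 <= upper:
        return False, "ci_contains_zero"      # CI straddles zero: inconclusive
    if lower > 0.0:                           # entire CI above zero: a regression
        if lower >= min_effect and mean_delta >= min_effect:
            return False, "regression_detected"
        return False, "mean_not_negative"
    if upper > -min_effect:                   # CI does not clear the min-effect band
        return False, "gain_below_threshold"
    if mean_delta >= 0.0:
        return False, "mean_not_negative"
    if mean_delta > -min_effect:
        return False, "gain_below_threshold"
    return True, "ci_gain_met"

assert gate(-0.08, -0.03, -0.05, 0.02) == (True, "ci_gain_met")
assert gate(-0.01, 0.01, 0.00, 0.02) == (False, "ci_contains_zero")
assert gate(0.03, 0.09, 0.06, 0.02) == (False, "regression_detected")
```
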
@@ -1441,12 +1448,17 @@ class VarianceGuard(Guard):

         device = next(model.parameters()).device
         torch.manual_seed(calib_seed)
-        ppl_no_ve_samples, loss_no_ve_samples = self._compute_ppl_for_batches(
-            model, calibration_batches, device
+        (
+            ppl_no_ve_samples,
+            loss_no_ve_samples,
+            token_counts,
+        ) = self._compute_ppl_for_batches(
+            model, calibration_batches, device, return_counts=True
         )
         coverage = min(len(calibration_batches), len(ppl_no_ve_samples))
         ppl_with_ve_samples: list[float] = []
         loss_with_ve_samples: list[float] = []
+        token_counts_with: list[int] = []
         ratio_ci: tuple[float, float] | None = None

         enable_success = False
@@ -1462,10 +1474,12 @@ class VarianceGuard(Guard):
         try:
             torch.manual_seed(calib_seed)
             if enable_success:
-                ppl_with_ve_samples, loss_with_ve_samples = (
-                    self._compute_ppl_for_batches(
-                        model, calibration_batches, device
-                    )
+                (
+                    ppl_with_ve_samples,
+                    loss_with_ve_samples,
+                    token_counts_with,
+                ) = self._compute_ppl_for_batches(
+                    model, calibration_batches, device, return_counts=True
                 )
         finally:
             if enable_success:
@@ -1478,6 +1492,8 @@ class VarianceGuard(Guard):
                 coverage,
                 len(ppl_with_ve_samples) if ppl_with_ve_samples else coverage,
                 len(loss_with_ve_samples) if loss_with_ve_samples else coverage,
+                len(token_counts) if token_counts else coverage,
+                len(token_counts_with) if token_counts_with else coverage,
             )
             self._calibration_stats.update(
                 {
@@ -1546,6 +1562,7 @@ class VarianceGuard(Guard):
         loss_no_ve_samples = loss_no_ve_samples[:coverage]
         ppl_with_ve_samples = ppl_with_ve_samples[:coverage]
         loss_with_ve_samples = loss_with_ve_samples[:coverage]
+        token_counts = token_counts[:coverage]

         ratios = [
             with_val / no_val
@@ -1602,6 +1619,7 @@ class VarianceGuard(Guard):
         delta_ci = compute_paired_delta_log_ci(
             loss_with_ve_samples,
             loss_no_ve_samples,
+            weights=token_counts,
             method="bca",
             replicates=500,
             alpha=self._policy.get("alpha", 0.05),
@@ -1617,18 +1635,31 @@ class VarianceGuard(Guard):
         )

         predictive_state["evaluated"] = True
-        mean_delta = float(
-            np.mean(
-                [
-                    with_loss - no_loss
-                    for with_loss, no_loss in zip(
-                        loss_with_ve_samples,
-                        loss_no_ve_samples,
-                        strict=False,
-                    )
-                ]
+        if token_counts:
+            sw = 0.0
+            swx = 0.0
+            for with_loss, no_loss, weight in zip(
+                loss_with_ve_samples,
+                loss_no_ve_samples,
+                token_counts,
+                strict=False,
+            ):
+                sw += float(weight)
+                swx += float(weight) * (with_loss - no_loss)
+            mean_delta = float(swx / sw) if sw > 0 else float("nan")
+        else:
+            mean_delta = float(
+                np.mean(
+                    [
+                        with_loss - no_loss
+                        for with_loss, no_loss in zip(
+                            loss_with_ve_samples,
+                            loss_no_ve_samples,
+                            strict=False,
+                        )
+                    ]
+                )
             )
-        )
         predictive_state["mean_delta"] = mean_delta

         if delta_ci is not None and all(
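
The new `mean_delta` is token-weighted rather than a flat average over batches, so long windows contribute in proportion to their token count. A pure-Python sketch of the same accumulation loop with toy numbers:

```python
# Token-weighted mean of per-batch loss deltas, mirroring the loop above.
loss_with = [2.10, 2.05, 2.20]      # per-batch log-loss with VE enabled (toy values)
loss_without = [2.12, 2.04, 2.30]   # per-batch log-loss without VE
token_counts = [512, 128, 1024]     # tokens per batch

sw = swx = 0.0
for with_loss, no_loss, weight in zip(loss_with, loss_without, token_counts):
    sw += float(weight)
    swx += float(weight) * (with_loss - no_loss)
mean_delta = swx / sw if sw > 0 else float("nan")
print(mean_delta)  # the 1024-token batch dominates; a flat mean over batches would not weight it
```
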
@@ -1875,12 +1906,19 @@ class VarianceGuard(Guard):
         model: nn.Module,
         batches: list[Any],
         device: torch.device,
-    ) -> tuple[list[float], list[float]]:
+        *,
+        return_counts: bool = False,
+    ) -> tuple[list[float], list[float]] | tuple[list[float], list[float], list[int]]:
         """Compute per-batch perplexity and log-loss values for deterministic calibration."""
         ppl_values: list[float] = []
         loss_values: list[float] = []
+        token_counts: list[int] = []
         if not batches:
-            return ppl_values, loss_values
+            return (
+                (ppl_values, loss_values, token_counts)
+                if return_counts
+                else (ppl_values, loss_values)
+            )

         model_was_training = model.training
         model.eval()
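
The keyword-only `return_counts` flag keeps every existing two-tuple call site valid while new callers opt into the third element. A minimal, self-contained sketch of that return-arity pattern (the body here is a placeholder, not the real per-batch computation):

```python
# Backward-compatible widening of a return tuple behind a keyword-only flag.
def compute(values: list[float], *, return_counts: bool = False):
    ppl = [2.0 ** v for v in values]   # placeholder transform
    counts = [1 for _ in values]       # placeholder per-item counts
    if return_counts:
        return values, ppl, counts
    return values, ppl

loss, ppl = compute([0.5, 1.0])                               # old call sites unchanged
loss, ppl, counts = compute([0.5, 1.0], return_counts=True)   # new three-tuple callers
```
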
@@ -1919,12 +1957,29 @@ class VarianceGuard(Guard):
                 if math.isfinite(ppl):
                     ppl_values.append(ppl)
                     loss_values.append(loss)
+                    if return_counts:
+                        count = None
+                        try:
+                            if labels is not None and isinstance(
+                                labels, torch.Tensor
+                            ):
+                                count = int((labels != -100).sum().item())
+                        except Exception:
+                            count = None
+                        if count is None:
+                            try:
+                                count = int(inputs.numel())
+                            except Exception:
+                                count = 0
+                        token_counts.append(int(max(count, 0)))
             except Exception:
                 continue

         if model_was_training:
             model.train()

+        if return_counts:
+            return ppl_values, loss_values, token_counts
         return ppl_values, loss_values

     def _bootstrap_mean_ci(
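
The count falls back from supervised-label positions to raw input size: Hugging Face-style causal-LM batches mark ignored positions with label `-100`, so `(labels != -100).sum()` counts only the tokens that actually contribute to the loss. A toy-tensor sketch (assumes `torch` is installed):

```python
import torch

# Two sequences, four positions each; -100 marks positions excluded from the loss.
labels = torch.tensor([[-100, -100, 17, 42], [5, 9, -100, -100]])
count = int((labels != -100).sum().item())
print(count)  # 4 supervised tokens; inputs.numel() would have said 8
```
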
@@ -2111,12 +2166,17 @@ class VarianceGuard(Guard):
         if calibration_batches:
             device = next(model.parameters()).device
             torch.manual_seed(calib_seed)
-            ppl_no_ve_samples, loss_no_ve_samples = self._compute_ppl_for_batches(
-                model, calibration_batches, device
+            (
+                ppl_no_ve_samples,
+                loss_no_ve_samples,
+                token_counts,
+            ) = self._compute_ppl_for_batches(
+                model, calibration_batches, device, return_counts=True
             )
             coverage = min(len(calibration_batches), len(ppl_no_ve_samples))
             ppl_with_ve_samples: list[float] = []
             loss_with_ve_samples: list[float] = []
+            token_counts_with: list[int] = []
             ratio_ci: tuple[float, float] | None = None

             enable_success = False
@@ -2135,8 +2195,9 @@ class VarianceGuard(Guard):
                     (
                         ppl_with_ve_samples,
                         loss_with_ve_samples,
+                        token_counts_with,
                     ) = self._compute_ppl_for_batches(
-                        model, calibration_batches, device
+                        model, calibration_batches, device, return_counts=True
                     )
             finally:
                 if enable_success:
@@ -2149,6 +2210,8 @@ class VarianceGuard(Guard):
                 coverage,
                 len(ppl_with_ve_samples) if ppl_with_ve_samples else coverage,
                 len(loss_with_ve_samples) if loss_with_ve_samples else coverage,
+                len(token_counts) if token_counts else coverage,
+                len(token_counts_with) if token_counts_with else coverage,
             )
             self._calibration_stats.update(
                 {"coverage": coverage, "status": "insufficient"}
@@ -2181,6 +2244,8 @@ class VarianceGuard(Guard):
             loss_no_ve_samples = loss_no_ve_samples[:coverage]
             ppl_with_ve_samples = ppl_with_ve_samples[:coverage]
             loss_with_ve_samples = loss_with_ve_samples[:coverage]
+            token_counts = token_counts[:coverage]
+            token_counts_with = token_counts_with[:coverage]

             ratios = [
                 with_val / no_val
@@ -2219,6 +2284,7 @@ class VarianceGuard(Guard):
             delta_ci = compute_paired_delta_log_ci(
                 loss_with_ve_samples,
                 loss_no_ve_samples,
+                weights=token_counts,
                 method="bca",
                 replicates=500,
                 alpha=self._policy.get("alpha", 0.05),
@@ -2234,18 +2300,31 @@ class VarianceGuard(Guard):
             )

             predictive_state["evaluated"] = True
-            mean_delta = float(
-                np.mean(
-                    [
-                        with_loss - no_loss
-                        for with_loss, no_loss in zip(
-                            loss_with_ve_samples,
-                            loss_no_ve_samples,
-                            strict=False,
-                        )
-                    ]
+            if token_counts:
+                sw = 0.0
+                swx = 0.0
+                for with_loss, no_loss, weight in zip(
+                    loss_with_ve_samples,
+                    loss_no_ve_samples,
+                    token_counts,
+                    strict=False,
+                ):
+                    sw += float(weight)
+                    swx += float(weight) * (with_loss - no_loss)
+                mean_delta = float(swx / sw) if sw > 0 else float("nan")
+            else:
+                mean_delta = float(
+                    np.mean(
+                        [
+                            with_loss - no_loss
+                            for with_loss, no_loss in zip(
+                                loss_with_ve_samples,
+                                loss_no_ve_samples,
+                                strict=False,
+                            )
+                        ]
+                    )
                 )
-            )
             predictive_state["mean_delta"] = mean_delta

             if delta_ci is not None and all(
@@ -538,6 +538,175 @@ def _enforce_ratio_ci_alignment(
     )


+def _enforce_display_ci_alignment(
+    ratio_ci_source: str,
+    primary_metric: Any,
+    logloss_delta_ci: Any,
+    window_plan_profile: str | None,
+) -> None:
+    """Ensure display_ci matches exp(ci) for ppl-like metrics when paired."""
+    if ratio_ci_source != "paired_baseline":
+        return
+    if not isinstance(primary_metric, dict) or not primary_metric:
+        return
+    try:
+        kind = str(primary_metric.get("kind", "")).lower()
+    except Exception:
+        return
+    if not kind.startswith("ppl"):
+        return
+
+    def _finite_bounds(bounds: Any) -> bool:
+        return (
+            isinstance(bounds, tuple | list)
+            and len(bounds) == 2
+            and all(isinstance(v, int | float) and math.isfinite(v) for v in bounds)
+        )
+
+    ci = primary_metric.get("ci")
+    if not _finite_bounds(ci):
+        if _finite_bounds(logloss_delta_ci):
+            primary_metric["ci"] = (
+                float(logloss_delta_ci[0]),
+                float(logloss_delta_ci[1]),
+            )
+            ci = primary_metric["ci"]
+        else:
+            profile = (window_plan_profile or "dev").lower()
+            if profile in {"ci", "release"}:
+                raise ValueError(
+                    "primary_metric.ci missing for ppl-like metric under paired baseline."
+                )
+            return
+
+    expected = tuple(math.exp(float(bound)) for bound in ci)
+    display_ci = primary_metric.get("display_ci")
+    if not _finite_bounds(display_ci):
+        profile = (window_plan_profile or "dev").lower()
+        if profile in {"ci", "release"}:
+            raise ValueError(
+                "primary_metric.display_ci missing for ppl-like metric under paired baseline."
+            )
+        primary_metric["display_ci"] = [expected[0], expected[1]]
+        return
+
+    for observed, exp_val in zip(display_ci, expected, strict=False):
+        tolerance = 5e-4 * max(1.0, abs(exp_val))
+        if abs(float(observed) - float(exp_val)) > tolerance:
+            profile = (window_plan_profile or "dev").lower()
+            if profile in {"ci", "release"}:
+                raise ValueError(
+                    "primary_metric.display_ci mismatch: bounds do not match exp(ci)."
+                )
+            primary_metric["display_ci"] = [expected[0], expected[1]]
+            break
+
+
+def _enforce_pairing_and_coverage(
+    stats: dict[str, Any] | None,
+    window_plan_profile: str | None,
+    tier: str | None,
+) -> None:
+    """Enforce pairing and coverage contracts for CI/Release profiles."""
+    profile = (window_plan_profile or "dev").lower()
+    if profile not in {"ci", "release"}:
+        return
+    if not isinstance(stats, dict):
+        raise ValueError("Missing dataset window stats for CI/Release enforcement.")
+
+    match_fraction = stats.get("window_match_fraction")
+    overlap_fraction = stats.get("window_overlap_fraction")
+    if not (
+        isinstance(match_fraction, (int | float))
+        and math.isfinite(float(match_fraction))
+    ):
+        raise ValueError("CI/Release requires window_match_fraction.")
+    if float(match_fraction) < 0.999999:
+        raise ValueError(
+            f"CI/Release requires perfect pairing (window_match_fraction={float(match_fraction):.6f})."
+        )
+
+    if not (
+        isinstance(overlap_fraction, (int | float))
+        and math.isfinite(float(overlap_fraction))
+    ):
+        raise ValueError("CI/Release requires window_overlap_fraction.")
+    if float(overlap_fraction) > 1e-9:
+        raise ValueError(
+            f"CI/Release requires non-overlapping windows (window_overlap_fraction={float(overlap_fraction):.6f})."
+        )
+
+    def _coerce_count(value: Any) -> int | None:
+        if value is None or isinstance(value, bool):
+            return None
+        try:
+            val = float(value)
+        except (TypeError, ValueError):
+            return None
+        if not math.isfinite(val) or val < 0:
+            return None
+        if abs(val - round(val)) > 1e-9:
+            return None
+        return int(round(val))
+
+    actual_preview = _coerce_count(stats.get("actual_preview"))
+    actual_final = _coerce_count(stats.get("actual_final"))
+    if actual_preview is None or actual_final is None:
+        coverage = stats.get("coverage")
+        if isinstance(coverage, dict):
+            if actual_preview is None:
+                actual_preview = _coerce_count(coverage.get("preview", {}).get("used"))
+            if actual_final is None:
+                actual_final = _coerce_count(coverage.get("final", {}).get("used"))
+
+    if actual_preview is None or actual_final is None:
+        raise ValueError("CI/Release requires preview/final window counts.")
+    if actual_preview != actual_final:
+        raise ValueError(
+            f"CI/Release requires matching preview/final counts "
+            f"(preview={actual_preview}, final={actual_final})."
+        )
+
+    from invarlock.core.runner import BOOTSTRAP_COVERAGE_REQUIREMENTS
+
+    tier_key = str(tier or "balanced").lower()
+    floors = BOOTSTRAP_COVERAGE_REQUIREMENTS.get(
+        tier_key, BOOTSTRAP_COVERAGE_REQUIREMENTS["balanced"]
+    )
+    preview_floor = int(floors.get("preview", 0))
+    final_floor = int(floors.get("final", 0))
+    replicates_floor = int(floors.get("replicates", 0))
+
+    coverage = stats.get("coverage")
+    if not isinstance(coverage, dict):
+        raise ValueError("CI/Release requires bootstrap coverage stats.")
+
+    preview_used = _coerce_count(coverage.get("preview", {}).get("used"))
+    final_used = _coerce_count(coverage.get("final", {}).get("used"))
+    replicates_used = _coerce_count(coverage.get("replicates", {}).get("used"))
+
+    if replicates_used is None:
+        bootstrap = stats.get("bootstrap")
+        if isinstance(bootstrap, dict):
+            replicates_used = _coerce_count(
+                bootstrap.get("replicates", bootstrap.get("n"))
+            )
+
+    if preview_used is None or final_used is None or replicates_used is None:
+        raise ValueError("CI/Release requires preview/final/replicates coverage stats.")
+
+    if preview_used < preview_floor or final_used < final_floor:
+        raise ValueError(
+            "CI/Release requires preview/final coverage at or above tier floors "
+            f"(preview={preview_used}/{preview_floor}, final={final_used}/{final_floor})."
+        )
+    if replicates_used < replicates_floor:
+        raise ValueError(
+            "CI/Release requires bootstrap replicates at or above tier floors "
+            f"(replicates={replicates_used}/{replicates_floor})."
+        )
+
+
 def _fallback_paired_windows(
     paired_windows: int, coverage_summary: dict[str, Any]
 ) -> int:
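
`_enforce_pairing_and_coverage` leans on its nested `_coerce_count` helper to reject anything that is not a clean non-negative integer before comparing counts against the tier floors. The helper is reproduced below verbatim from the hunk, with asserts illustrating its edge cases:

```python
import math
from typing import Any

def _coerce_count(value: Any) -> int | None:
    if value is None or isinstance(value, bool):
        return None
    try:
        val = float(value)
    except (TypeError, ValueError):
        return None
    if not math.isfinite(val) or val < 0:
        return None
    if abs(val - round(val)) > 1e-9:
        return None
    return int(round(val))

assert _coerce_count("12") == 12           # numeric strings are accepted
assert _coerce_count(12.0000000001) == 12  # float noise within 1e-9 rounds cleanly
assert _coerce_count(True) is None         # bools are not counts
assert _coerce_count(11.5) is None         # non-integral values are rejected
assert _coerce_count(float("nan")) is None
```
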
@@ -807,6 +976,47 @@ def make_certificate(
     if paired:
         paired_run, paired_base = paired
         paired_windows = len(paired_run)
+        paired_weights: list[float] | None = None
+        try:
+            run_ids = (
+                run_windows.get("window_ids") if isinstance(run_windows, dict) else None
+            )
+            run_w = (
+                run_windows.get("token_counts")
+                if isinstance(run_windows, dict)
+                else None
+            )
+            base_ids = (
+                baseline_windows.get("window_ids")
+                if isinstance(baseline_windows, dict)
+                else None
+            )
+            if (
+                isinstance(run_ids, list)
+                and isinstance(run_w, list)
+                and isinstance(base_ids, list)
+            ):
+                base_set = {
+                    int(b_id) for b_id in base_ids if isinstance(b_id, int | float)
+                }
+                weights: list[float] = []
+                for r_id, w in zip(run_ids, run_w, strict=False):
+                    if not isinstance(r_id, int | float):
+                        continue
+                    key = int(r_id)
+                    if key not in base_set:
+                        continue
+                    try:
+                        wv = float(w)
+                    except Exception:
+                        continue
+                    if not math.isfinite(wv):
+                        continue
+                    weights.append(float(max(wv, 0.0)))
+                if weights:
+                    paired_weights = weights
+        except Exception:  # pragma: no cover
+            paired_weights = None
         method = str(metrics_bootstrap.get("method", "percentile")).lower()
         replicates = int(
             metrics_bootstrap.get(
@@ -834,6 +1044,7 @@ def make_certificate(
         delta_ci = compute_paired_delta_log_ci(
             paired_run,
             paired_base,
+            weights=paired_weights,
             method=ci_method,
             replicates=replicates,
             alpha=alpha,
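
`compute_paired_delta_log_ci` itself is not part of this diff, so as a stand-in, here is a tiny weighted percentile bootstrap over paired deltas that illustrates what passing `weights=` buys: each resampled replicate weights a window by its token count instead of treating all windows equally. This is a sketch of the idea only, not invarlock's BCa implementation:

```python
import random

def weighted_delta_ci(run, base, weights, replicates=500, alpha=0.05, seed=0):
    rng = random.Random(seed)
    deltas = [r - b for r, b in zip(run, base)]
    indices = range(len(deltas))
    reps = []
    for _ in range(replicates):
        sample = rng.choices(indices, k=len(deltas))   # resample paired windows
        sw = sum(weights[i] for i in sample)
        swx = sum(weights[i] * deltas[i] for i in sample)
        reps.append(swx / sw)                          # token-weighted replicate mean
    reps.sort()
    lo = reps[int((alpha / 2) * replicates)]
    hi = reps[int((1 - alpha / 2) * replicates) - 1]
    return lo, hi

print(weighted_delta_ci([2.08, 2.11], [2.10, 2.15], [512, 1024]))
```
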
@@ -1156,13 +1367,13 @@ def make_certificate(
                 act_fin = req_fin

             if req_prev is not None:
-                stats_obj.setdefault("requested_preview", req_prev)
+                stats_obj["requested_preview"] = req_prev
             if req_fin is not None:
-                stats_obj.setdefault("requested_final", req_fin)
+                stats_obj["requested_final"] = req_fin
             if act_prev is not None:
-                stats_obj.setdefault("actual_preview", act_prev)
+                stats_obj["actual_preview"] = act_prev
             if act_fin is not None:
-                stats_obj.setdefault("actual_final", act_fin)
+                stats_obj["actual_final"] = act_fin

             if "coverage_ok" not in stats_obj:
                 if (
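
Switching from `setdefault` to plain assignment makes the recomputed window counts authoritative: `setdefault` silently kept any stale value already present in `stats_obj`, which the new CI/Release enforcement would then validate against. A two-line illustration:

```python
stats_obj = {"actual_preview": 8}           # stale value from an earlier pass
stats_obj.setdefault("actual_preview", 16)
assert stats_obj["actual_preview"] == 8     # old behavior: the stale value wins
stats_obj["actual_preview"] = 16
assert stats_obj["actual_preview"] == 16    # new behavior: the recomputed value wins
```
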
@@ -1177,6 +1388,12 @@ def make_certificate(
     except Exception:  # pragma: no cover
         pass

+    _enforce_pairing_and_coverage(
+        ppl_analysis.get("stats", {}),
+        window_plan_profile,
+        auto.get("tier", "balanced"),
+    )
+
     if isinstance(window_plan_ctx, dict):
         ppl_analysis["window_plan"] = window_plan_ctx

@@ -1895,6 +2112,12 @@ def make_certificate(
     from .primary_metric_utils import attach_primary_metric as _attach_pm

     _attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
+    _enforce_display_ci_alignment(
+        ratio_ci_source,
+        certificate.get("primary_metric"),
+        logloss_delta_ci,
+        window_plan_profile,
+    )

     # Ensure primary_metric has display_ci populated for schema invariants
     try:
@@ -2492,8 +2715,8 @@ def _prepare_guard_overhead_section(
             {
                 "overhead_ratio": metrics.get("overhead_ratio"),
                 "overhead_percent": metrics.get("overhead_percent"),
-                "bare_final": metrics.get("bare_final"),
-                "guarded_final": metrics.get("guarded_final"),
+                "bare_ppl": metrics.get("bare_ppl"),
+                "guarded_ppl": metrics.get("guarded_ppl"),
                 "messages": list(result.messages),
                 "warnings": list(result.warnings),
                 "errors": list(result.errors),
@@ -2505,12 +2728,8 @@ def _prepare_guard_overhead_section(
         return sanitized, bool(result.passed)

     # Fall back to direct ratio computation when reports are not provided
-    bare_ppl = _coerce_float(payload.get("bare_final")) or _coerce_float(
-        payload.get("bare_ppl")
-    )
-    guarded_ppl = _coerce_float(payload.get("guarded_final")) or _coerce_float(
-        payload.get("guarded_ppl")
-    )
+    bare_ppl = _coerce_float(payload.get("bare_ppl"))
+    guarded_ppl = _coerce_float(payload.get("guarded_ppl"))
     ratio = _coerce_float(payload.get("overhead_ratio"))

     if ratio is None and bare_ppl is not None and guarded_ppl is not None:
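
The fallback path now reads only the canonical `bare_ppl`/`guarded_ppl` keys instead of preferring `bare_final`/`guarded_final`; the ratio arithmetic is unchanged. A sketch with a hypothetical payload:

```python
# Hypothetical payload illustrating the direct-ratio fallback.
payload = {"bare_ppl": 12.40, "guarded_ppl": 12.53}
ratio = payload["guarded_ppl"] / payload["bare_ppl"]
print(f"overhead_ratio={ratio:.4f}")  # ~1.0105, i.e. roughly 1.05% guard overhead
```
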
@@ -194,6 +194,9 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
         "window_pairing_reason",
         "window_pairing_preview",
         "window_pairing_final",
+        "window_plan",
+        "window_capacity",
+        "stats",
         "total_tokens",
         "preview_total_tokens",
         "final_total_tokens",
@@ -579,9 +579,7 @@ def _extract_policy_overrides(report: RunReport) -> list[str]:


 def _compute_policy_digest(policy: dict[str, Any]) -> str:
-    canonical = json.dumps(
-        policy, sort_keys=True, default=str, separators=(",", ":"), ensure_ascii=True
-    )
+    canonical = json.dumps(policy, sort_keys=True, default=str)
     return hashlib.sha256(canonical.encode()).hexdigest()[:16]

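Note that dropping `separators=(",", ":")` changes the canonical string, since `json.dumps` defaults to separators with spaces (`ensure_ascii=True` was already the default, so removing it is a no-op); the same policy therefore hashes to a different digest in 0.3.3 than in 0.3.2. A sketch of the new digest path:

```python
import hashlib
import json

policy = {"alpha": 0.05, "tier": "balanced"}
canonical = json.dumps(policy, sort_keys=True, default=str)
print(canonical)  # '{"alpha": 0.05, "tier": "balanced"}' -- note the spaces
print(hashlib.sha256(canonical.encode()).hexdigest()[:16])
```
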
@@ -102,6 +102,23 @@ def attach_primary_metric(
         and float(base_final) > 0
     ):
         pm_copy["ratio_vs_baseline"] = float(fin) / float(base_final)
+    # Ensure display_ci aligns with log-space CI for ppl-like metrics
+    try:
+        kind = str(pm_copy.get("kind", "")).lower()
+    except Exception:
+        kind = ""
+    ci = pm_copy.get("ci")
+    if (
+        kind.startswith("ppl")
+        and isinstance(ci, list | tuple)
+        and len(ci) == 2
+    ):
+        try:
+            lo, hi = float(ci[0]), float(ci[1])
+            if math.isfinite(lo) and math.isfinite(hi):
+                pm_copy["display_ci"] = [math.exp(lo), math.exp(hi)]
+        except Exception:
+            pass
     # Provide a degenerate display CI if missing
     if not isinstance(
         pm_copy.get("display_ci"), list | tuple