invarlock-0.3.2-py3-none-any.whl → invarlock-0.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +1 -1
- invarlock/cli/commands/run.py +6 -0
- invarlock/cli/config.py +11 -1
- invarlock/cli/determinism.py +16 -1
- invarlock/core/bootstrap.py +137 -5
- invarlock/core/runner.py +305 -35
- invarlock/eval/bootstrap.py +3 -1
- invarlock/eval/primary_metric.py +20 -5
- invarlock/guards/rmt.py +536 -46
- invarlock/guards/spectral.py +1 -1
- invarlock/guards/variance.py +122 -43
- invarlock/reporting/certificate.py +231 -12
- invarlock/reporting/normalizer.py +3 -0
- invarlock/reporting/policy_utils.py +1 -3
- invarlock/reporting/primary_metric_utils.py +17 -0
- invarlock/reporting/validate.py +10 -10
- {invarlock-0.3.2.dist-info → invarlock-0.3.3.dist-info}/METADATA +2 -2
- {invarlock-0.3.2.dist-info → invarlock-0.3.3.dist-info}/RECORD +22 -22
- {invarlock-0.3.2.dist-info → invarlock-0.3.3.dist-info}/WHEEL +0 -0
- {invarlock-0.3.2.dist-info → invarlock-0.3.3.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.2.dist-info → invarlock-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.2.dist-info → invarlock-0.3.3.dist-info}/top_level.txt +0 -0
invarlock/guards/spectral.py
CHANGED
invarlock/guards/variance.py
CHANGED
@@ -403,29 +403,36 @@ def _predictive_gate_outcome(
     ):
         return False, "ci_unavailable"
 
+    lower = float(delta_ci[0])
     upper = float(delta_ci[1])
     min_effect = float(min_effect or 0.0)
 
+    # CI must clear zero (and the min-effect band when provided).
     if one_sided:
-        # One-sided improvement (ΔlogNLL < 0): certify a minimum effect by
-        # requiring the *upper* bound of the (two-sided) CI to clear -min_effect.
        if upper >= 0.0:
             return False, "ci_contains_zero"
         if mean_delta >= 0.0:
             return False, "mean_not_negative"
-
-
+        if upper > -min_effect:
+            return False, "gain_below_threshold"
+        if mean_delta > -min_effect:
             return False, "gain_below_threshold"
         return True, "ci_gain_met"
 
-    # Two-sided
-
+    # Two-sided: detect regressions outside the +min_effect band, but only
+    # enable VE for negative improvements.
+    if lower <= 0.0 <= upper:
         return False, "ci_contains_zero"
-
-
-
+    if lower > 0.0:
+        if lower >= min_effect and mean_delta >= min_effect:
+            return False, "regression_detected"
+        return False, "mean_not_negative"
+    if upper > -min_effect:
+        return False, "gain_below_threshold"
+    if mean_delta >= 0.0:
+        return False, "mean_not_negative"
+    if mean_delta > -min_effect:
         return False, "gain_below_threshold"
-
     return True, "ci_gain_met"
 
 
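Reviewer note: the reworked gate is a pure function of the CI bounds and the mean delta. Below is a standalone restatement of the 0.3.3 semantics for readers tracing the reason codes; the function name `gate` and the standalone framing are illustrative, not part of the invarlock API.

# delta_ci is a (lower, upper) CI on the paired log-loss delta; negative
# values mean the variance-equalized (VE) model improves on the baseline.
def gate(delta_ci, mean_delta, min_effect=0.0, one_sided=False):
    lower, upper = float(delta_ci[0]), float(delta_ci[1])
    min_effect = float(min_effect or 0.0)
    if one_sided:
        if upper >= 0.0:
            return False, "ci_contains_zero"
        if mean_delta >= 0.0:
            return False, "mean_not_negative"
        if upper > -min_effect:
            return False, "gain_below_threshold"
        if mean_delta > -min_effect:
            return False, "gain_below_threshold"
        return True, "ci_gain_met"
    if lower <= 0.0 <= upper:
        return False, "ci_contains_zero"
    if lower > 0.0:
        if lower >= min_effect and mean_delta >= min_effect:
            return False, "regression_detected"
        return False, "mean_not_negative"
    if upper > -min_effect:
        return False, "gain_below_threshold"
    if mean_delta >= 0.0:
        return False, "mean_not_negative"
    if mean_delta > -min_effect:
        return False, "gain_below_threshold"
    return True, "ci_gain_met"

# A CI of (-0.02, -0.01) with mean -0.015 clears a 0.005 min-effect band:
assert gate((-0.02, -0.01), -0.015, min_effect=0.005) == (True, "ci_gain_met")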
@@ -1441,12 +1448,17 @@ class VarianceGuard(Guard):
 
         device = next(model.parameters()).device
         torch.manual_seed(calib_seed)
-
-
+        (
+            ppl_no_ve_samples,
+            loss_no_ve_samples,
+            token_counts,
+        ) = self._compute_ppl_for_batches(
+            model, calibration_batches, device, return_counts=True
         )
         coverage = min(len(calibration_batches), len(ppl_no_ve_samples))
         ppl_with_ve_samples: list[float] = []
         loss_with_ve_samples: list[float] = []
+        token_counts_with: list[int] = []
         ratio_ci: tuple[float, float] | None = None
 
         enable_success = False
@@ -1462,10 +1474,12 @@ class VarianceGuard(Guard):
         try:
             torch.manual_seed(calib_seed)
             if enable_success:
-
-
-
-
+                (
+                    ppl_with_ve_samples,
+                    loss_with_ve_samples,
+                    token_counts_with,
+                ) = self._compute_ppl_for_batches(
+                    model, calibration_batches, device, return_counts=True
                 )
         finally:
             if enable_success:
@@ -1478,6 +1492,8 @@ class VarianceGuard(Guard):
                 coverage,
                 len(ppl_with_ve_samples) if ppl_with_ve_samples else coverage,
                 len(loss_with_ve_samples) if loss_with_ve_samples else coverage,
+                len(token_counts) if token_counts else coverage,
+                len(token_counts_with) if token_counts_with else coverage,
             )
             self._calibration_stats.update(
                 {
@@ -1546,6 +1562,7 @@ class VarianceGuard(Guard):
         loss_no_ve_samples = loss_no_ve_samples[:coverage]
         ppl_with_ve_samples = ppl_with_ve_samples[:coverage]
         loss_with_ve_samples = loss_with_ve_samples[:coverage]
+        token_counts = token_counts[:coverage]
 
         ratios = [
             with_val / no_val
@@ -1602,6 +1619,7 @@ class VarianceGuard(Guard):
         delta_ci = compute_paired_delta_log_ci(
             loss_with_ve_samples,
             loss_no_ve_samples,
+            weights=token_counts,
             method="bca",
             replicates=500,
             alpha=self._policy.get("alpha", 0.05),
@@ -1617,18 +1635,31 @@ class VarianceGuard(Guard):
         )
 
         predictive_state["evaluated"] = True
-
-
-
-
-
-
-
-
-
-
+        if token_counts:
+            sw = 0.0
+            swx = 0.0
+            for with_loss, no_loss, weight in zip(
+                loss_with_ve_samples,
+                loss_no_ve_samples,
+                token_counts,
+                strict=False,
+            ):
+                sw += float(weight)
+                swx += float(weight) * (with_loss - no_loss)
+            mean_delta = float(swx / sw) if sw > 0 else float("nan")
+        else:
+            mean_delta = float(
+                np.mean(
+                    [
+                        with_loss - no_loss
+                        for with_loss, no_loss in zip(
+                            loss_with_ve_samples,
+                            loss_no_ve_samples,
+                            strict=False,
+                        )
+                    ]
+                )
             )
-            )
         predictive_state["mean_delta"] = mean_delta
 
         if delta_ci is not None and all(
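Reviewer note: with token counts available, mean_delta becomes a token-weighted average of per-window loss deltas instead of an unweighted mean, so long windows count proportionally more. A minimal standalone sketch of the same computation (the helper name is illustrative, not package API):

import math

def weighted_mean_delta(loss_with, loss_without, token_counts=None):
    # Token-weighted mean of paired deltas; falls back to the unweighted
    # mean when no counts are supplied, mirroring the branch above.
    if token_counts:
        sw = swx = 0.0
        for w_loss, n_loss, weight in zip(loss_with, loss_without, token_counts):
            sw += float(weight)
            swx += float(weight) * (w_loss - n_loss)
        return swx / sw if sw > 0 else math.nan
    deltas = [w - n for w, n in zip(loss_with, loss_without)]
    return sum(deltas) / len(deltas)

# A 900-token window dominates two 50-token windows:
print(weighted_mean_delta([2.00, 2.10, 2.20], [2.05, 2.00, 2.30], [900, 50, 50]))
# ≈ -0.045, versus ≈ -0.0167 unweighted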
@@ -1875,12 +1906,19 @@ class VarianceGuard(Guard):
         model: nn.Module,
         batches: list[Any],
         device: torch.device,
-
+        *,
+        return_counts: bool = False,
+    ) -> tuple[list[float], list[float]] | tuple[list[float], list[float], list[int]]:
         """Compute per-batch perplexity and log-loss values for deterministic calibration."""
         ppl_values: list[float] = []
         loss_values: list[float] = []
+        token_counts: list[int] = []
         if not batches:
-            return
+            return (
+                (ppl_values, loss_values, token_counts)
+                if return_counts
+                else (ppl_values, loss_values)
+            )
 
         model_was_training = model.training
         model.eval()
@@ -1919,12 +1957,29 @@ class VarianceGuard(Guard):
                 if math.isfinite(ppl):
                     ppl_values.append(ppl)
                     loss_values.append(loss)
+                    if return_counts:
+                        count = None
+                        try:
+                            if labels is not None and isinstance(
+                                labels, torch.Tensor
+                            ):
+                                count = int((labels != -100).sum().item())
+                        except Exception:
+                            count = None
+                        if count is None:
+                            try:
+                                count = int(inputs.numel())
+                            except Exception:
+                                count = 0
+                        token_counts.append(int(max(count, 0)))
             except Exception:
                 continue
 
         if model_was_training:
             model.train()
 
+        if return_counts:
+            return ppl_values, loss_values, token_counts
         return ppl_values, loss_values
 
     def _bootstrap_mean_ci(
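Reviewer note: the per-batch token count prefers the number of supervised label positions (-100 being the conventional ignore index in the Hugging Face stack) and falls back to the raw input size. The same logic as a standalone helper (hypothetical name, assuming tensor-shaped batches):

import torch

def batch_token_count(inputs: torch.Tensor, labels: torch.Tensor | None) -> int:
    # Count supervised positions when labels exist; otherwise all positions.
    if isinstance(labels, torch.Tensor):
        return int((labels != -100).sum().item())
    return int(inputs.numel())

inputs = torch.ones(2, 8, dtype=torch.long)
labels = inputs.clone()
labels[:, :3] = -100                      # mask 3 positions per row
print(batch_token_count(inputs, labels))  # 10
print(batch_token_count(inputs, None))    # 16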
@@ -2111,12 +2166,17 @@ class VarianceGuard(Guard):
         if calibration_batches:
             device = next(model.parameters()).device
             torch.manual_seed(calib_seed)
-
-
+            (
+                ppl_no_ve_samples,
+                loss_no_ve_samples,
+                token_counts,
+            ) = self._compute_ppl_for_batches(
+                model, calibration_batches, device, return_counts=True
             )
             coverage = min(len(calibration_batches), len(ppl_no_ve_samples))
             ppl_with_ve_samples: list[float] = []
             loss_with_ve_samples: list[float] = []
+            token_counts_with: list[int] = []
             ratio_ci: tuple[float, float] | None = None
 
             enable_success = False
@@ -2135,8 +2195,9 @@ class VarianceGuard(Guard):
                     (
                         ppl_with_ve_samples,
                         loss_with_ve_samples,
+                        token_counts_with,
                     ) = self._compute_ppl_for_batches(
-                        model, calibration_batches, device
+                        model, calibration_batches, device, return_counts=True
                     )
             finally:
                 if enable_success:
@@ -2149,6 +2210,8 @@ class VarianceGuard(Guard):
                     coverage,
                     len(ppl_with_ve_samples) if ppl_with_ve_samples else coverage,
                     len(loss_with_ve_samples) if loss_with_ve_samples else coverage,
+                    len(token_counts) if token_counts else coverage,
+                    len(token_counts_with) if token_counts_with else coverage,
                 )
                 self._calibration_stats.update(
                     {"coverage": coverage, "status": "insufficient"}
@@ -2181,6 +2244,8 @@ class VarianceGuard(Guard):
             loss_no_ve_samples = loss_no_ve_samples[:coverage]
             ppl_with_ve_samples = ppl_with_ve_samples[:coverage]
             loss_with_ve_samples = loss_with_ve_samples[:coverage]
+            token_counts = token_counts[:coverage]
+            token_counts_with = token_counts_with[:coverage]
 
             ratios = [
                 with_val / no_val
@@ -2219,6 +2284,7 @@ class VarianceGuard(Guard):
             delta_ci = compute_paired_delta_log_ci(
                 loss_with_ve_samples,
                 loss_no_ve_samples,
+                weights=token_counts,
                 method="bca",
                 replicates=500,
                 alpha=self._policy.get("alpha", 0.05),
@@ -2234,18 +2300,31 @@ class VarianceGuard(Guard):
             )
 
             predictive_state["evaluated"] = True
-
-
-
-
-
-
-
-
-
-
+            if token_counts:
+                sw = 0.0
+                swx = 0.0
+                for with_loss, no_loss, weight in zip(
+                    loss_with_ve_samples,
+                    loss_no_ve_samples,
+                    token_counts,
+                    strict=False,
+                ):
+                    sw += float(weight)
+                    swx += float(weight) * (with_loss - no_loss)
+                mean_delta = float(swx / sw) if sw > 0 else float("nan")
+            else:
+                mean_delta = float(
+                    np.mean(
+                        [
+                            with_loss - no_loss
+                            for with_loss, no_loss in zip(
+                                loss_with_ve_samples,
+                                loss_no_ve_samples,
+                                strict=False,
+                            )
+                        ]
+                    )
                 )
-                )
             predictive_state["mean_delta"] = mean_delta
 
             if delta_ci is not None and all(
invarlock/reporting/certificate.py
CHANGED

@@ -538,6 +538,175 @@ def _enforce_ratio_ci_alignment(
     )
 
 
+def _enforce_display_ci_alignment(
+    ratio_ci_source: str,
+    primary_metric: Any,
+    logloss_delta_ci: Any,
+    window_plan_profile: str | None,
+) -> None:
+    """Ensure display_ci matches exp(ci) for ppl-like metrics when paired."""
+    if ratio_ci_source != "paired_baseline":
+        return
+    if not isinstance(primary_metric, dict) or not primary_metric:
+        return
+    try:
+        kind = str(primary_metric.get("kind", "")).lower()
+    except Exception:
+        return
+    if not kind.startswith("ppl"):
+        return
+
+    def _finite_bounds(bounds: Any) -> bool:
+        return (
+            isinstance(bounds, tuple | list)
+            and len(bounds) == 2
+            and all(isinstance(v, int | float) and math.isfinite(v) for v in bounds)
+        )
+
+    ci = primary_metric.get("ci")
+    if not _finite_bounds(ci):
+        if _finite_bounds(logloss_delta_ci):
+            primary_metric["ci"] = (
+                float(logloss_delta_ci[0]),
+                float(logloss_delta_ci[1]),
+            )
+            ci = primary_metric["ci"]
+        else:
+            profile = (window_plan_profile or "dev").lower()
+            if profile in {"ci", "release"}:
+                raise ValueError(
+                    "primary_metric.ci missing for ppl-like metric under paired baseline."
+                )
+            return
+
+    expected = tuple(math.exp(float(bound)) for bound in ci)
+    display_ci = primary_metric.get("display_ci")
+    if not _finite_bounds(display_ci):
+        profile = (window_plan_profile or "dev").lower()
+        if profile in {"ci", "release"}:
+            raise ValueError(
+                "primary_metric.display_ci missing for ppl-like metric under paired baseline."
+            )
+        primary_metric["display_ci"] = [expected[0], expected[1]]
+        return
+
+    for observed, exp_val in zip(display_ci, expected, strict=False):
+        tolerance = 5e-4 * max(1.0, abs(exp_val))
+        if abs(float(observed) - float(exp_val)) > tolerance:
+            profile = (window_plan_profile or "dev").lower()
+            if profile in {"ci", "release"}:
+                raise ValueError(
+                    "primary_metric.display_ci mismatch: bounds do not match exp(ci)."
+                )
+            primary_metric["display_ci"] = [expected[0], expected[1]]
+            break
+
+
+def _enforce_pairing_and_coverage(
+    stats: dict[str, Any] | None,
+    window_plan_profile: str | None,
+    tier: str | None,
+) -> None:
+    """Enforce pairing and coverage contracts for CI/Release profiles."""
+    profile = (window_plan_profile or "dev").lower()
+    if profile not in {"ci", "release"}:
+        return
+    if not isinstance(stats, dict):
+        raise ValueError("Missing dataset window stats for CI/Release enforcement.")
+
+    match_fraction = stats.get("window_match_fraction")
+    overlap_fraction = stats.get("window_overlap_fraction")
+    if not (
+        isinstance(match_fraction, (int | float))
+        and math.isfinite(float(match_fraction))
+    ):
+        raise ValueError("CI/Release requires window_match_fraction.")
+    if float(match_fraction) < 0.999999:
+        raise ValueError(
+            f"CI/Release requires perfect pairing (window_match_fraction={float(match_fraction):.6f})."
+        )
+
+    if not (
+        isinstance(overlap_fraction, (int | float))
+        and math.isfinite(float(overlap_fraction))
+    ):
+        raise ValueError("CI/Release requires window_overlap_fraction.")
+    if float(overlap_fraction) > 1e-9:
+        raise ValueError(
+            f"CI/Release requires non-overlapping windows (window_overlap_fraction={float(overlap_fraction):.6f})."
+        )
+
+    def _coerce_count(value: Any) -> int | None:
+        if value is None or isinstance(value, bool):
+            return None
+        try:
+            val = float(value)
+        except (TypeError, ValueError):
+            return None
+        if not math.isfinite(val) or val < 0:
+            return None
+        if abs(val - round(val)) > 1e-9:
+            return None
+        return int(round(val))
+
+    actual_preview = _coerce_count(stats.get("actual_preview"))
+    actual_final = _coerce_count(stats.get("actual_final"))
+    if actual_preview is None or actual_final is None:
+        coverage = stats.get("coverage")
+        if isinstance(coverage, dict):
+            if actual_preview is None:
+                actual_preview = _coerce_count(coverage.get("preview", {}).get("used"))
+            if actual_final is None:
+                actual_final = _coerce_count(coverage.get("final", {}).get("used"))
+
+    if actual_preview is None or actual_final is None:
+        raise ValueError("CI/Release requires preview/final window counts.")
+    if actual_preview != actual_final:
+        raise ValueError(
+            f"CI/Release requires matching preview/final counts "
+            f"(preview={actual_preview}, final={actual_final})."
+        )
+
+    from invarlock.core.runner import BOOTSTRAP_COVERAGE_REQUIREMENTS
+
+    tier_key = str(tier or "balanced").lower()
+    floors = BOOTSTRAP_COVERAGE_REQUIREMENTS.get(
+        tier_key, BOOTSTRAP_COVERAGE_REQUIREMENTS["balanced"]
+    )
+    preview_floor = int(floors.get("preview", 0))
+    final_floor = int(floors.get("final", 0))
+    replicates_floor = int(floors.get("replicates", 0))
+
+    coverage = stats.get("coverage")
+    if not isinstance(coverage, dict):
+        raise ValueError("CI/Release requires bootstrap coverage stats.")
+
+    preview_used = _coerce_count(coverage.get("preview", {}).get("used"))
+    final_used = _coerce_count(coverage.get("final", {}).get("used"))
+    replicates_used = _coerce_count(coverage.get("replicates", {}).get("used"))
+
+    if replicates_used is None:
+        bootstrap = stats.get("bootstrap")
+        if isinstance(bootstrap, dict):
+            replicates_used = _coerce_count(
+                bootstrap.get("replicates", bootstrap.get("n"))
+            )
+
+    if preview_used is None or final_used is None or replicates_used is None:
+        raise ValueError("CI/Release requires preview/final/replicates coverage stats.")
+
+    if preview_used < preview_floor or final_used < final_floor:
+        raise ValueError(
+            "CI/Release requires preview/final coverage at or above tier floors "
+            f"(preview={preview_used}/{preview_floor}, final={final_used}/{final_floor})."
+        )
+    if replicates_used < replicates_floor:
+        raise ValueError(
+            "CI/Release requires bootstrap replicates at or above tier floors "
+            f"(replicates={replicates_used}/{replicates_floor})."
+        )
+
+
 def _fallback_paired_windows(
     paired_windows: int, coverage_summary: dict[str, Any]
 ) -> int:
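Reviewer note: the display-CI contract added here is that, for ppl-like metrics under a paired baseline, the perplexity-space bounds must equal the exponentials of the log-space delta bounds within a relative tolerance of 5e-4; CI/Release profiles fail hard while dev profiles are repaired in place. A compact standalone version of the check (the helper name is illustrative):

import math

def display_ci_ok(ci, display_ci, rel_tol=5e-4):
    # display_ci should be exp(ci), bound by bound, within rel_tol.
    expected = [math.exp(float(b)) for b in ci]
    return all(
        abs(float(obs) - exp_val) <= rel_tol * max(1.0, abs(exp_val))
        for obs, exp_val in zip(display_ci, expected)
    )

print(display_ci_ok([-0.0513, -0.0101], [0.9500, 0.9900]))  # True
print(display_ci_ok([-0.0513, -0.0101], [0.9500, 1.0400]))  # False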
@@ -807,6 +976,47 @@ def make_certificate(
     if paired:
         paired_run, paired_base = paired
         paired_windows = len(paired_run)
+        paired_weights: list[float] | None = None
+        try:
+            run_ids = (
+                run_windows.get("window_ids") if isinstance(run_windows, dict) else None
+            )
+            run_w = (
+                run_windows.get("token_counts")
+                if isinstance(run_windows, dict)
+                else None
+            )
+            base_ids = (
+                baseline_windows.get("window_ids")
+                if isinstance(baseline_windows, dict)
+                else None
+            )
+            if (
+                isinstance(run_ids, list)
+                and isinstance(run_w, list)
+                and isinstance(base_ids, list)
+            ):
+                base_set = {
+                    int(b_id) for b_id in base_ids if isinstance(b_id, int | float)
+                }
+                weights: list[float] = []
+                for r_id, w in zip(run_ids, run_w, strict=False):
+                    if not isinstance(r_id, int | float):
+                        continue
+                    key = int(r_id)
+                    if key not in base_set:
+                        continue
+                    try:
+                        wv = float(w)
+                    except Exception:
+                        continue
+                    if not math.isfinite(wv):
+                        continue
+                    weights.append(float(max(wv, 0.0)))
+                if weights:
+                    paired_weights = weights
+        except Exception:  # pragma: no cover
+            paired_weights = None
         method = str(metrics_bootstrap.get("method", "percentile")).lower()
         replicates = int(
             metrics_bootstrap.get(
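Reviewer note: the new block derives per-window weights for the paired CI by intersecting run and baseline window ids and carrying the run side's token counts. The same selection as a standalone sketch (hypothetical inputs; the real ids and counts come from the run and baseline reports):

import math

def paired_token_weights(run_ids, run_token_counts, base_ids):
    # Keep the token count of each run window whose id also appears
    # in the baseline; clamp negatives, drop non-finite values.
    base_set = {int(b) for b in base_ids if isinstance(b, (int, float))}
    weights = []
    for r_id, w in zip(run_ids, run_token_counts):
        if not isinstance(r_id, (int, float)) or int(r_id) not in base_set:
            continue
        wv = float(w)
        if math.isfinite(wv):
            weights.append(max(wv, 0.0))
    return weights or None

print(paired_token_weights([1, 2, 3, 4], [128, 256, 64, 512], [2, 3, 9]))
# [256.0, 64.0]; windows 1 and 4 have no baseline partner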
@@ -834,6 +1044,7 @@ def make_certificate(
         delta_ci = compute_paired_delta_log_ci(
             paired_run,
             paired_base,
+            weights=paired_weights,
             method=ci_method,
             replicates=replicates,
             alpha=alpha,
@@ -1156,13 +1367,13 @@ def make_certificate(
                 act_fin = req_fin
 
             if req_prev is not None:
-                stats_obj
+                stats_obj["requested_preview"] = req_prev
             if req_fin is not None:
-                stats_obj
+                stats_obj["requested_final"] = req_fin
             if act_prev is not None:
-                stats_obj
+                stats_obj["actual_preview"] = act_prev
             if act_fin is not None:
-                stats_obj
+                stats_obj["actual_final"] = act_fin
 
             if "coverage_ok" not in stats_obj:
                 if (
@@ -1177,6 +1388,12 @@ def make_certificate(
         except Exception:  # pragma: no cover
             pass
 
+        _enforce_pairing_and_coverage(
+            ppl_analysis.get("stats", {}),
+            window_plan_profile,
+            auto.get("tier", "balanced"),
+        )
+
         if isinstance(window_plan_ctx, dict):
             ppl_analysis["window_plan"] = window_plan_ctx
 
@@ -1895,6 +2112,12 @@ def make_certificate(
     from .primary_metric_utils import attach_primary_metric as _attach_pm
 
     _attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
+    _enforce_display_ci_alignment(
+        ratio_ci_source,
+        certificate.get("primary_metric"),
+        logloss_delta_ci,
+        window_plan_profile,
+    )
 
     # Ensure primary_metric has display_ci populated for schema invariants
     try:
@@ -2492,8 +2715,8 @@ def _prepare_guard_overhead_section(
         {
             "overhead_ratio": metrics.get("overhead_ratio"),
             "overhead_percent": metrics.get("overhead_percent"),
-            "
-            "
+            "bare_ppl": metrics.get("bare_ppl"),
+            "guarded_ppl": metrics.get("guarded_ppl"),
             "messages": list(result.messages),
             "warnings": list(result.warnings),
             "errors": list(result.errors),
@@ -2505,12 +2728,8 @@ def _prepare_guard_overhead_section(
         return sanitized, bool(result.passed)
 
     # Fall back to direct ratio computation when reports are not provided
-    bare_ppl = _coerce_float(payload.get("
-
-    )
-    guarded_ppl = _coerce_float(payload.get("guarded_final")) or _coerce_float(
-        payload.get("guarded_ppl")
-    )
+    bare_ppl = _coerce_float(payload.get("bare_ppl"))
+    guarded_ppl = _coerce_float(payload.get("guarded_ppl"))
     ratio = _coerce_float(payload.get("overhead_ratio"))
 
     if ratio is None and bare_ppl is not None and guarded_ppl is not None:
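Reviewer note: with the payload keys renamed to `bare_ppl`/`guarded_ppl`, the fallback path reads them directly and derives the ratio only when `overhead_ratio` is absent. A small worked example of that fallback arithmetic (hypothetical payload values):

# Guard overhead as a perplexity ratio: guarded over bare.
payload = {"bare_ppl": 12.40, "guarded_ppl": 12.52}

bare = payload.get("bare_ppl")
guarded = payload.get("guarded_ppl")
ratio = payload.get("overhead_ratio")
if ratio is None and bare is not None and guarded is not None:
    ratio = guarded / bare
print(f"{ratio:.4f} ({(ratio - 1) * 100:.2f}% overhead)")  # 1.0097 (0.97% overhead)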
invarlock/reporting/normalizer.py
CHANGED

@@ -194,6 +194,9 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
         "window_pairing_reason",
         "window_pairing_preview",
         "window_pairing_final",
+        "window_plan",
+        "window_capacity",
+        "stats",
         "total_tokens",
         "preview_total_tokens",
         "final_total_tokens",
invarlock/reporting/policy_utils.py
CHANGED

@@ -579,9 +579,7 @@ def _extract_policy_overrides(report: RunReport) -> list[str]:
 
 
 def _compute_policy_digest(policy: dict[str, Any]) -> str:
-    canonical = json.dumps(
-        policy, sort_keys=True, default=str, separators=(",", ":"), ensure_ascii=True
-    )
+    canonical = json.dumps(policy, sort_keys=True, default=str)
     return hashlib.sha256(canonical.encode()).hexdigest()[:16]
 
 
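Reviewer note: dropping `separators=(",", ":")` and `ensure_ascii=True` changes the canonical string, because the json.dumps defaults insert a space after each comma and colon, so a digest computed by 0.3.3 will generally differ from the 0.3.2 digest of the same policy. A quick demonstration:

import hashlib
import json

policy = {"alpha": 0.05, "tier": "balanced"}

old = json.dumps(policy, sort_keys=True, default=str,
                 separators=(",", ":"), ensure_ascii=True)
new = json.dumps(policy, sort_keys=True, default=str)

print(old)  # {"alpha":0.05,"tier":"balanced"}
print(new)  # {"alpha": 0.05, "tier": "balanced"}
print(hashlib.sha256(old.encode()).hexdigest()[:16])
print(hashlib.sha256(new.encode()).hexdigest()[:16])  # differs from the line above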
invarlock/reporting/primary_metric_utils.py
CHANGED

@@ -102,6 +102,23 @@ def attach_primary_metric(
         and float(base_final) > 0
     ):
         pm_copy["ratio_vs_baseline"] = float(fin) / float(base_final)
+    # Ensure display_ci aligns with log-space CI for ppl-like metrics
+    try:
+        kind = str(pm_copy.get("kind", "")).lower()
+    except Exception:
+        kind = ""
+    ci = pm_copy.get("ci")
+    if (
+        kind.startswith("ppl")
+        and isinstance(ci, list | tuple)
+        and len(ci) == 2
+    ):
+        try:
+            lo, hi = float(ci[0]), float(ci[1])
+            if math.isfinite(lo) and math.isfinite(hi):
+                pm_copy["display_ci"] = [math.exp(lo), math.exp(hi)]
+        except Exception:
+            pass
     # Provide a degenerate display CI if missing
     if not isinstance(
         pm_copy.get("display_ci"), list | tuple