invarlock-0.3.1-py3-none-any.whl → invarlock-0.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +1 -1
- invarlock/_data/runtime/tiers.yaml +61 -0
- invarlock/adapters/hf_loading.py +97 -0
- invarlock/calibration/__init__.py +6 -0
- invarlock/calibration/spectral_null.py +301 -0
- invarlock/calibration/variance_ve.py +154 -0
- invarlock/cli/app.py +15 -0
- invarlock/cli/commands/calibrate.py +576 -0
- invarlock/cli/commands/doctor.py +9 -3
- invarlock/cli/commands/explain_gates.py +53 -9
- invarlock/cli/commands/plugins.py +12 -2
- invarlock/cli/commands/run.py +181 -79
- invarlock/cli/commands/verify.py +40 -0
- invarlock/cli/config.py +11 -1
- invarlock/cli/determinism.py +252 -0
- invarlock/core/auto_tuning.py +215 -17
- invarlock/core/bootstrap.py +137 -5
- invarlock/core/registry.py +9 -4
- invarlock/core/runner.py +305 -35
- invarlock/eval/bench.py +467 -141
- invarlock/eval/bench_regression.py +12 -0
- invarlock/eval/bootstrap.py +3 -1
- invarlock/eval/data.py +29 -7
- invarlock/eval/primary_metric.py +20 -5
- invarlock/guards/rmt.py +536 -46
- invarlock/guards/spectral.py +217 -10
- invarlock/guards/variance.py +124 -42
- invarlock/reporting/certificate.py +476 -45
- invarlock/reporting/certificate_schema.py +4 -1
- invarlock/reporting/guards_analysis.py +108 -10
- invarlock/reporting/normalizer.py +24 -1
- invarlock/reporting/policy_utils.py +97 -15
- invarlock/reporting/primary_metric_utils.py +17 -0
- invarlock/reporting/validate.py +10 -10
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/METADATA +12 -10
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/RECORD +40 -33
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/WHEEL +0 -0
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/top_level.txt +0 -0
invarlock/reporting/certificate.py

@@ -29,7 +29,7 @@ try: # pragma: no cover - exercised in integration
 except Exception:  # pragma: no cover
     jsonschema = None  # type: ignore
 
-from invarlock.core.auto_tuning import TIER_POLICIES
+from invarlock.core.auto_tuning import get_tier_policies
 from invarlock.core.bootstrap import (
     compute_paired_delta_log_ci,
     logspace_to_ratio_ci,
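Note: 0.3.1 imported a static `TIER_POLICIES` mapping; 0.3.3 replaces it with a `get_tier_policies()` accessor, which fits the newly packaged `_data/runtime/tiers.yaml`. A minimal sketch of one plausible shape for such an accessor — the body below is a placeholder, not the package's actual loader:

```python
from functools import lru_cache
from typing import Any

@lru_cache(maxsize=1)
def get_tier_policies() -> dict[str, dict[str, Any]]:
    # Placeholder values; invarlock presumably loads these from tiers.yaml.
    return {
        "conservative": {"metrics": {"pm_ratio": {"ratio_limit_base": 1.05}}},
        "balanced": {"metrics": {"pm_ratio": {"ratio_limit_base": 1.10}}},
        "aggressive": {"metrics": {"pm_ratio": {"ratio_limit_base": 1.20}}},
    }

# Callers fall back to the "balanced" tier, mirroring the call sites below.
policies = get_tier_policies()
tier_defaults = policies.get("custom", policies.get("balanced", {}))
```

An accessor lets the tier data be loaded lazily (and cached) rather than at import time.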
@@ -538,6 +538,175 @@ def _enforce_ratio_ci_alignment(
     )
 
 
+def _enforce_display_ci_alignment(
+    ratio_ci_source: str,
+    primary_metric: Any,
+    logloss_delta_ci: Any,
+    window_plan_profile: str | None,
+) -> None:
+    """Ensure display_ci matches exp(ci) for ppl-like metrics when paired."""
+    if ratio_ci_source != "paired_baseline":
+        return
+    if not isinstance(primary_metric, dict) or not primary_metric:
+        return
+    try:
+        kind = str(primary_metric.get("kind", "")).lower()
+    except Exception:
+        return
+    if not kind.startswith("ppl"):
+        return
+
+    def _finite_bounds(bounds: Any) -> bool:
+        return (
+            isinstance(bounds, tuple | list)
+            and len(bounds) == 2
+            and all(isinstance(v, int | float) and math.isfinite(v) for v in bounds)
+        )
+
+    ci = primary_metric.get("ci")
+    if not _finite_bounds(ci):
+        if _finite_bounds(logloss_delta_ci):
+            primary_metric["ci"] = (
+                float(logloss_delta_ci[0]),
+                float(logloss_delta_ci[1]),
+            )
+            ci = primary_metric["ci"]
+        else:
+            profile = (window_plan_profile or "dev").lower()
+            if profile in {"ci", "release"}:
+                raise ValueError(
+                    "primary_metric.ci missing for ppl-like metric under paired baseline."
+                )
+            return
+
+    expected = tuple(math.exp(float(bound)) for bound in ci)
+    display_ci = primary_metric.get("display_ci")
+    if not _finite_bounds(display_ci):
+        profile = (window_plan_profile or "dev").lower()
+        if profile in {"ci", "release"}:
+            raise ValueError(
+                "primary_metric.display_ci missing for ppl-like metric under paired baseline."
+            )
+        primary_metric["display_ci"] = [expected[0], expected[1]]
+        return
+
+    for observed, exp_val in zip(display_ci, expected, strict=False):
+        tolerance = 5e-4 * max(1.0, abs(exp_val))
+        if abs(float(observed) - float(exp_val)) > tolerance:
+            profile = (window_plan_profile or "dev").lower()
+            if profile in {"ci", "release"}:
+                raise ValueError(
+                    "primary_metric.display_ci mismatch: bounds do not match exp(ci)."
+                )
+            primary_metric["display_ci"] = [expected[0], expected[1]]
+            break
+
+
+def _enforce_pairing_and_coverage(
+    stats: dict[str, Any] | None,
+    window_plan_profile: str | None,
+    tier: str | None,
+) -> None:
+    """Enforce pairing and coverage contracts for CI/Release profiles."""
+    profile = (window_plan_profile or "dev").lower()
+    if profile not in {"ci", "release"}:
+        return
+    if not isinstance(stats, dict):
+        raise ValueError("Missing dataset window stats for CI/Release enforcement.")
+
+    match_fraction = stats.get("window_match_fraction")
+    overlap_fraction = stats.get("window_overlap_fraction")
+    if not (
+        isinstance(match_fraction, (int | float))
+        and math.isfinite(float(match_fraction))
+    ):
+        raise ValueError("CI/Release requires window_match_fraction.")
+    if float(match_fraction) < 0.999999:
+        raise ValueError(
+            f"CI/Release requires perfect pairing (window_match_fraction={float(match_fraction):.6f})."
+        )
+
+    if not (
+        isinstance(overlap_fraction, (int | float))
+        and math.isfinite(float(overlap_fraction))
+    ):
+        raise ValueError("CI/Release requires window_overlap_fraction.")
+    if float(overlap_fraction) > 1e-9:
+        raise ValueError(
+            f"CI/Release requires non-overlapping windows (window_overlap_fraction={float(overlap_fraction):.6f})."
+        )
+
+    def _coerce_count(value: Any) -> int | None:
+        if value is None or isinstance(value, bool):
+            return None
+        try:
+            val = float(value)
+        except (TypeError, ValueError):
+            return None
+        if not math.isfinite(val) or val < 0:
+            return None
+        if abs(val - round(val)) > 1e-9:
+            return None
+        return int(round(val))
+
+    actual_preview = _coerce_count(stats.get("actual_preview"))
+    actual_final = _coerce_count(stats.get("actual_final"))
+    if actual_preview is None or actual_final is None:
+        coverage = stats.get("coverage")
+        if isinstance(coverage, dict):
+            if actual_preview is None:
+                actual_preview = _coerce_count(coverage.get("preview", {}).get("used"))
+            if actual_final is None:
+                actual_final = _coerce_count(coverage.get("final", {}).get("used"))
+
+    if actual_preview is None or actual_final is None:
+        raise ValueError("CI/Release requires preview/final window counts.")
+    if actual_preview != actual_final:
+        raise ValueError(
+            f"CI/Release requires matching preview/final counts "
+            f"(preview={actual_preview}, final={actual_final})."
+        )
+
+    from invarlock.core.runner import BOOTSTRAP_COVERAGE_REQUIREMENTS
+
+    tier_key = str(tier or "balanced").lower()
+    floors = BOOTSTRAP_COVERAGE_REQUIREMENTS.get(
+        tier_key, BOOTSTRAP_COVERAGE_REQUIREMENTS["balanced"]
+    )
+    preview_floor = int(floors.get("preview", 0))
+    final_floor = int(floors.get("final", 0))
+    replicates_floor = int(floors.get("replicates", 0))
+
+    coverage = stats.get("coverage")
+    if not isinstance(coverage, dict):
+        raise ValueError("CI/Release requires bootstrap coverage stats.")
+
+    preview_used = _coerce_count(coverage.get("preview", {}).get("used"))
+    final_used = _coerce_count(coverage.get("final", {}).get("used"))
+    replicates_used = _coerce_count(coverage.get("replicates", {}).get("used"))
+
+    if replicates_used is None:
+        bootstrap = stats.get("bootstrap")
+        if isinstance(bootstrap, dict):
+            replicates_used = _coerce_count(
+                bootstrap.get("replicates", bootstrap.get("n"))
+            )
+
+    if preview_used is None or final_used is None or replicates_used is None:
+        raise ValueError("CI/Release requires preview/final/replicates coverage stats.")
+
+    if preview_used < preview_floor or final_used < final_floor:
+        raise ValueError(
+            "CI/Release requires preview/final coverage at or above tier floors "
+            f"(preview={preview_used}/{preview_floor}, final={final_used}/{final_floor})."
+        )
+    if replicates_used < replicates_floor:
+        raise ValueError(
+            "CI/Release requires bootstrap replicates at or above tier floors "
+            f"(replicates={replicates_used}/{replicates_floor})."
+        )
+
+
 def _fallback_paired_windows(
     paired_windows: int, coverage_summary: dict[str, Any]
 ) -> int:
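The key invariant `_enforce_display_ci_alignment` checks: for perplexity-like metrics under a paired baseline, the displayed ratio CI must equal the elementwise exponential of the log-space CI, within a small relative tolerance. In dev profiles the function repairs `display_ci`; under `ci`/`release` it raises. A standalone illustration of the relationship (toy numbers, not invarlock output):

```python
import math

# Paired log-loss delta CI (log space), as stored in primary_metric["ci"].
ci = (-0.0105, 0.0213)

# display_ci must be exp(ci) bound-for-bound for ppl-like metrics.
expected = [math.exp(b) for b in ci]  # ≈ [0.98955, 1.02153]

# Tolerance used by the new check: 5e-4 relative to each expected bound.
tol = [5e-4 * max(1.0, abs(e)) for e in expected]
```

`_enforce_pairing_and_coverage` is stricter still: under `ci`/`release` it demands a window match fraction of effectively 1.0, zero window overlap, equal preview/final counts, and coverage at or above the tier floors imported from `invarlock.core.runner`.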
@@ -598,6 +767,18 @@ def make_certificate(
     except Exception:  # pragma: no cover
         pass
 
+    # Determinism preset (CI/Release provenance) when present.
+    try:
+        det = (
+            report.get("meta", {}).get("determinism")
+            if isinstance(report.get("meta"), dict)
+            else None
+        )
+        if isinstance(det, dict) and det:
+            meta["determinism"] = det
+    except Exception:  # pragma: no cover
+        pass
+
     tokenizer_hash_meta = report["meta"].get("tokenizer_hash")
     if not tokenizer_hash_meta:
         dataset_section = report.get("data", {})
@@ -627,6 +808,13 @@ def make_certificate(
 
     # Extract dataset configuration and compute hashes
     dataset_info = _extract_dataset_info(report)
+    try:
+        if isinstance(dataset_info, dict):
+            windows = dataset_info.get("windows")
+            if isinstance(windows, dict):
+                windows.setdefault("stats", {})
+    except Exception:  # pragma: no cover
+        pass
 
     # Baseline reference (PM-only). Derive a primary_metric snapshot from baseline windows.
     # Prefer explicit baseline primary_metric when provided; otherwise compute from windows
@@ -741,15 +929,17 @@ def make_certificate(
             tier = str(auto_cfg.get("tier")).lower()
         except Exception:  # pragma: no cover
             pass
+    tier_policies = get_tier_policies()
+    tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
     metrics_policy = (
-        …
-        if isinstance(tier, str)
-        else {}
+        tier_defaults.get("metrics", {}) if isinstance(tier_defaults, dict) else {}
     )
-    …
-    metrics_policy.get("…
+    pm_policy = (
+        metrics_policy.get("pm_ratio", {})
+        if isinstance(metrics_policy, dict)
+        else {}
     )
-    min_tokens = int(…
+    min_tokens = int(pm_policy.get("min_tokens", 0))
     if (
         isinstance(total_tokens, int)
         and min_tokens > 0
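`min_tokens` now comes from the tier policy (`metrics.pm_ratio.min_tokens`) instead of the removed `TIER_POLICIES` lookup. A hedged sketch of the lookup chain with placeholder data (the real floors live in the packaged tiers.yaml):

```python
from typing import Any

# Placeholder tier entry, shaped like get_tier_policies() output.
tier_defaults: dict[str, Any] = {"metrics": {"pm_ratio": {"min_tokens": 50_000}}}

metrics_policy = tier_defaults.get("metrics", {}) if isinstance(tier_defaults, dict) else {}
pm_policy = metrics_policy.get("pm_ratio", {}) if isinstance(metrics_policy, dict) else {}
min_tokens = int(pm_policy.get("min_tokens", 0))  # a missing key degrades to 0 (floor disabled)
```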
@@ -786,6 +976,47 @@ def make_certificate(
     if paired:
         paired_run, paired_base = paired
         paired_windows = len(paired_run)
+        paired_weights: list[float] | None = None
+        try:
+            run_ids = (
+                run_windows.get("window_ids") if isinstance(run_windows, dict) else None
+            )
+            run_w = (
+                run_windows.get("token_counts")
+                if isinstance(run_windows, dict)
+                else None
+            )
+            base_ids = (
+                baseline_windows.get("window_ids")
+                if isinstance(baseline_windows, dict)
+                else None
+            )
+            if (
+                isinstance(run_ids, list)
+                and isinstance(run_w, list)
+                and isinstance(base_ids, list)
+            ):
+                base_set = {
+                    int(b_id) for b_id in base_ids if isinstance(b_id, int | float)
+                }
+                weights: list[float] = []
+                for r_id, w in zip(run_ids, run_w, strict=False):
+                    if not isinstance(r_id, int | float):
+                        continue
+                    key = int(r_id)
+                    if key not in base_set:
+                        continue
+                    try:
+                        wv = float(w)
+                    except Exception:
+                        continue
+                    if not math.isfinite(wv):
+                        continue
+                    weights.append(float(max(wv, 0.0)))
+                if weights:
+                    paired_weights = weights
+        except Exception:  # pragma: no cover
+            paired_weights = None
         method = str(metrics_bootstrap.get("method", "percentile")).lower()
         replicates = int(
             metrics_bootstrap.get(
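The new block derives per-window weights from run token counts, keeping only windows whose IDs also appear in the baseline. A condensed, standalone version of the same intersection logic (toy data):

```python
import math

run_windows = {"window_ids": [0, 1, 2, 3], "token_counts": [512, 498, 505, 12]}
baseline_windows = {"window_ids": [0, 1, 3]}  # window 2 has no baseline partner

base_set = {int(i) for i in baseline_windows["window_ids"]}
weights = [
    max(float(w), 0.0)
    for i, w in zip(run_windows["window_ids"], run_windows["token_counts"])
    if int(i) in base_set and math.isfinite(float(w))
]
print(weights)  # [512.0, 498.0, 12.0] — window 2 dropped, order preserved
```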
@@ -813,6 +1044,7 @@ def make_certificate(
         delta_ci = compute_paired_delta_log_ci(
             paired_run,
             paired_base,
+            weights=paired_weights,
             method=ci_method,
             replicates=replicates,
             alpha=alpha,
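Threading `weights=paired_weights` into `compute_paired_delta_log_ci` suggests the paired deltas are token-weighted when the CI is computed; the actual implementation lives in `invarlock.core.bootstrap` and is not part of this diff. A sketch of what a token-weighted point estimate of the delta would look like, offered only as an assumption about the intent:

```python
# Toy per-window log-losses for the run and its paired baseline.
run_logloss = [2.31, 2.27, 2.40]
base_logloss = [2.30, 2.29, 2.38]
weights = [512.0, 498.0, 12.0]  # token counts from the matched windows

deltas = [r - b for r, b in zip(run_logloss, base_logloss)]
weighted_delta = sum(w * d for w, d in zip(weights, deltas)) / sum(weights)
# ≈ -0.0045: the tiny third window barely moves the estimate.
```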
@@ -1053,6 +1285,115 @@ def make_certificate(
             if key in metrics_stats_source:
                 ppl_analysis["stats"][key] = metrics_stats_source[key]
 
+    # Derive requested/actual window counts for auditability when runners do not
+    # emit a metrics.stats block (normalization may also drop it).
+    try:
+        stats_obj = ppl_analysis.get("stats", {})
+        if isinstance(stats_obj, dict):
+
+            def _as_count(value: Any) -> int | None:
+                if value is None or isinstance(value, bool):
+                    return None
+                if isinstance(value, int):
+                    return int(value) if value >= 0 else None
+                if isinstance(value, float) and math.isfinite(value):
+                    if abs(value - round(value)) > 1e-9 or value < 0:
+                        return None
+                    return int(round(value))
+                return None
+
+            data_cfg = report.get("data", {}) if isinstance(report, dict) else {}
+            data_cfg = data_cfg if isinstance(data_cfg, dict) else {}
+            windows_cfg = (
+                dataset_info.get("windows", {})
+                if isinstance(dataset_info, dict)
+                else {}
+            )
+            windows_cfg = windows_cfg if isinstance(windows_cfg, dict) else {}
+
+            req_prev = _as_count(stats_obj.get("requested_preview"))
+            if req_prev is None:
+                req_prev = _as_count(data_cfg.get("preview_n"))
+            if req_prev is None:
+                req_prev = _as_count(windows_cfg.get("preview"))
+
+            req_fin = _as_count(stats_obj.get("requested_final"))
+            if req_fin is None:
+                req_fin = _as_count(data_cfg.get("final_n"))
+            if req_fin is None:
+                req_fin = _as_count(windows_cfg.get("final"))
+
+            eval_windows = (
+                report.get("evaluation_windows", {}) if isinstance(report, dict) else {}
+            )
+            eval_windows = eval_windows if isinstance(eval_windows, dict) else {}
+
+            def _len_ids(section: Any) -> int | None:
+                if not isinstance(section, dict):
+                    return None
+                ids = section.get("window_ids")
+                if isinstance(ids, list):
+                    return int(len(ids))
+                return None
+
+            act_prev = _as_count(stats_obj.get("actual_preview"))
+            if act_prev is None:
+                act_prev = _len_ids(eval_windows.get("preview"))
+            if act_prev is None:
+                cov_prev = (
+                    coverage_summary.get("preview")
+                    if isinstance(coverage_summary, dict)
+                    else None
+                )
+                if isinstance(cov_prev, dict):
+                    act_prev = _as_count(cov_prev.get("used"))
+            if act_prev is None:
+                act_prev = req_prev
+
+            act_fin = _as_count(stats_obj.get("actual_final"))
+            if act_fin is None:
+                act_fin = _len_ids(eval_windows.get("final"))
+            if act_fin is None:
+                cov_fin = (
+                    coverage_summary.get("final")
+                    if isinstance(coverage_summary, dict)
+                    else None
+                )
+                if isinstance(cov_fin, dict):
+                    act_fin = _as_count(cov_fin.get("used"))
+                elif isinstance(coverage_summary, dict):
+                    act_fin = _as_count(coverage_summary.get("used"))
+            if act_fin is None:
+                act_fin = req_fin
+
+            if req_prev is not None:
+                stats_obj["requested_preview"] = req_prev
+            if req_fin is not None:
+                stats_obj["requested_final"] = req_fin
+            if act_prev is not None:
+                stats_obj["actual_preview"] = act_prev
+            if act_fin is not None:
+                stats_obj["actual_final"] = act_fin
+
+            if "coverage_ok" not in stats_obj:
+                if (
+                    isinstance(req_prev, int)
+                    and isinstance(req_fin, int)
+                    and isinstance(act_prev, int)
+                    and isinstance(act_fin, int)
+                ):
+                    stats_obj["coverage_ok"] = (act_prev >= req_prev) and (
+                        act_fin >= req_fin
+                    )
+    except Exception:  # pragma: no cover
+        pass
+
+    _enforce_pairing_and_coverage(
+        ppl_analysis.get("stats", {}),
+        window_plan_profile,
+        auto.get("tier", "balanced"),
+    )
+
     if isinstance(window_plan_ctx, dict):
         ppl_analysis["window_plan"] = window_plan_ctx
 
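Net effect of the backfill: certificates carry requested vs. actual window counts even when the runner omitted `metrics.stats`, and `coverage_ok` is derived when absent. A toy example of the resulting shape:

```python
# Hypothetical stats block after backfill; key names mirror the diff.
stats = {
    "requested_preview": 200,  # from metrics.stats, data.preview_n, or windows.preview
    "requested_final": 200,    # from metrics.stats, data.final_n, or windows.final
    "actual_preview": 200,     # from evaluation_windows or coverage fallbacks
    "actual_final": 198,
}
stats.setdefault(
    "coverage_ok",
    stats["actual_preview"] >= stats["requested_preview"]
    and stats["actual_final"] >= stats["requested_final"],
)
print(stats["coverage_ok"])  # False — final coverage fell short of the request
```

Under a `ci`/`release` profile, the unequal preview/final counts above would make `_enforce_pairing_and_coverage` raise rather than merely flag.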
@@ -1102,17 +1443,62 @@ def make_certificate(
     if variance_policy_digest:
         policies["variance"]["policy_digest"] = variance_policy_digest
 
+    # Resolve tier/profile policy (canonical) and merge observed guard policies.
+    profile = None
+    explicit_overrides = None
+    try:
+        ctx = report.get("context") if isinstance(report, dict) else None
+        if isinstance(ctx, dict) and ctx.get("profile"):
+            profile = str(ctx.get("profile"))
+    except Exception:
+        profile = None
+    try:
+        window_plan = (
+            report.get("metrics", {}).get("window_plan")
+            if isinstance(report.get("metrics"), dict)
+            else None
+        )
+        if (
+            profile is None
+            and isinstance(window_plan, dict)
+            and window_plan.get("profile")
+        ):
+            profile = str(window_plan.get("profile"))
+    except Exception:
+        profile = None
+    try:
+        meta_cfg = (
+            report.get("meta", {}).get("config")
+            if isinstance(report.get("meta"), dict)
+            else None
+        )
+        if isinstance(meta_cfg, dict) and isinstance(meta_cfg.get("guards"), dict):
+            explicit_overrides = meta_cfg.get("guards")
+        if explicit_overrides is None and isinstance(report.get("config"), dict):
+            cfg2 = report.get("config")
+            if isinstance(cfg2.get("guards"), dict):
+                explicit_overrides = cfg2.get("guards")
+    except Exception:
+        explicit_overrides = None
+
     resolved_policy = _build_resolved_policies(
-        auto.get("tier", "balanced"),
+        auto.get("tier", "balanced"),
+        spectral,
+        rmt,
+        variance,
+        profile=profile,
+        explicit_overrides=explicit_overrides,
+    )
+    overrides_list = _extract_policy_overrides(report)
+    resolved_digest = _compute_policy_digest(
+        {"resolved_policy": resolved_policy, "overrides": overrides_list}
     )
-    resolved_digest = _compute_policy_digest(resolved_policy)
-    policy_digest_value = variance_policy_digest or resolved_digest
     policy_provenance = {
         "tier": auto.get("tier", "balanced"),
-        "overrides": …
-        "policy_digest": …
+        "overrides": overrides_list,
+        "policy_digest": resolved_digest,
     }
-    auto["policy_digest"] = …
+    auto["policy_digest"] = resolved_digest
 
     for guard_name in ("spectral", "rmt", "variance"):
         if guard_name in resolved_policy:
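The digest now covers the resolved policy together with the extracted overrides, so changing either changes the provenance hash. `_compute_policy_digest` itself is unchanged here and its internals are not shown in this diff; a plausible sketch, assuming a SHA-256 over canonically serialized JSON:

```python
import hashlib
import json
from typing import Any

def policy_digest(payload: dict[str, Any]) -> str:
    # Sorted keys + fixed separators give a stable, canonical serialization.
    canonical = json.dumps(payload, sort_keys=True, separators=(",", ":"))
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()

digest = policy_digest(
    {"resolved_policy": {"spectral": {"max_caps": 5}}, "overrides": []}
)
```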
@@ -1473,16 +1859,17 @@ def make_certificate(
         or (baseline_hash != thresholds_hash)
     )
 
-    # Hysteresis knobs snapshot
-    …
-    …
-    …
+    # Hysteresis knobs snapshot (policy-resolved)
+    metrics_policy = (
+        resolved_policy.get("metrics", {}) if isinstance(resolved_policy, dict) else {}
+    )
+    if not isinstance(metrics_policy, dict):
         metrics_policy = {}
     ppl_hys = 0.0
     acc_hys = 0.0
     try:
         ppl_hys = float(
-            (metrics_policy.get("…
+            (metrics_policy.get("pm_ratio") or {}).get("hysteresis_ratio", 0.0) or 0.0
         )
         acc_hys = float(
             (metrics_policy.get("accuracy") or {}).get("hysteresis_delta_pp", 0.0)
@@ -1725,6 +2112,12 @@ def make_certificate(
     from .primary_metric_utils import attach_primary_metric as _attach_pm
 
     _attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
+    _enforce_display_ci_alignment(
+        ratio_ci_source,
+        certificate.get("primary_metric"),
+        logloss_delta_ci,
+        window_plan_profile,
+    )
 
     # Ensure primary_metric has display_ci populated for schema invariants
     try:
@@ -2204,11 +2597,24 @@ def _format_epsilon_map(epsilon_map: Any) -> dict[str, float]:
 
 
 def _build_resolved_policies(
-    tier: str,
+    tier: str,
+    spectral: dict[str, Any],
+    rmt: dict[str, Any],
+    variance: dict[str, Any],
+    *,
+    profile: str | None = None,
+    explicit_overrides: dict[str, dict[str, Any]] | None = None,
 ) -> dict[str, Any]:
     from .policy_utils import _build_resolved_policies as _impl
 
-    return _impl(…
+    return _impl(
+        tier,
+        spectral,
+        rmt,
+        variance,
+        profile=profile,
+        explicit_overrides=explicit_overrides,
+    )
 
 
 def _compute_policy_digest(policy: dict[str, Any]) -> str:
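Usage of the widened signature, with placeholder guard dicts (the real values come from the run report; `_build_resolved_policies` is a private helper, shown here only to illustrate the new call shape):

```python
from invarlock.reporting.certificate import _build_resolved_policies

resolved = _build_resolved_policies(
    "balanced",
    {"max_caps": 5},          # spectral, as observed in the report
    {},                       # rmt
    {},                       # variance
    profile="release",        # optional: ci/release tightens enforcement
    explicit_overrides=None,  # optional: guards block from meta.config
)
```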
@@ -2279,6 +2685,23 @@ def _prepare_guard_overhead_section(
         "threshold_percent": threshold * 100,
         "source": str(payload.get("source", "report")),
     }
+    try:
+        mode = payload.get("mode")
+        if mode is None:
+            mode = payload.get("guard_overhead_mode")
+        if isinstance(mode, str) and mode.strip():
+            sanitized["mode"] = mode.strip()
+    except Exception:
+        pass
+    try:
+        skipped = bool(payload.get("skipped", False))
+        if skipped:
+            sanitized["skipped"] = True
+            reason = payload.get("skip_reason")
+            if isinstance(reason, str) and reason.strip():
+                sanitized["skip_reason"] = reason.strip()
+    except Exception:
+        pass
 
     # Prefer structured reports and reuse the validator when available
     bare_report = payload.pop("bare_report", None)
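The overhead section now records an optional `mode` (falling back to the `guard_overhead_mode` key) and a `skipped` flag with its trimmed reason. A condensed sketch of payload → sanitized output:

```python
payload = {
    "skipped": True,
    "skip_reason": "  no bare run available ",
    "guard_overhead_mode": "paired",
}

sanitized: dict[str, object] = {}
mode = payload.get("mode")
if mode is None:
    mode = payload.get("guard_overhead_mode")
if isinstance(mode, str) and mode.strip():
    sanitized["mode"] = mode.strip()
if payload.get("skipped"):
    sanitized["skipped"] = True
    reason = payload.get("skip_reason")
    if isinstance(reason, str) and reason.strip():
        sanitized["skip_reason"] = reason.strip()

# sanitized == {"mode": "paired", "skipped": True,
#               "skip_reason": "no bare run available"}
```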
@@ -2292,8 +2715,8 @@ def _prepare_guard_overhead_section(
             {
                 "overhead_ratio": metrics.get("overhead_ratio"),
                 "overhead_percent": metrics.get("overhead_percent"),
-                "…
-                "…
+                "bare_ppl": metrics.get("bare_ppl"),
+                "guarded_ppl": metrics.get("guarded_ppl"),
                 "messages": list(result.messages),
                 "warnings": list(result.warnings),
                 "errors": list(result.errors),
@@ -2305,12 +2728,8 @@ def _prepare_guard_overhead_section(
         return sanitized, bool(result.passed)
 
     # Fall back to direct ratio computation when reports are not provided
-    bare_ppl = _coerce_float(payload.get("bare_final")) or _coerce_float(
-        payload.get("bare_ppl")
-    )
-    guarded_ppl = _coerce_float(payload.get("guarded_final")) or _coerce_float(
-        payload.get("guarded_ppl")
-    )
+    bare_ppl = _coerce_float(payload.get("bare_ppl"))
+    guarded_ppl = _coerce_float(payload.get("guarded_ppl"))
     ratio = _coerce_float(payload.get("overhead_ratio"))
 
     if ratio is None and bare_ppl is not None and guarded_ppl is not None:
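With the simplified keys, the fallback path reads `bare_ppl` and `guarded_ppl` directly and derives the ratio only when it is not supplied. The arithmetic, assuming the ratio is guarded-over-bare (consistent with an overhead above 1.0):

```python
bare_ppl = 12.40     # perplexity without guards
guarded_ppl = 12.52  # perplexity with guards enabled

overhead_ratio = guarded_ppl / bare_ppl            # ≈ 1.0097
overhead_percent = (overhead_ratio - 1.0) * 100.0  # ≈ 0.97%
```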
@@ -2449,6 +2868,12 @@ def _propagate_pairing_stats(
     coverage = pa_stats.get("coverage")
     if isinstance(coverage, dict) and coverage:
         stats["coverage"] = coverage
+    bootstrap = pa_stats.get("bootstrap")
+    if isinstance(bootstrap, dict) and bootstrap:
+        stats["bootstrap"] = bootstrap
+    paired_delta_summary = pa_stats.get("paired_delta_summary")
+    if isinstance(paired_delta_summary, dict) and paired_delta_summary:
+        stats["paired_delta_summary"] = paired_delta_summary
     wmf = pa_stats.get("window_match_fraction")
     if wmf is not None:
         stats["window_match_fraction"] = wmf
@@ -2674,12 +3099,31 @@ def _compute_validation_flags(
     }
     if _tiny_relax:
         tier = "aggressive"
+
     tier_thresholds = {
         "conservative": 1.05,
         "balanced": 1.10,
         "aggressive": 1.20,
         "none": 1.10,
     }
+    tier_policies = get_tier_policies()
+    tier_policy = tier_policies.get(tier, tier_policies.get("balanced", {}))
+    metrics_policy = (
+        tier_policy.get("metrics", {}) if isinstance(tier_policy, dict) else {}
+    )
+    pm_policy = (
+        metrics_policy.get("pm_ratio", {}) if isinstance(metrics_policy, dict) else {}
+    )
+    ratio_limit_base = pm_policy.get("ratio_limit_base")
+    try:
+        if ratio_limit_base is not None:
+            ratio_limit_base = float(ratio_limit_base)
+    except Exception:
+        ratio_limit_base = None
+    if not isinstance(ratio_limit_base, (int | float)) or not math.isfinite(
+        float(ratio_limit_base)
+    ):
+        ratio_limit_base = float(tier_thresholds.get(tier, 1.10))
     acceptance = pm_acceptance_range if isinstance(pm_acceptance_range, dict) else {}
     ratio_min_bound = None
     ratio_max_bound = None
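`ratio_limit_base` now prefers the tier policy and only falls back to the hard-coded `tier_thresholds` map when the policy value is missing or non-finite. A compact sketch of the resolution order:

```python
import math

tier_thresholds = {"conservative": 1.05, "balanced": 1.10,
                   "aggressive": 1.20, "none": 1.10}
pm_policy: dict = {}  # e.g. this tier's policy omits ratio_limit_base

base = pm_policy.get("ratio_limit_base")
try:
    base = float(base) if base is not None else None
except (TypeError, ValueError):
    base = None
if base is None or not math.isfinite(base):
    base = float(tier_thresholds.get("balanced", 1.10))
print(base)  # 1.1 — the hard-coded fallback wins
```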
@@ -2697,7 +3141,7 @@ def _compute_validation_flags(
     ratio_limit = (
         ratio_max_bound
         if isinstance(ratio_max_bound, (int | float)) and math.isfinite(ratio_max_bound)
-        else …
+        else float(ratio_limit_base)
     )
     if isinstance(target_ratio, int | float) and target_ratio > 0:
         ratio_limit = min(ratio_limit, float(target_ratio))
@@ -2726,13 +3170,6 @@ def _compute_validation_flags(
     except Exception:  # pragma: no cover
         pass
     # Hysteresis and sample-size floors from tier policies
-    tier_policy = TIER_POLICIES.get(tier, {}) if isinstance(tier, str) else {}
-    metrics_policy = (
-        tier_policy.get("metrics", {}) if isinstance(tier_policy, dict) else {}
-    )
-    pm_policy = (
-        metrics_policy.get("pm_ratio", {}) if isinstance(metrics_policy, dict) else {}
-    )
     hysteresis_ratio = float(pm_policy.get("hysteresis_ratio", 0.0))
     min_tokens = int(pm_policy.get("min_tokens", 0))
     # Evaluate sample-size sufficiency
@@ -2804,7 +3241,9 @@ def _compute_validation_flags(
     summary = spectral.get("summary", {}) if isinstance(spectral, dict) else {}
     max_caps = spectral.get("max_caps") or summary.get("max_caps")
     if max_caps is None:
-        default_spectral = …
+        default_spectral = (
+            tier_policy.get("spectral", {}) if isinstance(tier_policy, dict) else {}
+        )
         max_caps = default_spectral.get("max_caps", 5)
     spectral_stable = spectral.get("caps_applied", 0) <= int(max_caps)
     if spectral.get("caps_exceeded"):
@@ -2871,14 +3310,6 @@ def _compute_validation_flags(
         flags["primary_metric_acceptable"] = bool(ok)
     elif kind in {"accuracy", "vqa_accuracy"}:
         # Read thresholds from tier policy if available
-        tier_policy = (
-            TIER_POLICIES.get(tier, {}) if isinstance(tier, str) else {}
-        )
-        metrics_policy = (
-            tier_policy.get("metrics", {})
-            if isinstance(tier_policy, dict)
-            else {}
-        )
         acc_policy = (
             metrics_policy.get("accuracy", {})
             if isinstance(metrics_policy, dict)