invarlock-0.3.1-py3-none-any.whl → invarlock-0.3.2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- invarlock/__init__.py +1 -1
- invarlock/_data/runtime/tiers.yaml +61 -0
- invarlock/adapters/hf_loading.py +97 -0
- invarlock/calibration/__init__.py +6 -0
- invarlock/calibration/spectral_null.py +301 -0
- invarlock/calibration/variance_ve.py +154 -0
- invarlock/cli/app.py +15 -0
- invarlock/cli/commands/calibrate.py +576 -0
- invarlock/cli/commands/doctor.py +9 -3
- invarlock/cli/commands/explain_gates.py +53 -9
- invarlock/cli/commands/plugins.py +12 -2
- invarlock/cli/commands/run.py +175 -79
- invarlock/cli/commands/verify.py +40 -0
- invarlock/cli/determinism.py +237 -0
- invarlock/core/auto_tuning.py +215 -17
- invarlock/core/registry.py +9 -4
- invarlock/eval/bench.py +467 -141
- invarlock/eval/bench_regression.py +12 -0
- invarlock/eval/data.py +29 -7
- invarlock/guards/spectral.py +216 -9
- invarlock/guards/variance.py +6 -3
- invarlock/reporting/certificate.py +249 -37
- invarlock/reporting/certificate_schema.py +4 -1
- invarlock/reporting/guards_analysis.py +108 -10
- invarlock/reporting/normalizer.py +21 -1
- invarlock/reporting/policy_utils.py +100 -16
- {invarlock-0.3.1.dist-info → invarlock-0.3.2.dist-info}/METADATA +12 -10
- {invarlock-0.3.1.dist-info → invarlock-0.3.2.dist-info}/RECORD +32 -25
- {invarlock-0.3.1.dist-info → invarlock-0.3.2.dist-info}/WHEEL +0 -0
- {invarlock-0.3.1.dist-info → invarlock-0.3.2.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.1.dist-info → invarlock-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.1.dist-info → invarlock-0.3.2.dist-info}/top_level.txt +0 -0
invarlock/reporting/certificate.py

@@ -29,7 +29,7 @@ try:  # pragma: no cover - exercised in integration
 except Exception:  # pragma: no cover
     jsonschema = None  # type: ignore
 
-from invarlock.core.auto_tuning import TIER_POLICIES
+from invarlock.core.auto_tuning import get_tier_policies
 from invarlock.core.bootstrap import (
     compute_paired_delta_log_ci,
     logspace_to_ratio_ci,
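
The recurring change in this release replaces the module-level `TIER_POLICIES` constant with a `get_tier_policies()` accessor; the new packaged data file `invarlock/_data/runtime/tiers.yaml` suggests tier policies now ship as data rather than code. A minimal sketch of the lookup pattern the hunks below repeat, assuming only that the accessor returns a tier-keyed mapping (the stub and its values are hypothetical, not the real loader):

    from typing import Any

    def get_tier_policies() -> dict[str, dict[str, Any]]:
        # Hypothetical stand-in; the real accessor presumably reads
        # invarlock/_data/runtime/tiers.yaml from the installed wheel.
        return {
            "conservative": {"metrics": {"pm_ratio": {"min_tokens": 0}}},
            "balanced": {"metrics": {"pm_ratio": {"min_tokens": 0}}},
        }

    def resolve_tier_defaults(tier: str) -> dict[str, Any]:
        # Fallback chain used throughout the diff: unknown tiers degrade to
        # "balanced", then to an empty mapping.
        tier_policies = get_tier_policies()
        return tier_policies.get(tier, tier_policies.get("balanced", {}))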
@@ -598,6 +598,18 @@ def make_certificate(
     except Exception:  # pragma: no cover
         pass
 
+    # Determinism preset (CI/Release provenance) when present.
+    try:
+        det = (
+            report.get("meta", {}).get("determinism")
+            if isinstance(report.get("meta"), dict)
+            else None
+        )
+        if isinstance(det, dict) and det:
+            meta["determinism"] = det
+    except Exception:  # pragma: no cover
+        pass
+
     tokenizer_hash_meta = report["meta"].get("tokenizer_hash")
     if not tokenizer_hash_meta:
         dataset_section = report.get("data", {})
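
The new block forwards a `meta.determinism` mapping from the run report into the certificate, so the determinism preset captured by the new `invarlock/cli/determinism.py` module survives into the emitted certificate. A toy run of the same guarded copy, with a hypothetical report payload (the field names inside the determinism dict are illustrative only):

    report = {"meta": {"determinism": {"preset": "ci", "seed": 42}}}
    meta: dict[str, object] = {}

    det = (
        report.get("meta", {}).get("determinism")
        if isinstance(report.get("meta"), dict)
        else None
    )
    if isinstance(det, dict) and det:
        meta["determinism"] = det  # copied verbatim into certificate meta
    assert meta["determinism"] == {"preset": "ci", "seed": 42}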
@@ -627,6 +639,13 @@ def make_certificate(
 
     # Extract dataset configuration and compute hashes
     dataset_info = _extract_dataset_info(report)
+    try:
+        if isinstance(dataset_info, dict):
+            windows = dataset_info.get("windows")
+            if isinstance(windows, dict):
+                windows.setdefault("stats", {})
+    except Exception:  # pragma: no cover
+        pass
 
     # Baseline reference (PM-only). Derive a primary_metric snapshot from baseline windows.
     # Prefer explicit baseline primary_metric when provided; otherwise compute from windows
@@ -741,15 +760,17 @@ def make_certificate(
         tier = str(auto_cfg.get("tier")).lower()
     except Exception:  # pragma: no cover
         pass
+    tier_policies = get_tier_policies()
+    tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
     metrics_policy = (
-
-        if isinstance(tier, str)
-        else {}
+        tier_defaults.get("metrics", {}) if isinstance(tier_defaults, dict) else {}
     )
-
-        metrics_policy.get("
+    pm_policy = (
+        metrics_policy.get("pm_ratio", {})
+        if isinstance(metrics_policy, dict)
+        else {}
     )
-    min_tokens = int(
+    min_tokens = int(pm_policy.get("min_tokens", 0))
     if (
         isinstance(total_tokens, int)
         and min_tokens > 0
@@ -1053,6 +1074,109 @@ def make_certificate(
             if key in metrics_stats_source:
                 ppl_analysis["stats"][key] = metrics_stats_source[key]
 
+    # Derive requested/actual window counts for auditability when runners do not
+    # emit a metrics.stats block (normalization may also drop it).
+    try:
+        stats_obj = ppl_analysis.get("stats", {})
+        if isinstance(stats_obj, dict):
+
+            def _as_count(value: Any) -> int | None:
+                if value is None or isinstance(value, bool):
+                    return None
+                if isinstance(value, int):
+                    return int(value) if value >= 0 else None
+                if isinstance(value, float) and math.isfinite(value):
+                    if abs(value - round(value)) > 1e-9 or value < 0:
+                        return None
+                    return int(round(value))
+                return None
+
+            data_cfg = report.get("data", {}) if isinstance(report, dict) else {}
+            data_cfg = data_cfg if isinstance(data_cfg, dict) else {}
+            windows_cfg = (
+                dataset_info.get("windows", {})
+                if isinstance(dataset_info, dict)
+                else {}
+            )
+            windows_cfg = windows_cfg if isinstance(windows_cfg, dict) else {}
+
+            req_prev = _as_count(stats_obj.get("requested_preview"))
+            if req_prev is None:
+                req_prev = _as_count(data_cfg.get("preview_n"))
+            if req_prev is None:
+                req_prev = _as_count(windows_cfg.get("preview"))
+
+            req_fin = _as_count(stats_obj.get("requested_final"))
+            if req_fin is None:
+                req_fin = _as_count(data_cfg.get("final_n"))
+            if req_fin is None:
+                req_fin = _as_count(windows_cfg.get("final"))
+
+            eval_windows = (
+                report.get("evaluation_windows", {}) if isinstance(report, dict) else {}
+            )
+            eval_windows = eval_windows if isinstance(eval_windows, dict) else {}
+
+            def _len_ids(section: Any) -> int | None:
+                if not isinstance(section, dict):
+                    return None
+                ids = section.get("window_ids")
+                if isinstance(ids, list):
+                    return int(len(ids))
+                return None
+
+            act_prev = _as_count(stats_obj.get("actual_preview"))
+            if act_prev is None:
+                act_prev = _len_ids(eval_windows.get("preview"))
+            if act_prev is None:
+                cov_prev = (
+                    coverage_summary.get("preview")
+                    if isinstance(coverage_summary, dict)
+                    else None
+                )
+                if isinstance(cov_prev, dict):
+                    act_prev = _as_count(cov_prev.get("used"))
+            if act_prev is None:
+                act_prev = req_prev
+
+            act_fin = _as_count(stats_obj.get("actual_final"))
+            if act_fin is None:
+                act_fin = _len_ids(eval_windows.get("final"))
+            if act_fin is None:
+                cov_fin = (
+                    coverage_summary.get("final")
+                    if isinstance(coverage_summary, dict)
+                    else None
+                )
+                if isinstance(cov_fin, dict):
+                    act_fin = _as_count(cov_fin.get("used"))
+                elif isinstance(coverage_summary, dict):
+                    act_fin = _as_count(coverage_summary.get("used"))
+            if act_fin is None:
+                act_fin = req_fin
+
+            if req_prev is not None:
+                stats_obj.setdefault("requested_preview", req_prev)
+            if req_fin is not None:
+                stats_obj.setdefault("requested_final", req_fin)
+            if act_prev is not None:
+                stats_obj.setdefault("actual_preview", act_prev)
+            if act_fin is not None:
+                stats_obj.setdefault("actual_final", act_fin)
+
+            if "coverage_ok" not in stats_obj:
+                if (
+                    isinstance(req_prev, int)
+                    and isinstance(req_fin, int)
+                    and isinstance(act_prev, int)
+                    and isinstance(act_fin, int)
+                ):
+                    stats_obj["coverage_ok"] = (act_prev >= req_prev) and (
+                        act_fin >= req_fin
+                    )
+    except Exception:  # pragma: no cover
+        pass
+
     if isinstance(window_plan_ctx, dict):
         ppl_analysis["window_plan"] = window_plan_ctx
 
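
`_as_count` is deliberately strict: it accepts only non-negative integral values, rejecting bools (which are `int` subclasses), negatives, and non-integral or non-finite floats, so heterogeneous report fields can feed the `coverage_ok` check safely. A standalone check of its behavior, copied from the hunk above:

    import math
    from typing import Any

    def _as_count(value: Any) -> int | None:
        if value is None or isinstance(value, bool):
            return None
        if isinstance(value, int):
            return int(value) if value >= 0 else None
        if isinstance(value, float) and math.isfinite(value):
            if abs(value - round(value)) > 1e-9 or value < 0:
                return None
            return int(round(value))
        return None

    assert _as_count(5) == 5
    assert _as_count(5.0) == 5         # integral floats are coerced
    assert _as_count(True) is None     # bools are rejected despite being ints
    assert _as_count(-1) is None       # negative counts are invalid
    assert _as_count(2.5) is None      # non-integral floats are invalid
    assert _as_count(float("nan")) is None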
@@ -1102,17 +1226,62 @@ def make_certificate(
     if variance_policy_digest:
         policies["variance"]["policy_digest"] = variance_policy_digest
 
+    # Resolve tier/profile policy (canonical) and merge observed guard policies.
+    profile = None
+    explicit_overrides = None
+    try:
+        ctx = report.get("context") if isinstance(report, dict) else None
+        if isinstance(ctx, dict) and ctx.get("profile"):
+            profile = str(ctx.get("profile"))
+    except Exception:
+        profile = None
+    try:
+        window_plan = (
+            report.get("metrics", {}).get("window_plan")
+            if isinstance(report.get("metrics"), dict)
+            else None
+        )
+        if (
+            profile is None
+            and isinstance(window_plan, dict)
+            and window_plan.get("profile")
+        ):
+            profile = str(window_plan.get("profile"))
+    except Exception:
+        profile = None
+    try:
+        meta_cfg = (
+            report.get("meta", {}).get("config")
+            if isinstance(report.get("meta"), dict)
+            else None
+        )
+        if isinstance(meta_cfg, dict) and isinstance(meta_cfg.get("guards"), dict):
+            explicit_overrides = meta_cfg.get("guards")
+        if explicit_overrides is None and isinstance(report.get("config"), dict):
+            cfg2 = report.get("config")
+            if isinstance(cfg2.get("guards"), dict):
+                explicit_overrides = cfg2.get("guards")
+    except Exception:
+        explicit_overrides = None
+
     resolved_policy = _build_resolved_policies(
-        auto.get("tier", "balanced"),
+        auto.get("tier", "balanced"),
+        spectral,
+        rmt,
+        variance,
+        profile=profile,
+        explicit_overrides=explicit_overrides,
+    )
+    overrides_list = _extract_policy_overrides(report)
+    resolved_digest = _compute_policy_digest(
+        {"resolved_policy": resolved_policy, "overrides": overrides_list}
     )
-    resolved_digest = _compute_policy_digest(resolved_policy)
-    policy_digest_value = variance_policy_digest or resolved_digest
     policy_provenance = {
         "tier": auto.get("tier", "balanced"),
-        "overrides":
-        "policy_digest":
+        "overrides": overrides_list,
+        "policy_digest": resolved_digest,
     }
-    auto["policy_digest"] =
+    auto["policy_digest"] = resolved_digest
 
     for guard_name in ("spectral", "rmt", "variance"):
         if guard_name in resolved_policy:
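
The digest now covers the fully resolved policy together with the extracted overrides, so any recorded override changes the `policy_digest` in provenance (previously the digest was computed from `resolved_policy` alone). `_compute_policy_digest` itself is not shown in this diff; a plausible sketch, assuming a canonical-JSON SHA-256, which is a common choice for such digests but not confirmed here:

    import hashlib
    import json
    from typing import Any

    def compute_policy_digest(policy: dict[str, Any]) -> str:
        # Canonical serialization: sorted keys, fixed separators.
        canonical = json.dumps(policy, sort_keys=True, separators=(",", ":"))
        return hashlib.sha256(canonical.encode("utf-8")).hexdigest()

    resolved_policy = {"spectral": {"max_caps": 5}}   # illustrative
    overrides: list[dict[str, Any]] = []
    digest = compute_policy_digest(
        {"resolved_policy": resolved_policy, "overrides": overrides}
    )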
@@ -1473,16 +1642,17 @@ def make_certificate(
         or (baseline_hash != thresholds_hash)
     )
 
-    # Hysteresis knobs snapshot
-
-
-
+    # Hysteresis knobs snapshot (policy-resolved)
+    metrics_policy = (
+        resolved_policy.get("metrics", {}) if isinstance(resolved_policy, dict) else {}
+    )
+    if not isinstance(metrics_policy, dict):
         metrics_policy = {}
     ppl_hys = 0.0
     acc_hys = 0.0
     try:
         ppl_hys = float(
-            (metrics_policy.get("
+            (metrics_policy.get("pm_ratio") or {}).get("hysteresis_ratio", 0.0) or 0.0
         )
         acc_hys = float(
             (metrics_policy.get("accuracy") or {}).get("hysteresis_delta_pp", 0.0)
@@ -2204,11 +2374,24 @@ def _format_epsilon_map(epsilon_map: Any) -> dict[str, float]:
 
 
 def _build_resolved_policies(
-    tier: str,
+    tier: str,
+    spectral: dict[str, Any],
+    rmt: dict[str, Any],
+    variance: dict[str, Any],
+    *,
+    profile: str | None = None,
+    explicit_overrides: dict[str, dict[str, Any]] | None = None,
 ) -> dict[str, Any]:
     from .policy_utils import _build_resolved_policies as _impl
 
-    return _impl(
+    return _impl(
+        tier,
+        spectral,
+        rmt,
+        variance,
+        profile=profile,
+        explicit_overrides=explicit_overrides,
+    )
 
 
 def _compute_policy_digest(policy: dict[str, Any]) -> str:
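
The wrapper now forwards the observed guard policies plus keyword-only `profile` and `explicit_overrides` to the `policy_utils` implementation. An illustrative call (the guard dicts and their values are hypothetical):

    spectral = {"max_caps": 5}
    rmt = {"epsilon_by_family": {"attn": 0.1}}
    variance = {"ve_enabled": True}

    resolved = _build_resolved_policies(
        "balanced",
        spectral,
        rmt,
        variance,
        profile="release",        # e.g. from report context or window_plan
        explicit_overrides=None,  # e.g. from meta.config.guards
    )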
@@ -2279,6 +2462,23 @@ def _prepare_guard_overhead_section(
         "threshold_percent": threshold * 100,
         "source": str(payload.get("source", "report")),
     }
+    try:
+        mode = payload.get("mode")
+        if mode is None:
+            mode = payload.get("guard_overhead_mode")
+        if isinstance(mode, str) and mode.strip():
+            sanitized["mode"] = mode.strip()
+    except Exception:
+        pass
+    try:
+        skipped = bool(payload.get("skipped", False))
+        if skipped:
+            sanitized["skipped"] = True
+            reason = payload.get("skip_reason")
+            if isinstance(reason, str) and reason.strip():
+                sanitized["skip_reason"] = reason.strip()
+    except Exception:
+        pass
 
     # Prefer structured reports and reuse the validator when available
     bare_report = payload.pop("bare_report", None)
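
The sanitizer now carries `mode` (falling back to `guard_overhead_mode`) and a `skipped`/`skip_reason` pair into the guard-overhead section, trimming whitespace and dropping empty strings. A standalone check of the same rules, with a hypothetical payload:

    from typing import Any

    def sanitize_overhead(payload: dict[str, Any]) -> dict[str, Any]:
        sanitized: dict[str, Any] = {}
        mode = payload.get("mode")
        if mode is None:
            mode = payload.get("guard_overhead_mode")
        if isinstance(mode, str) and mode.strip():
            sanitized["mode"] = mode.strip()
        if bool(payload.get("skipped", False)):
            sanitized["skipped"] = True
            reason = payload.get("skip_reason")
            if isinstance(reason, str) and reason.strip():
                sanitized["skip_reason"] = reason.strip()
        return sanitized

    out = sanitize_overhead(
        {"guard_overhead_mode": " measured ", "skipped": True, "skip_reason": ""}
    )
    assert out == {"mode": "measured", "skipped": True}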
@@ -2449,6 +2649,12 @@ def _propagate_pairing_stats(
     coverage = pa_stats.get("coverage")
     if isinstance(coverage, dict) and coverage:
         stats["coverage"] = coverage
+    bootstrap = pa_stats.get("bootstrap")
+    if isinstance(bootstrap, dict) and bootstrap:
+        stats["bootstrap"] = bootstrap
+    paired_delta_summary = pa_stats.get("paired_delta_summary")
+    if isinstance(paired_delta_summary, dict) and paired_delta_summary:
+        stats["paired_delta_summary"] = paired_delta_summary
     wmf = pa_stats.get("window_match_fraction")
     if wmf is not None:
         stats["window_match_fraction"] = wmf
@@ -2674,12 +2880,31 @@ def _compute_validation_flags(
     }
     if _tiny_relax:
         tier = "aggressive"
+
     tier_thresholds = {
         "conservative": 1.05,
        "balanced": 1.10,
         "aggressive": 1.20,
         "none": 1.10,
     }
+    tier_policies = get_tier_policies()
+    tier_policy = tier_policies.get(tier, tier_policies.get("balanced", {}))
+    metrics_policy = (
+        tier_policy.get("metrics", {}) if isinstance(tier_policy, dict) else {}
+    )
+    pm_policy = (
+        metrics_policy.get("pm_ratio", {}) if isinstance(metrics_policy, dict) else {}
+    )
+    ratio_limit_base = pm_policy.get("ratio_limit_base")
+    try:
+        if ratio_limit_base is not None:
+            ratio_limit_base = float(ratio_limit_base)
+    except Exception:
+        ratio_limit_base = None
+    if not isinstance(ratio_limit_base, (int | float)) or not math.isfinite(
+        float(ratio_limit_base)
+    ):
+        ratio_limit_base = float(tier_thresholds.get(tier, 1.10))
     acceptance = pm_acceptance_range if isinstance(pm_acceptance_range, dict) else {}
     ratio_min_bound = None
     ratio_max_bound = None
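
The primary-metric ratio limit is now read from the tier policy's `metrics.pm_ratio.ratio_limit_base`, falling back to the hardcoded `tier_thresholds` map only when the policy value is missing or non-finite. A worked example of that fallback (policy values are illustrative):

    import math
    from typing import Any

    tier_thresholds = {"conservative": 1.05, "balanced": 1.10, "aggressive": 1.20}

    def resolve_ratio_limit_base(tier: str, pm_policy: dict[str, Any]) -> float:
        base = pm_policy.get("ratio_limit_base")
        try:
            base = float(base) if base is not None else None
        except Exception:
            base = None
        if base is None or not math.isfinite(base):
            base = float(tier_thresholds.get(tier, 1.10))
        return base

    assert resolve_ratio_limit_base("balanced", {"ratio_limit_base": 1.08}) == 1.08
    assert resolve_ratio_limit_base("balanced", {}) == 1.10           # fallback
    assert resolve_ratio_limit_base("aggressive", {"ratio_limit_base": "x"}) == 1.20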
@@ -2697,7 +2922,7 @@ def _compute_validation_flags(
     ratio_limit = (
         ratio_max_bound
         if isinstance(ratio_max_bound, (int | float)) and math.isfinite(ratio_max_bound)
-        else
+        else float(ratio_limit_base)
     )
     if isinstance(target_ratio, int | float) and target_ratio > 0:
         ratio_limit = min(ratio_limit, float(target_ratio))
@@ -2726,13 +2951,6 @@ def _compute_validation_flags(
     except Exception:  # pragma: no cover
         pass
     # Hysteresis and sample-size floors from tier policies
-    tier_policy = TIER_POLICIES.get(tier, {}) if isinstance(tier, str) else {}
-    metrics_policy = (
-        tier_policy.get("metrics", {}) if isinstance(tier_policy, dict) else {}
-    )
-    pm_policy = (
-        metrics_policy.get("pm_ratio", {}) if isinstance(metrics_policy, dict) else {}
-    )
     hysteresis_ratio = float(pm_policy.get("hysteresis_ratio", 0.0))
     min_tokens = int(pm_policy.get("min_tokens", 0))
     # Evaluate sample-size sufficiency
@@ -2804,7 +3022,9 @@ def _compute_validation_flags(
     summary = spectral.get("summary", {}) if isinstance(spectral, dict) else {}
     max_caps = spectral.get("max_caps") or summary.get("max_caps")
     if max_caps is None:
-        default_spectral =
+        default_spectral = (
+            tier_policy.get("spectral", {}) if isinstance(tier_policy, dict) else {}
+        )
         max_caps = default_spectral.get("max_caps", 5)
     spectral_stable = spectral.get("caps_applied", 0) <= int(max_caps)
     if spectral.get("caps_exceeded"):
@@ -2871,14 +3091,6 @@ def _compute_validation_flags(
             flags["primary_metric_acceptable"] = bool(ok)
         elif kind in {"accuracy", "vqa_accuracy"}:
             # Read thresholds from tier policy if available
-            tier_policy = (
-                TIER_POLICIES.get(tier, {}) if isinstance(tier, str) else {}
-            )
-            metrics_policy = (
-                tier_policy.get("metrics", {})
-                if isinstance(tier_policy, dict)
-                else {}
-            )
             acc_policy = (
                 metrics_policy.get("accuracy", {})
                 if isinstance(metrics_policy, dict)
invarlock/reporting/certificate_schema.py

@@ -29,6 +29,7 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
         "plugins",
         "meta",
         "dataset",
+        "primary_metric",
     ],
     "properties": {
         "schema_version": {"const": CERTIFICATE_SCHEMA_VERSION},
@@ -64,11 +65,12 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
             "seq_len": {"type": "integer", "minimum": 1},
             "windows": {
                 "type": "object",
-                "required": ["preview", "final"],
+                "required": ["preview", "final", "stats"],
                 "properties": {
                     "preview": {"type": "integer", "minimum": 0},
                     "final": {"type": "integer", "minimum": 0},
                     "seed": {"type": "integer"},
+                    "stats": {"type": "object"},
                 },
             },
         },
@@ -77,6 +79,7 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
     # ppl_* block removed from required schema; may appear for ppl-like tasks but is optional
     "primary_metric": {
         "type": "object",
+        "required": ["kind"],
         "properties": {
             "kind": {"type": "string"},
             "unit": {"type": "string"},
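
Taken together, the schema changes make `primary_metric` a required top-level section with a required `kind`, and require a `stats` object under `dataset.windows`; the `windows.setdefault("stats", {})` hunk in certificate.py keeps older reports valid against the tightened schema. A reduced sketch of the new constraints using the `jsonschema` package (a fragment, not the full CERTIFICATE_JSON_SCHEMA):

    import jsonschema

    schema_fragment = {
        "type": "object",
        "required": ["dataset", "primary_metric"],
        "properties": {
            "dataset": {
                "type": "object",
                "properties": {
                    "windows": {
                        "type": "object",
                        "required": ["preview", "final", "stats"],
                    }
                },
            },
            "primary_metric": {"type": "object", "required": ["kind"]},
        },
    }

    cert = {
        "dataset": {"windows": {"preview": 8, "final": 32, "stats": {}}},
        "primary_metric": {"kind": "ppl"},
    }
    jsonschema.validate(instance=cert, schema=schema_fragment)  # passes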
invarlock/reporting/guards_analysis.py

@@ -4,7 +4,7 @@ from __future__ import annotations
 import math
 from typing import Any, no_type_check
 
-from invarlock.core.auto_tuning import TIER_POLICIES
+from invarlock.core.auto_tuning import get_tier_policies
 
 from .policy_utils import _promote_legacy_multiple_testing_key, _resolve_policy_tier
 from .report_types import RunReport
@@ -133,7 +133,8 @@ def _extract_spectral_analysis(
     report: RunReport, baseline: dict[str, Any]
 ) -> dict[str, Any]:
     tier = _resolve_policy_tier(report)
-
+    tier_policies = get_tier_policies()
+    tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
     spectral_defaults = tier_defaults.get("spectral", {}) if tier_defaults else {}
     default_sigma_quantile = spectral_defaults.get("sigma_quantile", 0.95)
     default_deadband = spectral_defaults.get("deadband", 0.1)
@@ -166,9 +167,15 @@ def _extract_spectral_analysis(
     caps_exceeded = (
         bool(guard_metrics.get("caps_exceeded", False)) if guard_metrics else False
     )
-    max_caps =
+    max_caps = guard_metrics.get("max_caps") if guard_metrics else None
+    if max_caps is None and guard_policy:
+        max_caps = guard_policy.get("max_caps")
     if max_caps is None:
         max_caps = default_max_caps
+    try:
+        max_caps = int(max_caps)
+    except Exception:
+        max_caps = int(default_max_caps)
 
     try:
         max_spectral_norm = float(
@@ -618,10 +625,15 @@ def _extract_rmt_analysis(
     report: RunReport, baseline: dict[str, Any]
 ) -> dict[str, Any]:
     tier = _resolve_policy_tier(report)
-
+    tier_policies = get_tier_policies()
+    tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
     default_epsilon_map = (
-        tier_defaults.get("rmt", {}).get("
+        tier_defaults.get("rmt", {}).get("epsilon_by_family")
+        if isinstance(tier_defaults, dict)
+        else {}
     )
+    if not default_epsilon_map and isinstance(tier_defaults, dict):
+        default_epsilon_map = (tier_defaults.get("rmt", {}) or {}).get("epsilon", {})
     default_epsilon_map = {
         str(family): float(value)
         for family, value in (default_epsilon_map or {}).items()
@@ -631,6 +643,16 @@ def _extract_rmt_analysis(
     outliers_guarded = 0
     outliers_bare = 0
     epsilon_default = 0.1
+    try:
+        eps_def = (
+            tier_defaults.get("rmt", {}).get("epsilon_default")
+            if isinstance(tier_defaults, dict)
+            else None
+        )
+        if isinstance(eps_def, int | float) and math.isfinite(float(eps_def)):
+            epsilon_default = float(eps_def)
+    except Exception:
+        pass
     stable = True
     explicit_stability = False
     max_ratio = 0.0
@@ -640,19 +662,54 @@ def _extract_rmt_analysis(
     baseline_outliers_per_family: dict[str, int] = {}
     outliers_per_family: dict[str, int] = {}
     epsilon_violations: list[Any] = []
+    margin_used = None
+    deadband_used = None
+    policy_out: dict[str, Any] | None = None
 
     for guard in report.get("guards", []) or []:
         if str(guard.get("name", "")).lower() == "rmt":
             guard_metrics = guard.get("metrics", {}) or {}
             guard_policy = guard.get("policy", {}) or {}
+            if isinstance(guard_policy, dict) and guard_policy:
+                policy_out = dict(guard_policy)
+                if "epsilon_by_family" not in policy_out and isinstance(
+                    policy_out.get("epsilon"), dict
+                ):
+                    policy_out["epsilon_by_family"] = dict(policy_out["epsilon"])
+                if isinstance(policy_out.get("margin"), int | float) and math.isfinite(
+                    float(policy_out.get("margin"))
+                ):
+                    margin_used = float(policy_out.get("margin"))
+                if isinstance(
+                    policy_out.get("deadband"), int | float
+                ) and math.isfinite(float(policy_out.get("deadband"))):
+                    deadband_used = float(policy_out.get("deadband"))
+                if isinstance(
+                    policy_out.get("epsilon_default"), int | float
+                ) and math.isfinite(float(policy_out.get("epsilon_default"))):
+                    epsilon_default = float(policy_out.get("epsilon_default"))
+            if isinstance(
+                guard_metrics.get("epsilon_default"), int | float
+            ) and math.isfinite(float(guard_metrics.get("epsilon_default"))):
+                epsilon_default = float(guard_metrics.get("epsilon_default"))
             outliers_guarded = guard_metrics.get(
                 "rmt_outliers", guard_metrics.get("layers_flagged", outliers_guarded)
             )
             max_ratio = guard_metrics.get("max_ratio", 0.0)
-            epsilon_default = guard_policy.get(
-                "deadband", guard_metrics.get("deadband_used", epsilon_default)
-            )
             epsilon_map = guard_metrics.get("epsilon_by_family", {}) or epsilon_map
+            if not epsilon_map and isinstance(guard_policy, dict):
+                eps_src = guard_policy.get("epsilon_by_family") or guard_policy.get(
+                    "epsilon"
+                )
+                if isinstance(eps_src, dict):
+                    try:
+                        epsilon_map = {
+                            str(k): float(v)
+                            for k, v in eps_src.items()
+                            if isinstance(v, int | float) and math.isfinite(float(v))
+                        }
+                    except Exception:
+                        pass
             baseline_outliers_per_family = (
                 guard_metrics.get("baseline_outliers_per_family", {})
                 or baseline_outliers_per_family
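
This rework also fixes the old behavior, where `epsilon_default` was overwritten by the guard's `deadband`; margin, deadband, and epsilon are now tracked separately, and `epsilon_default` resolves with a clear precedence: guard metrics over guard policy over tier defaults, with 0.1 as the floor. A condensed sketch of that precedence chain:

    import math
    from typing import Any

    def resolve_epsilon_default(
        tier_rmt: dict[str, Any],
        guard_policy: dict[str, Any],
        guard_metrics: dict[str, Any],
    ) -> float:
        value = 0.1  # hardcoded floor from the hunk above
        for source in (tier_rmt, guard_policy, guard_metrics):  # later wins
            cand = source.get("epsilon_default")
            if isinstance(cand, int | float) and math.isfinite(float(cand)):
                value = float(cand)
        return value

    assert resolve_epsilon_default({}, {}, {}) == 0.1
    assert resolve_epsilon_default({"epsilon_default": 0.2}, {}, {}) == 0.2
    assert resolve_epsilon_default(
        {}, {"epsilon_default": 0.2}, {"epsilon_default": 0.3}
    ) == 0.3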
@@ -844,7 +901,7 @@ def _extract_rmt_analysis(
     }
     delta_per_family = {str(k): _to_int(v) for k, v in delta_per_family.items()}
 
-
+    result = {
         "outliers_bare": outliers_bare,
         "outliers_guarded": outliers_guarded,
         "epsilon": epsilon_scalar,
@@ -862,6 +919,13 @@ def _extract_rmt_analysis(
         "mean_deviation_ratio": mean_deviation_ratio,
         "families": family_breakdown,
     }
+    if margin_used is not None:
+        result["margin"] = float(margin_used)
+    if deadband_used is not None:
+        result["deadband"] = float(deadband_used)
+    if policy_out:
+        result["policy"] = policy_out
+    return result
 
 
 @no_type_check
@no_type_check
|
|
@@ -873,10 +937,14 @@ def _extract_variance_analysis(report: RunReport) -> dict[str, Any]:
|
|
|
873
937
|
ratio_ci = None
|
|
874
938
|
calibration = {}
|
|
875
939
|
guard_metrics: dict[str, Any] = {}
|
|
940
|
+
guard_policy: dict[str, Any] | None = None
|
|
876
941
|
for guard in report.get("guards", []) or []:
|
|
877
942
|
if "variance" in str(guard.get("name", "")).lower():
|
|
878
943
|
metrics = guard.get("metrics", {}) or {}
|
|
879
944
|
guard_metrics = metrics
|
|
945
|
+
gp = guard.get("policy", {}) or {}
|
|
946
|
+
if isinstance(gp, dict) and gp:
|
|
947
|
+
guard_policy = dict(gp)
|
|
880
948
|
ve_enabled = metrics.get("ve_enabled", bool(metrics))
|
|
881
949
|
gain = metrics.get("ab_gain", metrics.get("gain", None))
|
|
882
950
|
ppl_no_ve = metrics.get("ppl_no_ve", None)
|
|
@@ -932,11 +1000,41 @@ def _extract_variance_analysis(report: RunReport) -> dict[str, Any]:
     if guard_metrics.get("ab_windows_used") is not None:
         ab_section["windows_used"] = guard_metrics["ab_windows_used"]
     if guard_metrics.get("ab_provenance"):
-
+        prov = guard_metrics["ab_provenance"]
+        if isinstance(prov, dict):
+            prov_out = dict(prov)
+
+            # Normalize a top-level `window_ids` list for docs + auditability.
+            if "window_ids" not in prov_out:
+                window_ids: set[int] = set()
+
+                def _collect(node: Any) -> None:
+                    if isinstance(node, dict):
+                        ids = node.get("window_ids")
+                        if isinstance(ids, list):
+                            for wid in ids:
+                                if isinstance(wid, int | float):
+                                    window_ids.add(int(wid))
+                        for v in node.values():
+                            _collect(v)
+                        return
+                    if isinstance(node, list):
+                        for v in node:
+                            _collect(v)
+
+                _collect(prov_out)
+                if window_ids:
+                    prov_out["window_ids"] = sorted(window_ids)
+
+            ab_section["provenance"] = prov_out
+        else:
+            ab_section["provenance"] = prov
     if guard_metrics.get("ab_point_estimates"):
         ab_section["point_estimates"] = guard_metrics["ab_point_estimates"]
     if ab_section:
         result["ab_test"] = ab_section
+    if guard_policy:
+        result["policy"] = guard_policy
     return result
 
 
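
`_collect` walks the A/B provenance tree and hoists every nested `window_ids` list into one sorted, deduplicated top-level list. The same traversal run standalone on a hypothetical provenance payload:

    from typing import Any

    window_ids: set[int] = set()

    def _collect(node: Any) -> None:
        if isinstance(node, dict):
            ids = node.get("window_ids")
            if isinstance(ids, list):
                for wid in ids:
                    if isinstance(wid, int | float):
                        window_ids.add(int(wid))
            for v in node.values():
                _collect(v)
            return
        if isinstance(node, list):
            for v in node:
                _collect(v)

    provenance = {"arm_a": {"window_ids": [3, 1]}, "arm_b": {"window_ids": [2, 3]}}
    _collect(provenance)
    assert sorted(window_ids) == [1, 2, 3]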