invarlock 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +1 -1
- invarlock/_data/runtime/profiles/ci_cpu.yaml +5 -0
- invarlock/_data/runtime/tiers.yaml +61 -0
- invarlock/adapters/hf_loading.py +97 -0
- invarlock/calibration/__init__.py +6 -0
- invarlock/calibration/spectral_null.py +301 -0
- invarlock/calibration/variance_ve.py +154 -0
- invarlock/cli/app.py +15 -0
- invarlock/cli/commands/calibrate.py +576 -0
- invarlock/cli/commands/doctor.py +16 -4
- invarlock/cli/commands/explain_gates.py +53 -9
- invarlock/cli/commands/plugins.py +12 -2
- invarlock/cli/commands/run.py +323 -81
- invarlock/cli/commands/verify.py +40 -0
- invarlock/cli/determinism.py +237 -0
- invarlock/core/auto_tuning.py +215 -17
- invarlock/core/registry.py +9 -4
- invarlock/eval/bench.py +467 -141
- invarlock/eval/bench_regression.py +12 -0
- invarlock/eval/data.py +29 -7
- invarlock/guards/spectral.py +216 -9
- invarlock/guards/variance.py +6 -3
- invarlock/reporting/certificate.py +403 -51
- invarlock/reporting/certificate_schema.py +4 -1
- invarlock/reporting/guards_analysis.py +108 -10
- invarlock/reporting/normalizer.py +21 -1
- invarlock/reporting/policy_utils.py +100 -16
- {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/METADATA +12 -10
- {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/RECORD +33 -26
- {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/WHEEL +0 -0
- {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/top_level.txt +0 -0
|
@@ -13,6 +13,7 @@ from __future__ import annotations
|
|
|
13
13
|
# mypy: ignore-errors
|
|
14
14
|
import copy
|
|
15
15
|
import hashlib
|
|
16
|
+
import inspect
|
|
16
17
|
import json
|
|
17
18
|
import math
|
|
18
19
|
import os
|
|
@@ -28,7 +29,7 @@ try: # pragma: no cover - exercised in integration
|
|
|
28
29
|
except Exception: # pragma: no cover
|
|
29
30
|
jsonschema = None # type: ignore
|
|
30
31
|
|
|
31
|
-
from invarlock.core.auto_tuning import
|
|
32
|
+
from invarlock.core.auto_tuning import get_tier_policies
|
|
32
33
|
from invarlock.core.bootstrap import (
|
|
33
34
|
compute_paired_delta_log_ci,
|
|
34
35
|
logspace_to_ratio_ci,
|
|
@@ -597,6 +598,18 @@ def make_certificate(
|
|
|
597
598
|
except Exception: # pragma: no cover
|
|
598
599
|
pass
|
|
599
600
|
|
|
601
|
+
# Determinism preset (CI/Release provenance) when present.
|
|
602
|
+
try:
|
|
603
|
+
det = (
|
|
604
|
+
report.get("meta", {}).get("determinism")
|
|
605
|
+
if isinstance(report.get("meta"), dict)
|
|
606
|
+
else None
|
|
607
|
+
)
|
|
608
|
+
if isinstance(det, dict) and det:
|
|
609
|
+
meta["determinism"] = det
|
|
610
|
+
except Exception: # pragma: no cover
|
|
611
|
+
pass
|
|
612
|
+
|
|
600
613
|
tokenizer_hash_meta = report["meta"].get("tokenizer_hash")
|
|
601
614
|
if not tokenizer_hash_meta:
|
|
602
615
|
dataset_section = report.get("data", {})
|
|
@@ -626,6 +639,13 @@ def make_certificate(
|
|
|
626
639
|
|
|
627
640
|
# Extract dataset configuration and compute hashes
|
|
628
641
|
dataset_info = _extract_dataset_info(report)
|
|
642
|
+
try:
|
|
643
|
+
if isinstance(dataset_info, dict):
|
|
644
|
+
windows = dataset_info.get("windows")
|
|
645
|
+
if isinstance(windows, dict):
|
|
646
|
+
windows.setdefault("stats", {})
|
|
647
|
+
except Exception: # pragma: no cover
|
|
648
|
+
pass
|
|
629
649
|
|
|
630
650
|
# Baseline reference (PM-only). Derive a primary_metric snapshot from baseline windows.
|
|
631
651
|
# Prefer explicit baseline primary_metric when provided; otherwise compute from windows
|
|
@@ -740,15 +760,17 @@ def make_certificate(
|
|
|
740
760
|
tier = str(auto_cfg.get("tier")).lower()
|
|
741
761
|
except Exception: # pragma: no cover
|
|
742
762
|
pass
|
|
763
|
+
tier_policies = get_tier_policies()
|
|
764
|
+
tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
|
|
743
765
|
metrics_policy = (
|
|
744
|
-
|
|
745
|
-
if isinstance(tier, str)
|
|
746
|
-
else {}
|
|
766
|
+
tier_defaults.get("metrics", {}) if isinstance(tier_defaults, dict) else {}
|
|
747
767
|
)
|
|
748
|
-
|
|
749
|
-
metrics_policy.get("
|
|
768
|
+
pm_policy = (
|
|
769
|
+
metrics_policy.get("pm_ratio", {})
|
|
770
|
+
if isinstance(metrics_policy, dict)
|
|
771
|
+
else {}
|
|
750
772
|
)
|
|
751
|
-
min_tokens = int(
|
|
773
|
+
min_tokens = int(pm_policy.get("min_tokens", 0))
|
|
752
774
|
if (
|
|
753
775
|
isinstance(total_tokens, int)
|
|
754
776
|
and min_tokens > 0
|
|
@@ -1052,6 +1074,109 @@ def make_certificate(
|
|
|
1052
1074
|
if key in metrics_stats_source:
|
|
1053
1075
|
ppl_analysis["stats"][key] = metrics_stats_source[key]
|
|
1054
1076
|
|
|
1077
|
+
# Derive requested/actual window counts for auditability when runners do not
|
|
1078
|
+
# emit a metrics.stats block (normalization may also drop it).
|
|
1079
|
+
try:
|
|
1080
|
+
stats_obj = ppl_analysis.get("stats", {})
|
|
1081
|
+
if isinstance(stats_obj, dict):
|
|
1082
|
+
|
|
1083
|
+
def _as_count(value: Any) -> int | None:
|
|
1084
|
+
if value is None or isinstance(value, bool):
|
|
1085
|
+
return None
|
|
1086
|
+
if isinstance(value, int):
|
|
1087
|
+
return int(value) if value >= 0 else None
|
|
1088
|
+
if isinstance(value, float) and math.isfinite(value):
|
|
1089
|
+
if abs(value - round(value)) > 1e-9 or value < 0:
|
|
1090
|
+
return None
|
|
1091
|
+
return int(round(value))
|
|
1092
|
+
return None
|
|
1093
|
+
|
|
1094
|
+
data_cfg = report.get("data", {}) if isinstance(report, dict) else {}
|
|
1095
|
+
data_cfg = data_cfg if isinstance(data_cfg, dict) else {}
|
|
1096
|
+
windows_cfg = (
|
|
1097
|
+
dataset_info.get("windows", {})
|
|
1098
|
+
if isinstance(dataset_info, dict)
|
|
1099
|
+
else {}
|
|
1100
|
+
)
|
|
1101
|
+
windows_cfg = windows_cfg if isinstance(windows_cfg, dict) else {}
|
|
1102
|
+
|
|
1103
|
+
req_prev = _as_count(stats_obj.get("requested_preview"))
|
|
1104
|
+
if req_prev is None:
|
|
1105
|
+
req_prev = _as_count(data_cfg.get("preview_n"))
|
|
1106
|
+
if req_prev is None:
|
|
1107
|
+
req_prev = _as_count(windows_cfg.get("preview"))
|
|
1108
|
+
|
|
1109
|
+
req_fin = _as_count(stats_obj.get("requested_final"))
|
|
1110
|
+
if req_fin is None:
|
|
1111
|
+
req_fin = _as_count(data_cfg.get("final_n"))
|
|
1112
|
+
if req_fin is None:
|
|
1113
|
+
req_fin = _as_count(windows_cfg.get("final"))
|
|
1114
|
+
|
|
1115
|
+
eval_windows = (
|
|
1116
|
+
report.get("evaluation_windows", {}) if isinstance(report, dict) else {}
|
|
1117
|
+
)
|
|
1118
|
+
eval_windows = eval_windows if isinstance(eval_windows, dict) else {}
|
|
1119
|
+
|
|
1120
|
+
def _len_ids(section: Any) -> int | None:
|
|
1121
|
+
if not isinstance(section, dict):
|
|
1122
|
+
return None
|
|
1123
|
+
ids = section.get("window_ids")
|
|
1124
|
+
if isinstance(ids, list):
|
|
1125
|
+
return int(len(ids))
|
|
1126
|
+
return None
|
|
1127
|
+
|
|
1128
|
+
act_prev = _as_count(stats_obj.get("actual_preview"))
|
|
1129
|
+
if act_prev is None:
|
|
1130
|
+
act_prev = _len_ids(eval_windows.get("preview"))
|
|
1131
|
+
if act_prev is None:
|
|
1132
|
+
cov_prev = (
|
|
1133
|
+
coverage_summary.get("preview")
|
|
1134
|
+
if isinstance(coverage_summary, dict)
|
|
1135
|
+
else None
|
|
1136
|
+
)
|
|
1137
|
+
if isinstance(cov_prev, dict):
|
|
1138
|
+
act_prev = _as_count(cov_prev.get("used"))
|
|
1139
|
+
if act_prev is None:
|
|
1140
|
+
act_prev = req_prev
|
|
1141
|
+
|
|
1142
|
+
act_fin = _as_count(stats_obj.get("actual_final"))
|
|
1143
|
+
if act_fin is None:
|
|
1144
|
+
act_fin = _len_ids(eval_windows.get("final"))
|
|
1145
|
+
if act_fin is None:
|
|
1146
|
+
cov_fin = (
|
|
1147
|
+
coverage_summary.get("final")
|
|
1148
|
+
if isinstance(coverage_summary, dict)
|
|
1149
|
+
else None
|
|
1150
|
+
)
|
|
1151
|
+
if isinstance(cov_fin, dict):
|
|
1152
|
+
act_fin = _as_count(cov_fin.get("used"))
|
|
1153
|
+
elif isinstance(coverage_summary, dict):
|
|
1154
|
+
act_fin = _as_count(coverage_summary.get("used"))
|
|
1155
|
+
if act_fin is None:
|
|
1156
|
+
act_fin = req_fin
|
|
1157
|
+
|
|
1158
|
+
if req_prev is not None:
|
|
1159
|
+
stats_obj.setdefault("requested_preview", req_prev)
|
|
1160
|
+
if req_fin is not None:
|
|
1161
|
+
stats_obj.setdefault("requested_final", req_fin)
|
|
1162
|
+
if act_prev is not None:
|
|
1163
|
+
stats_obj.setdefault("actual_preview", act_prev)
|
|
1164
|
+
if act_fin is not None:
|
|
1165
|
+
stats_obj.setdefault("actual_final", act_fin)
|
|
1166
|
+
|
|
1167
|
+
if "coverage_ok" not in stats_obj:
|
|
1168
|
+
if (
|
|
1169
|
+
isinstance(req_prev, int)
|
|
1170
|
+
and isinstance(req_fin, int)
|
|
1171
|
+
and isinstance(act_prev, int)
|
|
1172
|
+
and isinstance(act_fin, int)
|
|
1173
|
+
):
|
|
1174
|
+
stats_obj["coverage_ok"] = (act_prev >= req_prev) and (
|
|
1175
|
+
act_fin >= req_fin
|
|
1176
|
+
)
|
|
1177
|
+
except Exception: # pragma: no cover
|
|
1178
|
+
pass
|
|
1179
|
+
|
|
1055
1180
|
if isinstance(window_plan_ctx, dict):
|
|
1056
1181
|
ppl_analysis["window_plan"] = window_plan_ctx
|
|
1057
1182
|
|
|
@@ -1101,17 +1226,62 @@ def make_certificate(
|
|
|
1101
1226
|
if variance_policy_digest:
|
|
1102
1227
|
policies["variance"]["policy_digest"] = variance_policy_digest
|
|
1103
1228
|
|
|
1229
|
+
# Resolve tier/profile policy (canonical) and merge observed guard policies.
|
|
1230
|
+
profile = None
|
|
1231
|
+
explicit_overrides = None
|
|
1232
|
+
try:
|
|
1233
|
+
ctx = report.get("context") if isinstance(report, dict) else None
|
|
1234
|
+
if isinstance(ctx, dict) and ctx.get("profile"):
|
|
1235
|
+
profile = str(ctx.get("profile"))
|
|
1236
|
+
except Exception:
|
|
1237
|
+
profile = None
|
|
1238
|
+
try:
|
|
1239
|
+
window_plan = (
|
|
1240
|
+
report.get("metrics", {}).get("window_plan")
|
|
1241
|
+
if isinstance(report.get("metrics"), dict)
|
|
1242
|
+
else None
|
|
1243
|
+
)
|
|
1244
|
+
if (
|
|
1245
|
+
profile is None
|
|
1246
|
+
and isinstance(window_plan, dict)
|
|
1247
|
+
and window_plan.get("profile")
|
|
1248
|
+
):
|
|
1249
|
+
profile = str(window_plan.get("profile"))
|
|
1250
|
+
except Exception:
|
|
1251
|
+
profile = None
|
|
1252
|
+
try:
|
|
1253
|
+
meta_cfg = (
|
|
1254
|
+
report.get("meta", {}).get("config")
|
|
1255
|
+
if isinstance(report.get("meta"), dict)
|
|
1256
|
+
else None
|
|
1257
|
+
)
|
|
1258
|
+
if isinstance(meta_cfg, dict) and isinstance(meta_cfg.get("guards"), dict):
|
|
1259
|
+
explicit_overrides = meta_cfg.get("guards")
|
|
1260
|
+
if explicit_overrides is None and isinstance(report.get("config"), dict):
|
|
1261
|
+
cfg2 = report.get("config")
|
|
1262
|
+
if isinstance(cfg2.get("guards"), dict):
|
|
1263
|
+
explicit_overrides = cfg2.get("guards")
|
|
1264
|
+
except Exception:
|
|
1265
|
+
explicit_overrides = None
|
|
1266
|
+
|
|
1104
1267
|
resolved_policy = _build_resolved_policies(
|
|
1105
|
-
auto.get("tier", "balanced"),
|
|
1268
|
+
auto.get("tier", "balanced"),
|
|
1269
|
+
spectral,
|
|
1270
|
+
rmt,
|
|
1271
|
+
variance,
|
|
1272
|
+
profile=profile,
|
|
1273
|
+
explicit_overrides=explicit_overrides,
|
|
1274
|
+
)
|
|
1275
|
+
overrides_list = _extract_policy_overrides(report)
|
|
1276
|
+
resolved_digest = _compute_policy_digest(
|
|
1277
|
+
{"resolved_policy": resolved_policy, "overrides": overrides_list}
|
|
1106
1278
|
)
|
|
1107
|
-
resolved_digest = _compute_policy_digest(resolved_policy)
|
|
1108
|
-
policy_digest_value = variance_policy_digest or resolved_digest
|
|
1109
1279
|
policy_provenance = {
|
|
1110
1280
|
"tier": auto.get("tier", "balanced"),
|
|
1111
|
-
"overrides":
|
|
1112
|
-
"policy_digest":
|
|
1281
|
+
"overrides": overrides_list,
|
|
1282
|
+
"policy_digest": resolved_digest,
|
|
1113
1283
|
}
|
|
1114
|
-
auto["policy_digest"] =
|
|
1284
|
+
auto["policy_digest"] = resolved_digest
|
|
1115
1285
|
|
|
1116
1286
|
for guard_name in ("spectral", "rmt", "variance"):
|
|
1117
1287
|
if guard_name in resolved_policy:
|
|
@@ -1322,24 +1492,36 @@ def make_certificate(
|
|
|
1322
1492
|
capacity_tokens = None
|
|
1323
1493
|
capacity_examples = None
|
|
1324
1494
|
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
auto.get("
|
|
1333
|
-
|
|
1334
|
-
|
|
1495
|
+
pm_acceptance_range = _resolve_pm_acceptance_range_from_report(report)
|
|
1496
|
+
|
|
1497
|
+
validation_kwargs = {
|
|
1498
|
+
"ppl": ppl_analysis,
|
|
1499
|
+
"spectral": spectral,
|
|
1500
|
+
"rmt": rmt,
|
|
1501
|
+
"invariants": invariants,
|
|
1502
|
+
"tier": auto.get("tier", "balanced"),
|
|
1503
|
+
"_ppl_metrics": ppl_metrics,
|
|
1504
|
+
"target_ratio": auto.get("target_pm_ratio"),
|
|
1505
|
+
"guard_overhead": guard_overhead_section,
|
|
1506
|
+
"primary_metric": report.get("metrics", {}).get("primary_metric")
|
|
1335
1507
|
if isinstance(report.get("metrics"), dict)
|
|
1336
1508
|
else None,
|
|
1337
|
-
moe_section,
|
|
1338
|
-
{
|
|
1509
|
+
"moe": moe_section,
|
|
1510
|
+
"dataset_capacity": {
|
|
1339
1511
|
"tokens_available": capacity_tokens,
|
|
1340
1512
|
"examples_available": capacity_examples,
|
|
1341
1513
|
},
|
|
1342
|
-
|
|
1514
|
+
}
|
|
1515
|
+
try:
|
|
1516
|
+
if (
|
|
1517
|
+
"pm_acceptance_range"
|
|
1518
|
+
in inspect.signature(_compute_validation_flags).parameters
|
|
1519
|
+
):
|
|
1520
|
+
validation_kwargs["pm_acceptance_range"] = pm_acceptance_range
|
|
1521
|
+
except Exception: # pragma: no cover - defensive against patched functions
|
|
1522
|
+
validation_kwargs["pm_acceptance_range"] = pm_acceptance_range
|
|
1523
|
+
|
|
1524
|
+
validation_flags = _compute_validation_flags(**validation_kwargs)
|
|
1343
1525
|
# Enforce validation key allow-list to prevent surface drift
|
|
1344
1526
|
_allowed_validation = _load_validation_allowlist()
|
|
1345
1527
|
validation_filtered = {
|
|
@@ -1460,16 +1642,17 @@ def make_certificate(
|
|
|
1460
1642
|
or (baseline_hash != thresholds_hash)
|
|
1461
1643
|
)
|
|
1462
1644
|
|
|
1463
|
-
# Hysteresis knobs snapshot
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1645
|
+
# Hysteresis knobs snapshot (policy-resolved)
|
|
1646
|
+
metrics_policy = (
|
|
1647
|
+
resolved_policy.get("metrics", {}) if isinstance(resolved_policy, dict) else {}
|
|
1648
|
+
)
|
|
1649
|
+
if not isinstance(metrics_policy, dict):
|
|
1467
1650
|
metrics_policy = {}
|
|
1468
1651
|
ppl_hys = 0.0
|
|
1469
1652
|
acc_hys = 0.0
|
|
1470
1653
|
try:
|
|
1471
1654
|
ppl_hys = float(
|
|
1472
|
-
(metrics_policy.get("
|
|
1655
|
+
(metrics_policy.get("pm_ratio") or {}).get("hysteresis_ratio", 0.0) or 0.0
|
|
1473
1656
|
)
|
|
1474
1657
|
acc_hys = float(
|
|
1475
1658
|
(metrics_policy.get("accuracy") or {}).get("hysteresis_delta_pp", 0.0)
|
|
@@ -2191,11 +2374,24 @@ def _format_epsilon_map(epsilon_map: Any) -> dict[str, float]:
|
|
|
2191
2374
|
|
|
2192
2375
|
|
|
2193
2376
|
def _build_resolved_policies(
    tier: str,
    spectral: dict[str, Any],
    rmt: dict[str, Any],
    variance: dict[str, Any],
    *,
    profile: str | None = None,
    explicit_overrides: dict[str, dict[str, Any]] | None = None,
) -> dict[str, Any]:
    """Delegate resolved-policy construction to ``policy_utils``.

    This is a thin forwarding wrapper: every argument is handed unchanged to
    ``policy_utils._build_resolved_policies`` and that function's return
    value is returned as-is.

    Args:
        tier: Tier name, forwarded as the first positional argument.
        spectral: Spectral guard section, forwarded positionally.
        rmt: RMT guard section, forwarded positionally.
        variance: Variance guard section, forwarded positionally.
        profile: Optional profile name, forwarded by keyword.
        explicit_overrides: Optional per-guard overrides, forwarded by keyword.

    Returns:
        Whatever the ``policy_utils`` implementation returns.
    """
    # Imported at call time (function scope), exactly as the wrapper has
    # always done, rather than at module import time.
    from .policy_utils import _build_resolved_policies as _delegate

    guard_sections = (spectral, rmt, variance)
    keyword_options = {
        "profile": profile,
        "explicit_overrides": explicit_overrides,
    }
    return _delegate(tier, *guard_sections, **keyword_options)
|
|
2199
2395
|
|
|
2200
2396
|
|
|
2201
2397
|
def _compute_policy_digest(policy: dict[str, Any]) -> str:
|
|
@@ -2266,6 +2462,23 @@ def _prepare_guard_overhead_section(
|
|
|
2266
2462
|
"threshold_percent": threshold * 100,
|
|
2267
2463
|
"source": str(payload.get("source", "report")),
|
|
2268
2464
|
}
|
|
2465
|
+
try:
|
|
2466
|
+
mode = payload.get("mode")
|
|
2467
|
+
if mode is None:
|
|
2468
|
+
mode = payload.get("guard_overhead_mode")
|
|
2469
|
+
if isinstance(mode, str) and mode.strip():
|
|
2470
|
+
sanitized["mode"] = mode.strip()
|
|
2471
|
+
except Exception:
|
|
2472
|
+
pass
|
|
2473
|
+
try:
|
|
2474
|
+
skipped = bool(payload.get("skipped", False))
|
|
2475
|
+
if skipped:
|
|
2476
|
+
sanitized["skipped"] = True
|
|
2477
|
+
reason = payload.get("skip_reason")
|
|
2478
|
+
if isinstance(reason, str) and reason.strip():
|
|
2479
|
+
sanitized["skip_reason"] = reason.strip()
|
|
2480
|
+
except Exception:
|
|
2481
|
+
pass
|
|
2269
2482
|
|
|
2270
2483
|
# Prefer structured reports and reuse the validator when available
|
|
2271
2484
|
bare_report = payload.pop("bare_report", None)
|
|
@@ -2436,6 +2649,12 @@ def _propagate_pairing_stats(
|
|
|
2436
2649
|
coverage = pa_stats.get("coverage")
|
|
2437
2650
|
if isinstance(coverage, dict) and coverage:
|
|
2438
2651
|
stats["coverage"] = coverage
|
|
2652
|
+
bootstrap = pa_stats.get("bootstrap")
|
|
2653
|
+
if isinstance(bootstrap, dict) and bootstrap:
|
|
2654
|
+
stats["bootstrap"] = bootstrap
|
|
2655
|
+
paired_delta_summary = pa_stats.get("paired_delta_summary")
|
|
2656
|
+
if isinstance(paired_delta_summary, dict) and paired_delta_summary:
|
|
2657
|
+
stats["paired_delta_summary"] = paired_delta_summary
|
|
2439
2658
|
wmf = pa_stats.get("window_match_fraction")
|
|
2440
2659
|
if wmf is not None:
|
|
2441
2660
|
stats["window_match_fraction"] = wmf
|
|
@@ -2537,6 +2756,103 @@ def _build_provenance_block(
|
|
|
2537
2756
|
return provenance
|
|
2538
2757
|
|
|
2539
2758
|
|
|
2759
|
+
def _resolve_pm_acceptance_range_from_report(
    report: dict[str, Any] | None,
) -> dict[str, float]:
    """Resolve primary-metric acceptance bounds from report context/meta/env.

    Each bound (``min`` / ``max``) is looked up independently. Config sources
    are consulted in this order, and the first usable value for a bound wins:

    1. ``report["context"]["primary_metric"]["acceptance_range"]``
    2. ``report["context"]["pm_acceptance_range"]``
    3. ``report["meta"]["pm_acceptance_range"]``

    The environment variables ``INVARLOCK_PM_ACCEPTANCE_MIN`` and
    ``INVARLOCK_PM_ACCEPTANCE_MAX`` take precedence over any config value.

    Values that are missing, not convertible to ``float``, or non-finite
    (NaN / +-inf) are treated as "not provided". Any source that is not a
    dict (including an ``acceptance_range`` explicitly set to ``None``) is
    skipped instead of raising.

    Args:
        report: Run report mapping, or ``None``.

    Returns:
        ``{}`` when no source provides any usable bound. Otherwise
        ``{"min": float, "max": float}`` where a bound that was not provided
        or is non-positive falls back to its default (0.95 / 1.10), and
        ``max`` is raised to ``min`` if it would otherwise be smaller.
    """
    base_min = 0.95
    base_max = 1.10

    def _safe_float(val: Any) -> float | None:
        """Convert *val* to a finite float, or return None when unusable."""
        if val is None:
            return None
        try:
            number = float(val)
        except Exception:
            # Best-effort: config values can be arbitrary objects whose
            # __float__ may fail in ways we cannot enumerate.
            return None
        # A NaN/inf bound cannot be compared meaningfully downstream.
        return number if math.isfinite(number) else None

    def _bounds_from(candidate: Any) -> tuple[float | None, float | None]:
        """Read (min, max) from one candidate source; non-dicts yield nothing."""
        if not isinstance(candidate, dict):
            return None, None
        return _safe_float(candidate.get("min")), _safe_float(candidate.get("max"))

    def _parse_env(name: str) -> float | None:
        """Read a bound from the environment; blank or invalid means unset."""
        raw = os.environ.get(name, "")
        if not raw.strip():
            return None
        return _safe_float(raw)

    # Collect candidate sources in priority order.
    candidates: list[Any] = []
    ctx = report.get("context") if isinstance(report, dict) else None
    if isinstance(ctx, dict):
        pm_ctx = ctx.get("primary_metric")
        if isinstance(pm_ctx, dict):
            candidates.append(pm_ctx.get("acceptance_range"))
        candidates.append(ctx.get("pm_acceptance_range"))
    meta = report.get("meta") if isinstance(report, dict) else None
    if isinstance(meta, dict):
        candidates.append(meta.get("pm_acceptance_range"))

    # Fill each bound from the first source that provides it; a later source
    # only fills bounds that are still missing.
    cfg_min: float | None = None
    cfg_max: float | None = None
    for candidate in candidates:
        if cfg_min is not None and cfg_max is not None:
            break
        cand_min, cand_max = _bounds_from(candidate)
        if cfg_min is None:
            cfg_min = cand_min
        if cfg_max is None:
            cfg_max = cand_max

    env_min = _parse_env("INVARLOCK_PM_ACCEPTANCE_MIN")
    env_max = _parse_env("INVARLOCK_PM_ACCEPTANCE_MAX")

    # Nothing explicit anywhere: signal "no acceptance range configured".
    if all(v is None for v in (cfg_min, cfg_max, env_min, env_max)):
        return {}

    def _pick(env_val: float | None, cfg_val: float | None, default: float) -> float:
        """Env beats config; missing or non-positive falls back to default."""
        chosen = env_val if env_val is not None else cfg_val
        if chosen is None or chosen <= 0:
            return default
        return chosen

    min_val = _pick(env_min, cfg_min, base_min)
    max_val = _pick(env_max, cfg_max, base_max)

    # Keep the range well-formed: never return max below min.
    if max_val < min_val:
        max_val = min_val

    return {"min": float(min_val), "max": float(max_val)}
|
|
2854
|
+
|
|
2855
|
+
|
|
2540
2856
|
def _compute_validation_flags(
|
|
2541
2857
|
ppl: dict[str, Any],
|
|
2542
2858
|
spectral: dict[str, Any],
|
|
@@ -2549,6 +2865,7 @@ def _compute_validation_flags(
|
|
|
2549
2865
|
primary_metric: dict[str, Any] | None = None,
|
|
2550
2866
|
moe: dict[str, Any] | None = None,
|
|
2551
2867
|
dataset_capacity: dict[str, Any] | None = None,
|
|
2868
|
+
pm_acceptance_range: dict[str, float] | None = None,
|
|
2552
2869
|
) -> dict[str, bool]:
|
|
2553
2870
|
"""Compute validation flags for the certificate including canonical gates."""
|
|
2554
2871
|
tier = (tier or "balanced").lower()
|
|
@@ -2563,13 +2880,50 @@ def _compute_validation_flags(
|
|
|
2563
2880
|
}
|
|
2564
2881
|
if _tiny_relax:
|
|
2565
2882
|
tier = "aggressive"
|
|
2883
|
+
|
|
2566
2884
|
tier_thresholds = {
|
|
2567
2885
|
"conservative": 1.05,
|
|
2568
2886
|
"balanced": 1.10,
|
|
2569
2887
|
"aggressive": 1.20,
|
|
2570
2888
|
"none": 1.10,
|
|
2571
2889
|
}
|
|
2572
|
-
|
|
2890
|
+
tier_policies = get_tier_policies()
|
|
2891
|
+
tier_policy = tier_policies.get(tier, tier_policies.get("balanced", {}))
|
|
2892
|
+
metrics_policy = (
|
|
2893
|
+
tier_policy.get("metrics", {}) if isinstance(tier_policy, dict) else {}
|
|
2894
|
+
)
|
|
2895
|
+
pm_policy = (
|
|
2896
|
+
metrics_policy.get("pm_ratio", {}) if isinstance(metrics_policy, dict) else {}
|
|
2897
|
+
)
|
|
2898
|
+
ratio_limit_base = pm_policy.get("ratio_limit_base")
|
|
2899
|
+
try:
|
|
2900
|
+
if ratio_limit_base is not None:
|
|
2901
|
+
ratio_limit_base = float(ratio_limit_base)
|
|
2902
|
+
except Exception:
|
|
2903
|
+
ratio_limit_base = None
|
|
2904
|
+
if not isinstance(ratio_limit_base, (int | float)) or not math.isfinite(
|
|
2905
|
+
float(ratio_limit_base)
|
|
2906
|
+
):
|
|
2907
|
+
ratio_limit_base = float(tier_thresholds.get(tier, 1.10))
|
|
2908
|
+
acceptance = pm_acceptance_range if isinstance(pm_acceptance_range, dict) else {}
|
|
2909
|
+
ratio_min_bound = None
|
|
2910
|
+
ratio_max_bound = None
|
|
2911
|
+
try:
|
|
2912
|
+
if acceptance.get("min") is not None:
|
|
2913
|
+
ratio_min_bound = float(acceptance.get("min"))
|
|
2914
|
+
except Exception:
|
|
2915
|
+
ratio_min_bound = None
|
|
2916
|
+
try:
|
|
2917
|
+
if acceptance.get("max") is not None:
|
|
2918
|
+
ratio_max_bound = float(acceptance.get("max"))
|
|
2919
|
+
except Exception:
|
|
2920
|
+
ratio_max_bound = None
|
|
2921
|
+
|
|
2922
|
+
ratio_limit = (
|
|
2923
|
+
ratio_max_bound
|
|
2924
|
+
if isinstance(ratio_max_bound, (int | float)) and math.isfinite(ratio_max_bound)
|
|
2925
|
+
else float(ratio_limit_base)
|
|
2926
|
+
)
|
|
2573
2927
|
if isinstance(target_ratio, int | float) and target_ratio > 0:
|
|
2574
2928
|
ratio_limit = min(ratio_limit, float(target_ratio))
|
|
2575
2929
|
|
|
@@ -2597,13 +2951,6 @@ def _compute_validation_flags(
|
|
|
2597
2951
|
except Exception: # pragma: no cover
|
|
2598
2952
|
pass
|
|
2599
2953
|
# Hysteresis and sample-size floors from tier policies
|
|
2600
|
-
tier_policy = TIER_POLICIES.get(tier, {}) if isinstance(tier, str) else {}
|
|
2601
|
-
metrics_policy = (
|
|
2602
|
-
tier_policy.get("metrics", {}) if isinstance(tier_policy, dict) else {}
|
|
2603
|
-
)
|
|
2604
|
-
pm_policy = (
|
|
2605
|
-
metrics_policy.get("pm_ratio", {}) if isinstance(metrics_policy, dict) else {}
|
|
2606
|
-
)
|
|
2607
2954
|
hysteresis_ratio = float(pm_policy.get("hysteresis_ratio", 0.0))
|
|
2608
2955
|
min_tokens = int(pm_policy.get("min_tokens", 0))
|
|
2609
2956
|
# Evaluate sample-size sufficiency
|
|
@@ -2636,9 +2983,18 @@ def _compute_validation_flags(
|
|
|
2636
2983
|
tokens_ok_eff = tokens_ok or _tiny_relax
|
|
2637
2984
|
# Apply hysteresis to ratio limit if needed
|
|
2638
2985
|
ratio_limit_with_hyst = ratio_limit + max(0.0, hysteresis_ratio)
|
|
2986
|
+
lower_bound_ok = True
|
|
2987
|
+
if ratio_min_bound is not None and isinstance(ratio_vs_baseline, (int | float)):
|
|
2988
|
+
try:
|
|
2989
|
+
lower_bound_ok = math.isfinite(float(ratio_vs_baseline)) and (
|
|
2990
|
+
float(ratio_vs_baseline) >= float(ratio_min_bound)
|
|
2991
|
+
)
|
|
2992
|
+
except Exception:
|
|
2993
|
+
lower_bound_ok = True
|
|
2639
2994
|
compression_acceptable = (
|
|
2640
2995
|
isinstance(ratio_vs_baseline, int | float)
|
|
2641
2996
|
and math.isfinite(ratio_vs_baseline)
|
|
2997
|
+
and lower_bound_ok
|
|
2642
2998
|
and ratio_vs_baseline <= ratio_limit_with_hyst
|
|
2643
2999
|
and tokens_ok_eff
|
|
2644
3000
|
)
|
|
@@ -2655,7 +3011,9 @@ def _compute_validation_flags(
|
|
|
2655
3011
|
and all(isinstance(x, int | float) and math.isfinite(x) for x in ratio_ci)
|
|
2656
3012
|
):
|
|
2657
3013
|
compression_acceptable = (
|
|
2658
|
-
compression_acceptable
|
|
3014
|
+
compression_acceptable
|
|
3015
|
+
and ratio_ci[1] <= ratio_limit_with_hyst
|
|
3016
|
+
and (ratio_min_bound is None or ratio_ci[0] >= ratio_min_bound)
|
|
2659
3017
|
)
|
|
2660
3018
|
|
|
2661
3019
|
# 3. RMT ε-rule compliance
|
|
@@ -2664,7 +3022,9 @@ def _compute_validation_flags(
|
|
|
2664
3022
|
summary = spectral.get("summary", {}) if isinstance(spectral, dict) else {}
|
|
2665
3023
|
max_caps = spectral.get("max_caps") or summary.get("max_caps")
|
|
2666
3024
|
if max_caps is None:
|
|
2667
|
-
default_spectral =
|
|
3025
|
+
default_spectral = (
|
|
3026
|
+
tier_policy.get("spectral", {}) if isinstance(tier_policy, dict) else {}
|
|
3027
|
+
)
|
|
2668
3028
|
max_caps = default_spectral.get("max_caps", 5)
|
|
2669
3029
|
spectral_stable = spectral.get("caps_applied", 0) <= int(max_caps)
|
|
2670
3030
|
if spectral.get("caps_exceeded"):
|
|
@@ -2731,14 +3091,6 @@ def _compute_validation_flags(
|
|
|
2731
3091
|
flags["primary_metric_acceptable"] = bool(ok)
|
|
2732
3092
|
elif kind in {"accuracy", "vqa_accuracy"}:
|
|
2733
3093
|
# Read thresholds from tier policy if available
|
|
2734
|
-
tier_policy = (
|
|
2735
|
-
TIER_POLICIES.get(tier, {}) if isinstance(tier, str) else {}
|
|
2736
|
-
)
|
|
2737
|
-
metrics_policy = (
|
|
2738
|
-
tier_policy.get("metrics", {})
|
|
2739
|
-
if isinstance(tier_policy, dict)
|
|
2740
|
-
else {}
|
|
2741
|
-
)
|
|
2742
3094
|
acc_policy = (
|
|
2743
3095
|
metrics_policy.get("accuracy", {})
|
|
2744
3096
|
if isinstance(metrics_policy, dict)
|
|
@@ -29,6 +29,7 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
|
|
|
29
29
|
"plugins",
|
|
30
30
|
"meta",
|
|
31
31
|
"dataset",
|
|
32
|
+
"primary_metric",
|
|
32
33
|
],
|
|
33
34
|
"properties": {
|
|
34
35
|
"schema_version": {"const": CERTIFICATE_SCHEMA_VERSION},
|
|
@@ -64,11 +65,12 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
|
|
|
64
65
|
"seq_len": {"type": "integer", "minimum": 1},
|
|
65
66
|
"windows": {
|
|
66
67
|
"type": "object",
|
|
67
|
-
"required": ["preview", "final"],
|
|
68
|
+
"required": ["preview", "final", "stats"],
|
|
68
69
|
"properties": {
|
|
69
70
|
"preview": {"type": "integer", "minimum": 0},
|
|
70
71
|
"final": {"type": "integer", "minimum": 0},
|
|
71
72
|
"seed": {"type": "integer"},
|
|
73
|
+
"stats": {"type": "object"},
|
|
72
74
|
},
|
|
73
75
|
},
|
|
74
76
|
},
|
|
@@ -77,6 +79,7 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
|
|
|
77
79
|
# ppl_* block removed from required schema; may appear for ppl-like tasks but is optional
|
|
78
80
|
"primary_metric": {
|
|
79
81
|
"type": "object",
|
|
82
|
+
"required": ["kind"],
|
|
80
83
|
"properties": {
|
|
81
84
|
"kind": {"type": "string"},
|
|
82
85
|
"unit": {"type": "string"},
|