invarlock-0.3.0-py3-none-any.whl → invarlock-0.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. invarlock/__init__.py +1 -1
  2. invarlock/_data/runtime/profiles/ci_cpu.yaml +5 -0
  3. invarlock/_data/runtime/tiers.yaml +61 -0
  4. invarlock/adapters/hf_loading.py +97 -0
  5. invarlock/calibration/__init__.py +6 -0
  6. invarlock/calibration/spectral_null.py +301 -0
  7. invarlock/calibration/variance_ve.py +154 -0
  8. invarlock/cli/app.py +15 -0
  9. invarlock/cli/commands/calibrate.py +576 -0
  10. invarlock/cli/commands/doctor.py +16 -4
  11. invarlock/cli/commands/explain_gates.py +53 -9
  12. invarlock/cli/commands/plugins.py +12 -2
  13. invarlock/cli/commands/run.py +323 -81
  14. invarlock/cli/commands/verify.py +40 -0
  15. invarlock/cli/determinism.py +237 -0
  16. invarlock/core/auto_tuning.py +215 -17
  17. invarlock/core/registry.py +9 -4
  18. invarlock/eval/bench.py +467 -141
  19. invarlock/eval/bench_regression.py +12 -0
  20. invarlock/eval/data.py +29 -7
  21. invarlock/guards/spectral.py +216 -9
  22. invarlock/guards/variance.py +6 -3
  23. invarlock/reporting/certificate.py +403 -51
  24. invarlock/reporting/certificate_schema.py +4 -1
  25. invarlock/reporting/guards_analysis.py +108 -10
  26. invarlock/reporting/normalizer.py +21 -1
  27. invarlock/reporting/policy_utils.py +100 -16
  28. {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/METADATA +12 -10
  29. {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/RECORD +33 -26
  30. {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/WHEEL +0 -0
  31. {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/entry_points.txt +0 -0
  32. {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/licenses/LICENSE +0 -0
  33. {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/top_level.txt +0 -0
invarlock/reporting/certificate.py +403 -51
@@ -13,6 +13,7 @@ from __future__ import annotations
13
13
  # mypy: ignore-errors
14
14
  import copy
15
15
  import hashlib
16
+ import inspect
16
17
  import json
17
18
  import math
18
19
  import os
@@ -28,7 +29,7 @@ try: # pragma: no cover - exercised in integration
28
29
  except Exception: # pragma: no cover
29
30
  jsonschema = None # type: ignore
30
31
 
31
- from invarlock.core.auto_tuning import TIER_POLICIES
32
+ from invarlock.core.auto_tuning import get_tier_policies
32
33
  from invarlock.core.bootstrap import (
33
34
  compute_paired_delta_log_ci,
34
35
  logspace_to_ratio_ci,
@@ -597,6 +598,18 @@ def make_certificate(
597
598
  except Exception: # pragma: no cover
598
599
  pass
599
600
 
601
+ # Determinism preset (CI/Release provenance) when present.
602
+ try:
603
+ det = (
604
+ report.get("meta", {}).get("determinism")
605
+ if isinstance(report.get("meta"), dict)
606
+ else None
607
+ )
608
+ if isinstance(det, dict) and det:
609
+ meta["determinism"] = det
610
+ except Exception: # pragma: no cover
611
+ pass
612
+
600
613
  tokenizer_hash_meta = report["meta"].get("tokenizer_hash")
601
614
  if not tokenizer_hash_meta:
602
615
  dataset_section = report.get("data", {})
@@ -626,6 +639,13 @@ def make_certificate(
626
639
 
627
640
  # Extract dataset configuration and compute hashes
628
641
  dataset_info = _extract_dataset_info(report)
642
+ try:
643
+ if isinstance(dataset_info, dict):
644
+ windows = dataset_info.get("windows")
645
+ if isinstance(windows, dict):
646
+ windows.setdefault("stats", {})
647
+ except Exception: # pragma: no cover
648
+ pass
629
649
 
630
650
  # Baseline reference (PM-only). Derive a primary_metric snapshot from baseline windows.
631
651
  # Prefer explicit baseline primary_metric when provided; otherwise compute from windows
@@ -740,15 +760,17 @@ def make_certificate(
740
760
  tier = str(auto_cfg.get("tier")).lower()
741
761
  except Exception: # pragma: no cover
742
762
  pass
763
+ tier_policies = get_tier_policies()
764
+ tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
743
765
  metrics_policy = (
744
- TIER_POLICIES.get(tier, {}).get("metrics", {})
745
- if isinstance(tier, str)
746
- else {}
766
+ tier_defaults.get("metrics", {}) if isinstance(tier_defaults, dict) else {}
747
767
  )
748
- ppl_policy = (
749
- metrics_policy.get("ppl", {}) if isinstance(metrics_policy, dict) else {}
768
+ pm_policy = (
769
+ metrics_policy.get("pm_ratio", {})
770
+ if isinstance(metrics_policy, dict)
771
+ else {}
750
772
  )
751
- min_tokens = int(ppl_policy.get("min_tokens", 0))
773
+ min_tokens = int(pm_policy.get("min_tokens", 0))
752
774
  if (
753
775
  isinstance(total_tokens, int)
754
776
  and min_tokens > 0
@@ -1052,6 +1074,109 @@ def make_certificate(
1052
1074
  if key in metrics_stats_source:
1053
1075
  ppl_analysis["stats"][key] = metrics_stats_source[key]
1054
1076
 
1077
+ # Derive requested/actual window counts for auditability when runners do not
1078
+ # emit a metrics.stats block (normalization may also drop it).
1079
+ try:
1080
+ stats_obj = ppl_analysis.get("stats", {})
1081
+ if isinstance(stats_obj, dict):
1082
+
1083
+ def _as_count(value: Any) -> int | None:
1084
+ if value is None or isinstance(value, bool):
1085
+ return None
1086
+ if isinstance(value, int):
1087
+ return int(value) if value >= 0 else None
1088
+ if isinstance(value, float) and math.isfinite(value):
1089
+ if abs(value - round(value)) > 1e-9 or value < 0:
1090
+ return None
1091
+ return int(round(value))
1092
+ return None
1093
+
1094
+ data_cfg = report.get("data", {}) if isinstance(report, dict) else {}
1095
+ data_cfg = data_cfg if isinstance(data_cfg, dict) else {}
1096
+ windows_cfg = (
1097
+ dataset_info.get("windows", {})
1098
+ if isinstance(dataset_info, dict)
1099
+ else {}
1100
+ )
1101
+ windows_cfg = windows_cfg if isinstance(windows_cfg, dict) else {}
1102
+
1103
+ req_prev = _as_count(stats_obj.get("requested_preview"))
1104
+ if req_prev is None:
1105
+ req_prev = _as_count(data_cfg.get("preview_n"))
1106
+ if req_prev is None:
1107
+ req_prev = _as_count(windows_cfg.get("preview"))
1108
+
1109
+ req_fin = _as_count(stats_obj.get("requested_final"))
1110
+ if req_fin is None:
1111
+ req_fin = _as_count(data_cfg.get("final_n"))
1112
+ if req_fin is None:
1113
+ req_fin = _as_count(windows_cfg.get("final"))
1114
+
1115
+ eval_windows = (
1116
+ report.get("evaluation_windows", {}) if isinstance(report, dict) else {}
1117
+ )
1118
+ eval_windows = eval_windows if isinstance(eval_windows, dict) else {}
1119
+
1120
+ def _len_ids(section: Any) -> int | None:
1121
+ if not isinstance(section, dict):
1122
+ return None
1123
+ ids = section.get("window_ids")
1124
+ if isinstance(ids, list):
1125
+ return int(len(ids))
1126
+ return None
1127
+
1128
+ act_prev = _as_count(stats_obj.get("actual_preview"))
1129
+ if act_prev is None:
1130
+ act_prev = _len_ids(eval_windows.get("preview"))
1131
+ if act_prev is None:
1132
+ cov_prev = (
1133
+ coverage_summary.get("preview")
1134
+ if isinstance(coverage_summary, dict)
1135
+ else None
1136
+ )
1137
+ if isinstance(cov_prev, dict):
1138
+ act_prev = _as_count(cov_prev.get("used"))
1139
+ if act_prev is None:
1140
+ act_prev = req_prev
1141
+
1142
+ act_fin = _as_count(stats_obj.get("actual_final"))
1143
+ if act_fin is None:
1144
+ act_fin = _len_ids(eval_windows.get("final"))
1145
+ if act_fin is None:
1146
+ cov_fin = (
1147
+ coverage_summary.get("final")
1148
+ if isinstance(coverage_summary, dict)
1149
+ else None
1150
+ )
1151
+ if isinstance(cov_fin, dict):
1152
+ act_fin = _as_count(cov_fin.get("used"))
1153
+ elif isinstance(coverage_summary, dict):
1154
+ act_fin = _as_count(coverage_summary.get("used"))
1155
+ if act_fin is None:
1156
+ act_fin = req_fin
1157
+
1158
+ if req_prev is not None:
1159
+ stats_obj.setdefault("requested_preview", req_prev)
1160
+ if req_fin is not None:
1161
+ stats_obj.setdefault("requested_final", req_fin)
1162
+ if act_prev is not None:
1163
+ stats_obj.setdefault("actual_preview", act_prev)
1164
+ if act_fin is not None:
1165
+ stats_obj.setdefault("actual_final", act_fin)
1166
+
1167
+ if "coverage_ok" not in stats_obj:
1168
+ if (
1169
+ isinstance(req_prev, int)
1170
+ and isinstance(req_fin, int)
1171
+ and isinstance(act_prev, int)
1172
+ and isinstance(act_fin, int)
1173
+ ):
1174
+ stats_obj["coverage_ok"] = (act_prev >= req_prev) and (
1175
+ act_fin >= req_fin
1176
+ )
1177
+ except Exception: # pragma: no cover
1178
+ pass
1179
+
1055
1180
  if isinstance(window_plan_ctx, dict):
1056
1181
  ppl_analysis["window_plan"] = window_plan_ctx
1057
1182
 
@@ -1101,17 +1226,62 @@ def make_certificate(
1101
1226
  if variance_policy_digest:
1102
1227
  policies["variance"]["policy_digest"] = variance_policy_digest
1103
1228
 
1229
+ # Resolve tier/profile policy (canonical) and merge observed guard policies.
1230
+ profile = None
1231
+ explicit_overrides = None
1232
+ try:
1233
+ ctx = report.get("context") if isinstance(report, dict) else None
1234
+ if isinstance(ctx, dict) and ctx.get("profile"):
1235
+ profile = str(ctx.get("profile"))
1236
+ except Exception:
1237
+ profile = None
1238
+ try:
1239
+ window_plan = (
1240
+ report.get("metrics", {}).get("window_plan")
1241
+ if isinstance(report.get("metrics"), dict)
1242
+ else None
1243
+ )
1244
+ if (
1245
+ profile is None
1246
+ and isinstance(window_plan, dict)
1247
+ and window_plan.get("profile")
1248
+ ):
1249
+ profile = str(window_plan.get("profile"))
1250
+ except Exception:
1251
+ profile = None
1252
+ try:
1253
+ meta_cfg = (
1254
+ report.get("meta", {}).get("config")
1255
+ if isinstance(report.get("meta"), dict)
1256
+ else None
1257
+ )
1258
+ if isinstance(meta_cfg, dict) and isinstance(meta_cfg.get("guards"), dict):
1259
+ explicit_overrides = meta_cfg.get("guards")
1260
+ if explicit_overrides is None and isinstance(report.get("config"), dict):
1261
+ cfg2 = report.get("config")
1262
+ if isinstance(cfg2.get("guards"), dict):
1263
+ explicit_overrides = cfg2.get("guards")
1264
+ except Exception:
1265
+ explicit_overrides = None
1266
+
1104
1267
  resolved_policy = _build_resolved_policies(
1105
- auto.get("tier", "balanced"), spectral, rmt, variance
1268
+ auto.get("tier", "balanced"),
1269
+ spectral,
1270
+ rmt,
1271
+ variance,
1272
+ profile=profile,
1273
+ explicit_overrides=explicit_overrides,
1274
+ )
1275
+ overrides_list = _extract_policy_overrides(report)
1276
+ resolved_digest = _compute_policy_digest(
1277
+ {"resolved_policy": resolved_policy, "overrides": overrides_list}
1106
1278
  )
1107
- resolved_digest = _compute_policy_digest(resolved_policy)
1108
- policy_digest_value = variance_policy_digest or resolved_digest
1109
1279
  policy_provenance = {
1110
1280
  "tier": auto.get("tier", "balanced"),
1111
- "overrides": _extract_policy_overrides(report),
1112
- "policy_digest": policy_digest_value,
1281
+ "overrides": overrides_list,
1282
+ "policy_digest": resolved_digest,
1113
1283
  }
1114
- auto["policy_digest"] = policy_digest_value
1284
+ auto["policy_digest"] = resolved_digest
1115
1285
 
1116
1286
  for guard_name in ("spectral", "rmt", "variance"):
1117
1287
  if guard_name in resolved_policy:
@@ -1322,24 +1492,36 @@ def make_certificate(
1322
1492
  capacity_tokens = None
1323
1493
  capacity_examples = None
1324
1494
 
1325
- validation_flags = _compute_validation_flags(
1326
- ppl_analysis,
1327
- spectral,
1328
- rmt,
1329
- invariants,
1330
- auto.get("tier", "balanced"),
1331
- ppl_metrics,
1332
- auto.get("target_pm_ratio"),
1333
- guard_overhead_section,
1334
- report.get("metrics", {}).get("primary_metric")
1495
+ pm_acceptance_range = _resolve_pm_acceptance_range_from_report(report)
1496
+
1497
+ validation_kwargs = {
1498
+ "ppl": ppl_analysis,
1499
+ "spectral": spectral,
1500
+ "rmt": rmt,
1501
+ "invariants": invariants,
1502
+ "tier": auto.get("tier", "balanced"),
1503
+ "_ppl_metrics": ppl_metrics,
1504
+ "target_ratio": auto.get("target_pm_ratio"),
1505
+ "guard_overhead": guard_overhead_section,
1506
+ "primary_metric": report.get("metrics", {}).get("primary_metric")
1335
1507
  if isinstance(report.get("metrics"), dict)
1336
1508
  else None,
1337
- moe_section,
1338
- {
1509
+ "moe": moe_section,
1510
+ "dataset_capacity": {
1339
1511
  "tokens_available": capacity_tokens,
1340
1512
  "examples_available": capacity_examples,
1341
1513
  },
1342
- )
1514
+ }
1515
+ try:
1516
+ if (
1517
+ "pm_acceptance_range"
1518
+ in inspect.signature(_compute_validation_flags).parameters
1519
+ ):
1520
+ validation_kwargs["pm_acceptance_range"] = pm_acceptance_range
1521
+ except Exception: # pragma: no cover - defensive against patched functions
1522
+ validation_kwargs["pm_acceptance_range"] = pm_acceptance_range
1523
+
1524
+ validation_flags = _compute_validation_flags(**validation_kwargs)
1343
1525
  # Enforce validation key allow-list to prevent surface drift
1344
1526
  _allowed_validation = _load_validation_allowlist()
1345
1527
  validation_filtered = {
@@ -1460,16 +1642,17 @@ def make_certificate(
1460
1642
  or (baseline_hash != thresholds_hash)
1461
1643
  )
1462
1644
 
1463
- # Hysteresis knobs snapshot
1464
- try:
1465
- metrics_policy = TIER_POLICIES.get(cur_tier, {}).get("metrics", {})
1466
- except Exception: # pragma: no cover
1645
+ # Hysteresis knobs snapshot (policy-resolved)
1646
+ metrics_policy = (
1647
+ resolved_policy.get("metrics", {}) if isinstance(resolved_policy, dict) else {}
1648
+ )
1649
+ if not isinstance(metrics_policy, dict):
1467
1650
  metrics_policy = {}
1468
1651
  ppl_hys = 0.0
1469
1652
  acc_hys = 0.0
1470
1653
  try:
1471
1654
  ppl_hys = float(
1472
- (metrics_policy.get("ppl") or {}).get("hysteresis_ratio", 0.0) or 0.0
1655
+ (metrics_policy.get("pm_ratio") or {}).get("hysteresis_ratio", 0.0) or 0.0
1473
1656
  )
1474
1657
  acc_hys = float(
1475
1658
  (metrics_policy.get("accuracy") or {}).get("hysteresis_delta_pp", 0.0)
@@ -2191,11 +2374,24 @@ def _format_epsilon_map(epsilon_map: Any) -> dict[str, float]:
2191
2374
 
2192
2375
 
2193
2376
  def _build_resolved_policies(
2194
- tier: str, spectral: dict[str, Any], rmt: dict[str, Any], variance: dict[str, Any]
2377
+ tier: str,
2378
+ spectral: dict[str, Any],
2379
+ rmt: dict[str, Any],
2380
+ variance: dict[str, Any],
2381
+ *,
2382
+ profile: str | None = None,
2383
+ explicit_overrides: dict[str, dict[str, Any]] | None = None,
2195
2384
  ) -> dict[str, Any]:
2196
2385
  from .policy_utils import _build_resolved_policies as _impl
2197
2386
 
2198
- return _impl(tier, spectral, rmt, variance)
2387
+ return _impl(
2388
+ tier,
2389
+ spectral,
2390
+ rmt,
2391
+ variance,
2392
+ profile=profile,
2393
+ explicit_overrides=explicit_overrides,
2394
+ )
2199
2395
 
2200
2396
 
2201
2397
  def _compute_policy_digest(policy: dict[str, Any]) -> str:
@@ -2266,6 +2462,23 @@ def _prepare_guard_overhead_section(
2266
2462
  "threshold_percent": threshold * 100,
2267
2463
  "source": str(payload.get("source", "report")),
2268
2464
  }
2465
+ try:
2466
+ mode = payload.get("mode")
2467
+ if mode is None:
2468
+ mode = payload.get("guard_overhead_mode")
2469
+ if isinstance(mode, str) and mode.strip():
2470
+ sanitized["mode"] = mode.strip()
2471
+ except Exception:
2472
+ pass
2473
+ try:
2474
+ skipped = bool(payload.get("skipped", False))
2475
+ if skipped:
2476
+ sanitized["skipped"] = True
2477
+ reason = payload.get("skip_reason")
2478
+ if isinstance(reason, str) and reason.strip():
2479
+ sanitized["skip_reason"] = reason.strip()
2480
+ except Exception:
2481
+ pass
2269
2482
 
2270
2483
  # Prefer structured reports and reuse the validator when available
2271
2484
  bare_report = payload.pop("bare_report", None)
@@ -2436,6 +2649,12 @@ def _propagate_pairing_stats(
2436
2649
  coverage = pa_stats.get("coverage")
2437
2650
  if isinstance(coverage, dict) and coverage:
2438
2651
  stats["coverage"] = coverage
2652
+ bootstrap = pa_stats.get("bootstrap")
2653
+ if isinstance(bootstrap, dict) and bootstrap:
2654
+ stats["bootstrap"] = bootstrap
2655
+ paired_delta_summary = pa_stats.get("paired_delta_summary")
2656
+ if isinstance(paired_delta_summary, dict) and paired_delta_summary:
2657
+ stats["paired_delta_summary"] = paired_delta_summary
2439
2658
  wmf = pa_stats.get("window_match_fraction")
2440
2659
  if wmf is not None:
2441
2660
  stats["window_match_fraction"] = wmf
@@ -2537,6 +2756,103 @@ def _build_provenance_block(
2537
2756
  return provenance
2538
2757
 
2539
2758
 
2759
+ def _resolve_pm_acceptance_range_from_report(
2760
+ report: dict[str, Any] | None,
2761
+ ) -> dict[str, float]:
2762
+ """Resolve primary-metric acceptance bounds from report context/meta/env."""
2763
+
2764
+ base_min = 0.95
2765
+ base_max = 1.10
2766
+
2767
+ def _safe_float(val: Any) -> float | None:
2768
+ try:
2769
+ if val is None:
2770
+ return None
2771
+ return float(val)
2772
+ except Exception:
2773
+ return None
2774
+
2775
+ cfg_min = None
2776
+ cfg_max = None
2777
+ ctx = report.get("context") if isinstance(report, dict) else None
2778
+ if isinstance(ctx, dict):
2779
+ pm_ctx = (
2780
+ ctx.get("primary_metric")
2781
+ if isinstance(ctx.get("primary_metric"), dict)
2782
+ else {}
2783
+ )
2784
+ if isinstance(pm_ctx, dict):
2785
+ cfg_min = _safe_float(pm_ctx.get("acceptance_range", {}).get("min"))
2786
+ cfg_max = _safe_float(pm_ctx.get("acceptance_range", {}).get("max"))
2787
+ if cfg_min is None or cfg_max is None:
2788
+ alt = ctx.get("pm_acceptance_range")
2789
+ if isinstance(alt, dict):
2790
+ cfg_min = (
2791
+ cfg_min if cfg_min is not None else _safe_float(alt.get("min"))
2792
+ )
2793
+ cfg_max = (
2794
+ cfg_max if cfg_max is not None else _safe_float(alt.get("max"))
2795
+ )
2796
+
2797
+ if (cfg_min is None or cfg_max is None) and isinstance(report, dict):
2798
+ meta = report.get("meta")
2799
+ if isinstance(meta, dict):
2800
+ meta_range = meta.get("pm_acceptance_range")
2801
+ if isinstance(meta_range, dict):
2802
+ cfg_min = (
2803
+ cfg_min
2804
+ if cfg_min is not None
2805
+ else _safe_float(meta_range.get("min"))
2806
+ )
2807
+ cfg_max = (
2808
+ cfg_max
2809
+ if cfg_max is not None
2810
+ else _safe_float(meta_range.get("max"))
2811
+ )
2812
+
2813
+ def _parse_env(name: str) -> float | None:
2814
+ try:
2815
+ raw = os.environ.get(name, "")
2816
+ if raw is None or str(raw).strip() == "":
2817
+ return None
2818
+ return float(raw)
2819
+ except Exception:
2820
+ return None
2821
+
2822
+ env_min = _parse_env("INVARLOCK_PM_ACCEPTANCE_MIN")
2823
+ env_max = _parse_env("INVARLOCK_PM_ACCEPTANCE_MAX")
2824
+
2825
+ has_explicit = any(v is not None for v in (cfg_min, cfg_max, env_min, env_max))
2826
+ if not has_explicit:
2827
+ return {}
2828
+
2829
+ min_val = (
2830
+ env_min if env_min is not None else cfg_min if cfg_min is not None else base_min
2831
+ )
2832
+ max_val = (
2833
+ env_max if env_max is not None else cfg_max if cfg_max is not None else base_max
2834
+ )
2835
+
2836
+ try:
2837
+ if min_val is not None and min_val <= 0:
2838
+ min_val = base_min
2839
+ except Exception:
2840
+ min_val = base_min
2841
+ try:
2842
+ if max_val is not None and max_val <= 0:
2843
+ max_val = base_max
2844
+ except Exception:
2845
+ max_val = base_max
2846
+
2847
+ try:
2848
+ if max_val is not None and min_val is not None and max_val < min_val:
2849
+ max_val = min_val
2850
+ except Exception:
2851
+ max_val = base_max
2852
+
2853
+ return {"min": float(min_val), "max": float(max_val)}
2854
+
2855
+
2540
2856
  def _compute_validation_flags(
2541
2857
  ppl: dict[str, Any],
2542
2858
  spectral: dict[str, Any],
@@ -2549,6 +2865,7 @@ def _compute_validation_flags(
2549
2865
  primary_metric: dict[str, Any] | None = None,
2550
2866
  moe: dict[str, Any] | None = None,
2551
2867
  dataset_capacity: dict[str, Any] | None = None,
2868
+ pm_acceptance_range: dict[str, float] | None = None,
2552
2869
  ) -> dict[str, bool]:
2553
2870
  """Compute validation flags for the certificate including canonical gates."""
2554
2871
  tier = (tier or "balanced").lower()
@@ -2563,13 +2880,50 @@ def _compute_validation_flags(
2563
2880
  }
2564
2881
  if _tiny_relax:
2565
2882
  tier = "aggressive"
2883
+
2566
2884
  tier_thresholds = {
2567
2885
  "conservative": 1.05,
2568
2886
  "balanced": 1.10,
2569
2887
  "aggressive": 1.20,
2570
2888
  "none": 1.10,
2571
2889
  }
2572
- ratio_limit = tier_thresholds.get(tier, 1.10)
2890
+ tier_policies = get_tier_policies()
2891
+ tier_policy = tier_policies.get(tier, tier_policies.get("balanced", {}))
2892
+ metrics_policy = (
2893
+ tier_policy.get("metrics", {}) if isinstance(tier_policy, dict) else {}
2894
+ )
2895
+ pm_policy = (
2896
+ metrics_policy.get("pm_ratio", {}) if isinstance(metrics_policy, dict) else {}
2897
+ )
2898
+ ratio_limit_base = pm_policy.get("ratio_limit_base")
2899
+ try:
2900
+ if ratio_limit_base is not None:
2901
+ ratio_limit_base = float(ratio_limit_base)
2902
+ except Exception:
2903
+ ratio_limit_base = None
2904
+ if not isinstance(ratio_limit_base, (int | float)) or not math.isfinite(
2905
+ float(ratio_limit_base)
2906
+ ):
2907
+ ratio_limit_base = float(tier_thresholds.get(tier, 1.10))
2908
+ acceptance = pm_acceptance_range if isinstance(pm_acceptance_range, dict) else {}
2909
+ ratio_min_bound = None
2910
+ ratio_max_bound = None
2911
+ try:
2912
+ if acceptance.get("min") is not None:
2913
+ ratio_min_bound = float(acceptance.get("min"))
2914
+ except Exception:
2915
+ ratio_min_bound = None
2916
+ try:
2917
+ if acceptance.get("max") is not None:
2918
+ ratio_max_bound = float(acceptance.get("max"))
2919
+ except Exception:
2920
+ ratio_max_bound = None
2921
+
2922
+ ratio_limit = (
2923
+ ratio_max_bound
2924
+ if isinstance(ratio_max_bound, (int | float)) and math.isfinite(ratio_max_bound)
2925
+ else float(ratio_limit_base)
2926
+ )
2573
2927
  if isinstance(target_ratio, int | float) and target_ratio > 0:
2574
2928
  ratio_limit = min(ratio_limit, float(target_ratio))
2575
2929
 
@@ -2597,13 +2951,6 @@ def _compute_validation_flags(
2597
2951
  except Exception: # pragma: no cover
2598
2952
  pass
2599
2953
  # Hysteresis and sample-size floors from tier policies
2600
- tier_policy = TIER_POLICIES.get(tier, {}) if isinstance(tier, str) else {}
2601
- metrics_policy = (
2602
- tier_policy.get("metrics", {}) if isinstance(tier_policy, dict) else {}
2603
- )
2604
- pm_policy = (
2605
- metrics_policy.get("pm_ratio", {}) if isinstance(metrics_policy, dict) else {}
2606
- )
2607
2954
  hysteresis_ratio = float(pm_policy.get("hysteresis_ratio", 0.0))
2608
2955
  min_tokens = int(pm_policy.get("min_tokens", 0))
2609
2956
  # Evaluate sample-size sufficiency
@@ -2636,9 +2983,18 @@ def _compute_validation_flags(
2636
2983
  tokens_ok_eff = tokens_ok or _tiny_relax
2637
2984
  # Apply hysteresis to ratio limit if needed
2638
2985
  ratio_limit_with_hyst = ratio_limit + max(0.0, hysteresis_ratio)
2986
+ lower_bound_ok = True
2987
+ if ratio_min_bound is not None and isinstance(ratio_vs_baseline, (int | float)):
2988
+ try:
2989
+ lower_bound_ok = math.isfinite(float(ratio_vs_baseline)) and (
2990
+ float(ratio_vs_baseline) >= float(ratio_min_bound)
2991
+ )
2992
+ except Exception:
2993
+ lower_bound_ok = True
2639
2994
  compression_acceptable = (
2640
2995
  isinstance(ratio_vs_baseline, int | float)
2641
2996
  and math.isfinite(ratio_vs_baseline)
2997
+ and lower_bound_ok
2642
2998
  and ratio_vs_baseline <= ratio_limit_with_hyst
2643
2999
  and tokens_ok_eff
2644
3000
  )
@@ -2655,7 +3011,9 @@ def _compute_validation_flags(
2655
3011
  and all(isinstance(x, int | float) and math.isfinite(x) for x in ratio_ci)
2656
3012
  ):
2657
3013
  compression_acceptable = (
2658
- compression_acceptable and ratio_ci[1] <= ratio_limit_with_hyst
3014
+ compression_acceptable
3015
+ and ratio_ci[1] <= ratio_limit_with_hyst
3016
+ and (ratio_min_bound is None or ratio_ci[0] >= ratio_min_bound)
2659
3017
  )
2660
3018
 
2661
3019
  # 3. RMT ε-rule compliance
@@ -2664,7 +3022,9 @@ def _compute_validation_flags(
2664
3022
  summary = spectral.get("summary", {}) if isinstance(spectral, dict) else {}
2665
3023
  max_caps = spectral.get("max_caps") or summary.get("max_caps")
2666
3024
  if max_caps is None:
2667
- default_spectral = TIER_POLICIES.get(tier, {}).get("spectral", {})
3025
+ default_spectral = (
3026
+ tier_policy.get("spectral", {}) if isinstance(tier_policy, dict) else {}
3027
+ )
2668
3028
  max_caps = default_spectral.get("max_caps", 5)
2669
3029
  spectral_stable = spectral.get("caps_applied", 0) <= int(max_caps)
2670
3030
  if spectral.get("caps_exceeded"):
@@ -2731,14 +3091,6 @@ def _compute_validation_flags(
2731
3091
  flags["primary_metric_acceptable"] = bool(ok)
2732
3092
  elif kind in {"accuracy", "vqa_accuracy"}:
2733
3093
  # Read thresholds from tier policy if available
2734
- tier_policy = (
2735
- TIER_POLICIES.get(tier, {}) if isinstance(tier, str) else {}
2736
- )
2737
- metrics_policy = (
2738
- tier_policy.get("metrics", {})
2739
- if isinstance(tier_policy, dict)
2740
- else {}
2741
- )
2742
3094
  acc_policy = (
2743
3095
  metrics_policy.get("accuracy", {})
2744
3096
  if isinstance(metrics_policy, dict)
invarlock/reporting/certificate_schema.py +4 -1
@@ -29,6 +29,7 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
29
29
  "plugins",
30
30
  "meta",
31
31
  "dataset",
32
+ "primary_metric",
32
33
  ],
33
34
  "properties": {
34
35
  "schema_version": {"const": CERTIFICATE_SCHEMA_VERSION},
@@ -64,11 +65,12 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
64
65
  "seq_len": {"type": "integer", "minimum": 1},
65
66
  "windows": {
66
67
  "type": "object",
67
- "required": ["preview", "final"],
68
+ "required": ["preview", "final", "stats"],
68
69
  "properties": {
69
70
  "preview": {"type": "integer", "minimum": 0},
70
71
  "final": {"type": "integer", "minimum": 0},
71
72
  "seed": {"type": "integer"},
73
+ "stats": {"type": "object"},
72
74
  },
73
75
  },
74
76
  },
@@ -77,6 +79,7 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
77
79
  # ppl_* block removed from required schema; may appear for ppl-like tasks but is optional
78
80
  "primary_metric": {
79
81
  "type": "object",
82
+ "required": ["kind"],
80
83
  "properties": {
81
84
  "kind": {"type": "string"},
82
85
  "unit": {"type": "string"},