invarlock 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. invarlock/__init__.py +1 -1
  2. invarlock/_data/runtime/tiers.yaml +61 -0
  3. invarlock/adapters/hf_loading.py +97 -0
  4. invarlock/calibration/__init__.py +6 -0
  5. invarlock/calibration/spectral_null.py +301 -0
  6. invarlock/calibration/variance_ve.py +154 -0
  7. invarlock/cli/app.py +15 -0
  8. invarlock/cli/commands/calibrate.py +576 -0
  9. invarlock/cli/commands/doctor.py +9 -3
  10. invarlock/cli/commands/explain_gates.py +53 -9
  11. invarlock/cli/commands/plugins.py +12 -2
  12. invarlock/cli/commands/run.py +175 -79
  13. invarlock/cli/commands/verify.py +40 -0
  14. invarlock/cli/determinism.py +237 -0
  15. invarlock/core/auto_tuning.py +215 -17
  16. invarlock/core/registry.py +9 -4
  17. invarlock/eval/bench.py +467 -141
  18. invarlock/eval/bench_regression.py +12 -0
  19. invarlock/eval/data.py +29 -7
  20. invarlock/guards/spectral.py +216 -9
  21. invarlock/guards/variance.py +6 -3
  22. invarlock/reporting/certificate.py +249 -37
  23. invarlock/reporting/certificate_schema.py +4 -1
  24. invarlock/reporting/guards_analysis.py +108 -10
  25. invarlock/reporting/normalizer.py +21 -1
  26. invarlock/reporting/policy_utils.py +100 -16
  27. {invarlock-0.3.1.dist-info → invarlock-0.3.2.dist-info}/METADATA +12 -10
  28. {invarlock-0.3.1.dist-info → invarlock-0.3.2.dist-info}/RECORD +32 -25
  29. {invarlock-0.3.1.dist-info → invarlock-0.3.2.dist-info}/WHEEL +0 -0
  30. {invarlock-0.3.1.dist-info → invarlock-0.3.2.dist-info}/entry_points.txt +0 -0
  31. {invarlock-0.3.1.dist-info → invarlock-0.3.2.dist-info}/licenses/LICENSE +0 -0
  32. {invarlock-0.3.1.dist-info → invarlock-0.3.2.dist-info}/top_level.txt +0 -0
@@ -29,7 +29,7 @@ try: # pragma: no cover - exercised in integration
29
29
  except Exception: # pragma: no cover
30
30
  jsonschema = None # type: ignore
31
31
 
32
- from invarlock.core.auto_tuning import TIER_POLICIES
32
+ from invarlock.core.auto_tuning import get_tier_policies
33
33
  from invarlock.core.bootstrap import (
34
34
  compute_paired_delta_log_ci,
35
35
  logspace_to_ratio_ci,
@@ -598,6 +598,18 @@ def make_certificate(
598
598
  except Exception: # pragma: no cover
599
599
  pass
600
600
 
601
+ # Determinism preset (CI/Release provenance) when present.
602
+ try:
603
+ det = (
604
+ report.get("meta", {}).get("determinism")
605
+ if isinstance(report.get("meta"), dict)
606
+ else None
607
+ )
608
+ if isinstance(det, dict) and det:
609
+ meta["determinism"] = det
610
+ except Exception: # pragma: no cover
611
+ pass
612
+
601
613
  tokenizer_hash_meta = report["meta"].get("tokenizer_hash")
602
614
  if not tokenizer_hash_meta:
603
615
  dataset_section = report.get("data", {})
@@ -627,6 +639,13 @@ def make_certificate(
627
639
 
628
640
  # Extract dataset configuration and compute hashes
629
641
  dataset_info = _extract_dataset_info(report)
642
+ try:
643
+ if isinstance(dataset_info, dict):
644
+ windows = dataset_info.get("windows")
645
+ if isinstance(windows, dict):
646
+ windows.setdefault("stats", {})
647
+ except Exception: # pragma: no cover
648
+ pass
630
649
 
631
650
  # Baseline reference (PM-only). Derive a primary_metric snapshot from baseline windows.
632
651
  # Prefer explicit baseline primary_metric when provided; otherwise compute from windows
@@ -741,15 +760,17 @@ def make_certificate(
741
760
  tier = str(auto_cfg.get("tier")).lower()
742
761
  except Exception: # pragma: no cover
743
762
  pass
763
+ tier_policies = get_tier_policies()
764
+ tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
744
765
  metrics_policy = (
745
- TIER_POLICIES.get(tier, {}).get("metrics", {})
746
- if isinstance(tier, str)
747
- else {}
766
+ tier_defaults.get("metrics", {}) if isinstance(tier_defaults, dict) else {}
748
767
  )
749
- ppl_policy = (
750
- metrics_policy.get("ppl", {}) if isinstance(metrics_policy, dict) else {}
768
+ pm_policy = (
769
+ metrics_policy.get("pm_ratio", {})
770
+ if isinstance(metrics_policy, dict)
771
+ else {}
751
772
  )
752
- min_tokens = int(ppl_policy.get("min_tokens", 0))
773
+ min_tokens = int(pm_policy.get("min_tokens", 0))
753
774
  if (
754
775
  isinstance(total_tokens, int)
755
776
  and min_tokens > 0
@@ -1053,6 +1074,109 @@ def make_certificate(
1053
1074
  if key in metrics_stats_source:
1054
1075
  ppl_analysis["stats"][key] = metrics_stats_source[key]
1055
1076
 
1077
+ # Derive requested/actual window counts for auditability when runners do not
1078
+ # emit a metrics.stats block (normalization may also drop it).
1079
+ try:
1080
+ stats_obj = ppl_analysis.get("stats", {})
1081
+ if isinstance(stats_obj, dict):
1082
+
1083
+ def _as_count(value: Any) -> int | None:
1084
+ if value is None or isinstance(value, bool):
1085
+ return None
1086
+ if isinstance(value, int):
1087
+ return int(value) if value >= 0 else None
1088
+ if isinstance(value, float) and math.isfinite(value):
1089
+ if abs(value - round(value)) > 1e-9 or value < 0:
1090
+ return None
1091
+ return int(round(value))
1092
+ return None
1093
+
1094
+ data_cfg = report.get("data", {}) if isinstance(report, dict) else {}
1095
+ data_cfg = data_cfg if isinstance(data_cfg, dict) else {}
1096
+ windows_cfg = (
1097
+ dataset_info.get("windows", {})
1098
+ if isinstance(dataset_info, dict)
1099
+ else {}
1100
+ )
1101
+ windows_cfg = windows_cfg if isinstance(windows_cfg, dict) else {}
1102
+
1103
+ req_prev = _as_count(stats_obj.get("requested_preview"))
1104
+ if req_prev is None:
1105
+ req_prev = _as_count(data_cfg.get("preview_n"))
1106
+ if req_prev is None:
1107
+ req_prev = _as_count(windows_cfg.get("preview"))
1108
+
1109
+ req_fin = _as_count(stats_obj.get("requested_final"))
1110
+ if req_fin is None:
1111
+ req_fin = _as_count(data_cfg.get("final_n"))
1112
+ if req_fin is None:
1113
+ req_fin = _as_count(windows_cfg.get("final"))
1114
+
1115
+ eval_windows = (
1116
+ report.get("evaluation_windows", {}) if isinstance(report, dict) else {}
1117
+ )
1118
+ eval_windows = eval_windows if isinstance(eval_windows, dict) else {}
1119
+
1120
+ def _len_ids(section: Any) -> int | None:
1121
+ if not isinstance(section, dict):
1122
+ return None
1123
+ ids = section.get("window_ids")
1124
+ if isinstance(ids, list):
1125
+ return int(len(ids))
1126
+ return None
1127
+
1128
+ act_prev = _as_count(stats_obj.get("actual_preview"))
1129
+ if act_prev is None:
1130
+ act_prev = _len_ids(eval_windows.get("preview"))
1131
+ if act_prev is None:
1132
+ cov_prev = (
1133
+ coverage_summary.get("preview")
1134
+ if isinstance(coverage_summary, dict)
1135
+ else None
1136
+ )
1137
+ if isinstance(cov_prev, dict):
1138
+ act_prev = _as_count(cov_prev.get("used"))
1139
+ if act_prev is None:
1140
+ act_prev = req_prev
1141
+
1142
+ act_fin = _as_count(stats_obj.get("actual_final"))
1143
+ if act_fin is None:
1144
+ act_fin = _len_ids(eval_windows.get("final"))
1145
+ if act_fin is None:
1146
+ cov_fin = (
1147
+ coverage_summary.get("final")
1148
+ if isinstance(coverage_summary, dict)
1149
+ else None
1150
+ )
1151
+ if isinstance(cov_fin, dict):
1152
+ act_fin = _as_count(cov_fin.get("used"))
1153
+ elif isinstance(coverage_summary, dict):
1154
+ act_fin = _as_count(coverage_summary.get("used"))
1155
+ if act_fin is None:
1156
+ act_fin = req_fin
1157
+
1158
+ if req_prev is not None:
1159
+ stats_obj.setdefault("requested_preview", req_prev)
1160
+ if req_fin is not None:
1161
+ stats_obj.setdefault("requested_final", req_fin)
1162
+ if act_prev is not None:
1163
+ stats_obj.setdefault("actual_preview", act_prev)
1164
+ if act_fin is not None:
1165
+ stats_obj.setdefault("actual_final", act_fin)
1166
+
1167
+ if "coverage_ok" not in stats_obj:
1168
+ if (
1169
+ isinstance(req_prev, int)
1170
+ and isinstance(req_fin, int)
1171
+ and isinstance(act_prev, int)
1172
+ and isinstance(act_fin, int)
1173
+ ):
1174
+ stats_obj["coverage_ok"] = (act_prev >= req_prev) and (
1175
+ act_fin >= req_fin
1176
+ )
1177
+ except Exception: # pragma: no cover
1178
+ pass
1179
+
1056
1180
  if isinstance(window_plan_ctx, dict):
1057
1181
  ppl_analysis["window_plan"] = window_plan_ctx
1058
1182
 
@@ -1102,17 +1226,62 @@ def make_certificate(
1102
1226
  if variance_policy_digest:
1103
1227
  policies["variance"]["policy_digest"] = variance_policy_digest
1104
1228
 
1229
+ # Resolve tier/profile policy (canonical) and merge observed guard policies.
1230
+ profile = None
1231
+ explicit_overrides = None
1232
+ try:
1233
+ ctx = report.get("context") if isinstance(report, dict) else None
1234
+ if isinstance(ctx, dict) and ctx.get("profile"):
1235
+ profile = str(ctx.get("profile"))
1236
+ except Exception:
1237
+ profile = None
1238
+ try:
1239
+ window_plan = (
1240
+ report.get("metrics", {}).get("window_plan")
1241
+ if isinstance(report.get("metrics"), dict)
1242
+ else None
1243
+ )
1244
+ if (
1245
+ profile is None
1246
+ and isinstance(window_plan, dict)
1247
+ and window_plan.get("profile")
1248
+ ):
1249
+ profile = str(window_plan.get("profile"))
1250
+ except Exception:
1251
+ profile = None
1252
+ try:
1253
+ meta_cfg = (
1254
+ report.get("meta", {}).get("config")
1255
+ if isinstance(report.get("meta"), dict)
1256
+ else None
1257
+ )
1258
+ if isinstance(meta_cfg, dict) and isinstance(meta_cfg.get("guards"), dict):
1259
+ explicit_overrides = meta_cfg.get("guards")
1260
+ if explicit_overrides is None and isinstance(report.get("config"), dict):
1261
+ cfg2 = report.get("config")
1262
+ if isinstance(cfg2.get("guards"), dict):
1263
+ explicit_overrides = cfg2.get("guards")
1264
+ except Exception:
1265
+ explicit_overrides = None
1266
+
1105
1267
  resolved_policy = _build_resolved_policies(
1106
- auto.get("tier", "balanced"), spectral, rmt, variance
1268
+ auto.get("tier", "balanced"),
1269
+ spectral,
1270
+ rmt,
1271
+ variance,
1272
+ profile=profile,
1273
+ explicit_overrides=explicit_overrides,
1274
+ )
1275
+ overrides_list = _extract_policy_overrides(report)
1276
+ resolved_digest = _compute_policy_digest(
1277
+ {"resolved_policy": resolved_policy, "overrides": overrides_list}
1107
1278
  )
1108
- resolved_digest = _compute_policy_digest(resolved_policy)
1109
- policy_digest_value = variance_policy_digest or resolved_digest
1110
1279
  policy_provenance = {
1111
1280
  "tier": auto.get("tier", "balanced"),
1112
- "overrides": _extract_policy_overrides(report),
1113
- "policy_digest": policy_digest_value,
1281
+ "overrides": overrides_list,
1282
+ "policy_digest": resolved_digest,
1114
1283
  }
1115
- auto["policy_digest"] = policy_digest_value
1284
+ auto["policy_digest"] = resolved_digest
1116
1285
 
1117
1286
  for guard_name in ("spectral", "rmt", "variance"):
1118
1287
  if guard_name in resolved_policy:
@@ -1473,16 +1642,17 @@ def make_certificate(
1473
1642
  or (baseline_hash != thresholds_hash)
1474
1643
  )
1475
1644
 
1476
- # Hysteresis knobs snapshot
1477
- try:
1478
- metrics_policy = TIER_POLICIES.get(cur_tier, {}).get("metrics", {})
1479
- except Exception: # pragma: no cover
1645
+ # Hysteresis knobs snapshot (policy-resolved)
1646
+ metrics_policy = (
1647
+ resolved_policy.get("metrics", {}) if isinstance(resolved_policy, dict) else {}
1648
+ )
1649
+ if not isinstance(metrics_policy, dict):
1480
1650
  metrics_policy = {}
1481
1651
  ppl_hys = 0.0
1482
1652
  acc_hys = 0.0
1483
1653
  try:
1484
1654
  ppl_hys = float(
1485
- (metrics_policy.get("ppl") or {}).get("hysteresis_ratio", 0.0) or 0.0
1655
+ (metrics_policy.get("pm_ratio") or {}).get("hysteresis_ratio", 0.0) or 0.0
1486
1656
  )
1487
1657
  acc_hys = float(
1488
1658
  (metrics_policy.get("accuracy") or {}).get("hysteresis_delta_pp", 0.0)
@@ -2204,11 +2374,24 @@ def _format_epsilon_map(epsilon_map: Any) -> dict[str, float]:
2204
2374
 
2205
2375
 
2206
2376
  def _build_resolved_policies(
2207
- tier: str, spectral: dict[str, Any], rmt: dict[str, Any], variance: dict[str, Any]
2377
+ tier: str,
2378
+ spectral: dict[str, Any],
2379
+ rmt: dict[str, Any],
2380
+ variance: dict[str, Any],
2381
+ *,
2382
+ profile: str | None = None,
2383
+ explicit_overrides: dict[str, dict[str, Any]] | None = None,
2208
2384
  ) -> dict[str, Any]:
2209
2385
  from .policy_utils import _build_resolved_policies as _impl
2210
2386
 
2211
- return _impl(tier, spectral, rmt, variance)
2387
+ return _impl(
2388
+ tier,
2389
+ spectral,
2390
+ rmt,
2391
+ variance,
2392
+ profile=profile,
2393
+ explicit_overrides=explicit_overrides,
2394
+ )
2212
2395
 
2213
2396
 
2214
2397
  def _compute_policy_digest(policy: dict[str, Any]) -> str:
@@ -2279,6 +2462,23 @@ def _prepare_guard_overhead_section(
2279
2462
  "threshold_percent": threshold * 100,
2280
2463
  "source": str(payload.get("source", "report")),
2281
2464
  }
2465
+ try:
2466
+ mode = payload.get("mode")
2467
+ if mode is None:
2468
+ mode = payload.get("guard_overhead_mode")
2469
+ if isinstance(mode, str) and mode.strip():
2470
+ sanitized["mode"] = mode.strip()
2471
+ except Exception:
2472
+ pass
2473
+ try:
2474
+ skipped = bool(payload.get("skipped", False))
2475
+ if skipped:
2476
+ sanitized["skipped"] = True
2477
+ reason = payload.get("skip_reason")
2478
+ if isinstance(reason, str) and reason.strip():
2479
+ sanitized["skip_reason"] = reason.strip()
2480
+ except Exception:
2481
+ pass
2282
2482
 
2283
2483
  # Prefer structured reports and reuse the validator when available
2284
2484
  bare_report = payload.pop("bare_report", None)
@@ -2449,6 +2649,12 @@ def _propagate_pairing_stats(
2449
2649
  coverage = pa_stats.get("coverage")
2450
2650
  if isinstance(coverage, dict) and coverage:
2451
2651
  stats["coverage"] = coverage
2652
+ bootstrap = pa_stats.get("bootstrap")
2653
+ if isinstance(bootstrap, dict) and bootstrap:
2654
+ stats["bootstrap"] = bootstrap
2655
+ paired_delta_summary = pa_stats.get("paired_delta_summary")
2656
+ if isinstance(paired_delta_summary, dict) and paired_delta_summary:
2657
+ stats["paired_delta_summary"] = paired_delta_summary
2452
2658
  wmf = pa_stats.get("window_match_fraction")
2453
2659
  if wmf is not None:
2454
2660
  stats["window_match_fraction"] = wmf
@@ -2674,12 +2880,31 @@ def _compute_validation_flags(
2674
2880
  }
2675
2881
  if _tiny_relax:
2676
2882
  tier = "aggressive"
2883
+
2677
2884
  tier_thresholds = {
2678
2885
  "conservative": 1.05,
2679
2886
  "balanced": 1.10,
2680
2887
  "aggressive": 1.20,
2681
2888
  "none": 1.10,
2682
2889
  }
2890
+ tier_policies = get_tier_policies()
2891
+ tier_policy = tier_policies.get(tier, tier_policies.get("balanced", {}))
2892
+ metrics_policy = (
2893
+ tier_policy.get("metrics", {}) if isinstance(tier_policy, dict) else {}
2894
+ )
2895
+ pm_policy = (
2896
+ metrics_policy.get("pm_ratio", {}) if isinstance(metrics_policy, dict) else {}
2897
+ )
2898
+ ratio_limit_base = pm_policy.get("ratio_limit_base")
2899
+ try:
2900
+ if ratio_limit_base is not None:
2901
+ ratio_limit_base = float(ratio_limit_base)
2902
+ except Exception:
2903
+ ratio_limit_base = None
2904
+ if not isinstance(ratio_limit_base, (int | float)) or not math.isfinite(
2905
+ float(ratio_limit_base)
2906
+ ):
2907
+ ratio_limit_base = float(tier_thresholds.get(tier, 1.10))
2683
2908
  acceptance = pm_acceptance_range if isinstance(pm_acceptance_range, dict) else {}
2684
2909
  ratio_min_bound = None
2685
2910
  ratio_max_bound = None
@@ -2697,7 +2922,7 @@ def _compute_validation_flags(
2697
2922
  ratio_limit = (
2698
2923
  ratio_max_bound
2699
2924
  if isinstance(ratio_max_bound, (int | float)) and math.isfinite(ratio_max_bound)
2700
- else tier_thresholds.get(tier, 1.10)
2925
+ else float(ratio_limit_base)
2701
2926
  )
2702
2927
  if isinstance(target_ratio, int | float) and target_ratio > 0:
2703
2928
  ratio_limit = min(ratio_limit, float(target_ratio))
@@ -2726,13 +2951,6 @@ def _compute_validation_flags(
2726
2951
  except Exception: # pragma: no cover
2727
2952
  pass
2728
2953
  # Hysteresis and sample-size floors from tier policies
2729
- tier_policy = TIER_POLICIES.get(tier, {}) if isinstance(tier, str) else {}
2730
- metrics_policy = (
2731
- tier_policy.get("metrics", {}) if isinstance(tier_policy, dict) else {}
2732
- )
2733
- pm_policy = (
2734
- metrics_policy.get("pm_ratio", {}) if isinstance(metrics_policy, dict) else {}
2735
- )
2736
2954
  hysteresis_ratio = float(pm_policy.get("hysteresis_ratio", 0.0))
2737
2955
  min_tokens = int(pm_policy.get("min_tokens", 0))
2738
2956
  # Evaluate sample-size sufficiency
@@ -2804,7 +3022,9 @@ def _compute_validation_flags(
2804
3022
  summary = spectral.get("summary", {}) if isinstance(spectral, dict) else {}
2805
3023
  max_caps = spectral.get("max_caps") or summary.get("max_caps")
2806
3024
  if max_caps is None:
2807
- default_spectral = TIER_POLICIES.get(tier, {}).get("spectral", {})
3025
+ default_spectral = (
3026
+ tier_policy.get("spectral", {}) if isinstance(tier_policy, dict) else {}
3027
+ )
2808
3028
  max_caps = default_spectral.get("max_caps", 5)
2809
3029
  spectral_stable = spectral.get("caps_applied", 0) <= int(max_caps)
2810
3030
  if spectral.get("caps_exceeded"):
@@ -2871,14 +3091,6 @@ def _compute_validation_flags(
2871
3091
  flags["primary_metric_acceptable"] = bool(ok)
2872
3092
  elif kind in {"accuracy", "vqa_accuracy"}:
2873
3093
  # Read thresholds from tier policy if available
2874
- tier_policy = (
2875
- TIER_POLICIES.get(tier, {}) if isinstance(tier, str) else {}
2876
- )
2877
- metrics_policy = (
2878
- tier_policy.get("metrics", {})
2879
- if isinstance(tier_policy, dict)
2880
- else {}
2881
- )
2882
3094
  acc_policy = (
2883
3095
  metrics_policy.get("accuracy", {})
2884
3096
  if isinstance(metrics_policy, dict)
@@ -29,6 +29,7 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
29
29
  "plugins",
30
30
  "meta",
31
31
  "dataset",
32
+ "primary_metric",
32
33
  ],
33
34
  "properties": {
34
35
  "schema_version": {"const": CERTIFICATE_SCHEMA_VERSION},
@@ -64,11 +65,12 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
64
65
  "seq_len": {"type": "integer", "minimum": 1},
65
66
  "windows": {
66
67
  "type": "object",
67
- "required": ["preview", "final"],
68
+ "required": ["preview", "final", "stats"],
68
69
  "properties": {
69
70
  "preview": {"type": "integer", "minimum": 0},
70
71
  "final": {"type": "integer", "minimum": 0},
71
72
  "seed": {"type": "integer"},
73
+ "stats": {"type": "object"},
72
74
  },
73
75
  },
74
76
  },
@@ -77,6 +79,7 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
77
79
  # ppl_* block removed from required schema; may appear for ppl-like tasks but is optional
78
80
  "primary_metric": {
79
81
  "type": "object",
82
+ "required": ["kind"],
80
83
  "properties": {
81
84
  "kind": {"type": "string"},
82
85
  "unit": {"type": "string"},
@@ -4,7 +4,7 @@ from __future__ import annotations
4
4
  import math
5
5
  from typing import Any, no_type_check
6
6
 
7
- from invarlock.core.auto_tuning import TIER_POLICIES
7
+ from invarlock.core.auto_tuning import get_tier_policies
8
8
 
9
9
  from .policy_utils import _promote_legacy_multiple_testing_key, _resolve_policy_tier
10
10
  from .report_types import RunReport
@@ -133,7 +133,8 @@ def _extract_spectral_analysis(
133
133
  report: RunReport, baseline: dict[str, Any]
134
134
  ) -> dict[str, Any]:
135
135
  tier = _resolve_policy_tier(report)
136
- tier_defaults = TIER_POLICIES.get(tier, TIER_POLICIES.get("balanced", {}))
136
+ tier_policies = get_tier_policies()
137
+ tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
137
138
  spectral_defaults = tier_defaults.get("spectral", {}) if tier_defaults else {}
138
139
  default_sigma_quantile = spectral_defaults.get("sigma_quantile", 0.95)
139
140
  default_deadband = spectral_defaults.get("deadband", 0.1)
@@ -166,9 +167,15 @@ def _extract_spectral_analysis(
166
167
  caps_exceeded = (
167
168
  bool(guard_metrics.get("caps_exceeded", False)) if guard_metrics else False
168
169
  )
169
- max_caps = guard_policy.get("max_caps") if guard_policy else None
170
+ max_caps = guard_metrics.get("max_caps") if guard_metrics else None
171
+ if max_caps is None and guard_policy:
172
+ max_caps = guard_policy.get("max_caps")
170
173
  if max_caps is None:
171
174
  max_caps = default_max_caps
175
+ try:
176
+ max_caps = int(max_caps)
177
+ except Exception:
178
+ max_caps = int(default_max_caps)
172
179
 
173
180
  try:
174
181
  max_spectral_norm = float(
@@ -618,10 +625,15 @@ def _extract_rmt_analysis(
618
625
  report: RunReport, baseline: dict[str, Any]
619
626
  ) -> dict[str, Any]:
620
627
  tier = _resolve_policy_tier(report)
621
- tier_defaults = TIER_POLICIES.get(tier, TIER_POLICIES.get("balanced", {}))
628
+ tier_policies = get_tier_policies()
629
+ tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
622
630
  default_epsilon_map = (
623
- tier_defaults.get("rmt", {}).get("epsilon", {}) if tier_defaults else {}
631
+ tier_defaults.get("rmt", {}).get("epsilon_by_family")
632
+ if isinstance(tier_defaults, dict)
633
+ else {}
624
634
  )
635
+ if not default_epsilon_map and isinstance(tier_defaults, dict):
636
+ default_epsilon_map = (tier_defaults.get("rmt", {}) or {}).get("epsilon", {})
625
637
  default_epsilon_map = {
626
638
  str(family): float(value)
627
639
  for family, value in (default_epsilon_map or {}).items()
@@ -631,6 +643,16 @@ def _extract_rmt_analysis(
631
643
  outliers_guarded = 0
632
644
  outliers_bare = 0
633
645
  epsilon_default = 0.1
646
+ try:
647
+ eps_def = (
648
+ tier_defaults.get("rmt", {}).get("epsilon_default")
649
+ if isinstance(tier_defaults, dict)
650
+ else None
651
+ )
652
+ if isinstance(eps_def, int | float) and math.isfinite(float(eps_def)):
653
+ epsilon_default = float(eps_def)
654
+ except Exception:
655
+ pass
634
656
  stable = True
635
657
  explicit_stability = False
636
658
  max_ratio = 0.0
@@ -640,19 +662,54 @@ def _extract_rmt_analysis(
640
662
  baseline_outliers_per_family: dict[str, int] = {}
641
663
  outliers_per_family: dict[str, int] = {}
642
664
  epsilon_violations: list[Any] = []
665
+ margin_used = None
666
+ deadband_used = None
667
+ policy_out: dict[str, Any] | None = None
643
668
 
644
669
  for guard in report.get("guards", []) or []:
645
670
  if str(guard.get("name", "")).lower() == "rmt":
646
671
  guard_metrics = guard.get("metrics", {}) or {}
647
672
  guard_policy = guard.get("policy", {}) or {}
673
+ if isinstance(guard_policy, dict) and guard_policy:
674
+ policy_out = dict(guard_policy)
675
+ if "epsilon_by_family" not in policy_out and isinstance(
676
+ policy_out.get("epsilon"), dict
677
+ ):
678
+ policy_out["epsilon_by_family"] = dict(policy_out["epsilon"])
679
+ if isinstance(policy_out.get("margin"), int | float) and math.isfinite(
680
+ float(policy_out.get("margin"))
681
+ ):
682
+ margin_used = float(policy_out.get("margin"))
683
+ if isinstance(
684
+ policy_out.get("deadband"), int | float
685
+ ) and math.isfinite(float(policy_out.get("deadband"))):
686
+ deadband_used = float(policy_out.get("deadband"))
687
+ if isinstance(
688
+ policy_out.get("epsilon_default"), int | float
689
+ ) and math.isfinite(float(policy_out.get("epsilon_default"))):
690
+ epsilon_default = float(policy_out.get("epsilon_default"))
691
+ if isinstance(
692
+ guard_metrics.get("epsilon_default"), int | float
693
+ ) and math.isfinite(float(guard_metrics.get("epsilon_default"))):
694
+ epsilon_default = float(guard_metrics.get("epsilon_default"))
648
695
  outliers_guarded = guard_metrics.get(
649
696
  "rmt_outliers", guard_metrics.get("layers_flagged", outliers_guarded)
650
697
  )
651
698
  max_ratio = guard_metrics.get("max_ratio", 0.0)
652
- epsilon_default = guard_policy.get(
653
- "deadband", guard_metrics.get("deadband_used", epsilon_default)
654
- )
655
699
  epsilon_map = guard_metrics.get("epsilon_by_family", {}) or epsilon_map
700
+ if not epsilon_map and isinstance(guard_policy, dict):
701
+ eps_src = guard_policy.get("epsilon_by_family") or guard_policy.get(
702
+ "epsilon"
703
+ )
704
+ if isinstance(eps_src, dict):
705
+ try:
706
+ epsilon_map = {
707
+ str(k): float(v)
708
+ for k, v in eps_src.items()
709
+ if isinstance(v, int | float) and math.isfinite(float(v))
710
+ }
711
+ except Exception:
712
+ pass
656
713
  baseline_outliers_per_family = (
657
714
  guard_metrics.get("baseline_outliers_per_family", {})
658
715
  or baseline_outliers_per_family
@@ -844,7 +901,7 @@ def _extract_rmt_analysis(
844
901
  }
845
902
  delta_per_family = {str(k): _to_int(v) for k, v in delta_per_family.items()}
846
903
 
847
- return {
904
+ result = {
848
905
  "outliers_bare": outliers_bare,
849
906
  "outliers_guarded": outliers_guarded,
850
907
  "epsilon": epsilon_scalar,
@@ -862,6 +919,13 @@ def _extract_rmt_analysis(
862
919
  "mean_deviation_ratio": mean_deviation_ratio,
863
920
  "families": family_breakdown,
864
921
  }
922
+ if margin_used is not None:
923
+ result["margin"] = float(margin_used)
924
+ if deadband_used is not None:
925
+ result["deadband"] = float(deadband_used)
926
+ if policy_out:
927
+ result["policy"] = policy_out
928
+ return result
865
929
 
866
930
 
867
931
  @no_type_check
@@ -873,10 +937,14 @@ def _extract_variance_analysis(report: RunReport) -> dict[str, Any]:
873
937
  ratio_ci = None
874
938
  calibration = {}
875
939
  guard_metrics: dict[str, Any] = {}
940
+ guard_policy: dict[str, Any] | None = None
876
941
  for guard in report.get("guards", []) or []:
877
942
  if "variance" in str(guard.get("name", "")).lower():
878
943
  metrics = guard.get("metrics", {}) or {}
879
944
  guard_metrics = metrics
945
+ gp = guard.get("policy", {}) or {}
946
+ if isinstance(gp, dict) and gp:
947
+ guard_policy = dict(gp)
880
948
  ve_enabled = metrics.get("ve_enabled", bool(metrics))
881
949
  gain = metrics.get("ab_gain", metrics.get("gain", None))
882
950
  ppl_no_ve = metrics.get("ppl_no_ve", None)
@@ -932,11 +1000,41 @@ def _extract_variance_analysis(report: RunReport) -> dict[str, Any]:
932
1000
  if guard_metrics.get("ab_windows_used") is not None:
933
1001
  ab_section["windows_used"] = guard_metrics["ab_windows_used"]
934
1002
  if guard_metrics.get("ab_provenance"):
935
- ab_section["provenance"] = guard_metrics["ab_provenance"]
1003
+ prov = guard_metrics["ab_provenance"]
1004
+ if isinstance(prov, dict):
1005
+ prov_out = dict(prov)
1006
+
1007
+ # Normalize a top-level `window_ids` list for docs + auditability.
1008
+ if "window_ids" not in prov_out:
1009
+ window_ids: set[int] = set()
1010
+
1011
+ def _collect(node: Any) -> None:
1012
+ if isinstance(node, dict):
1013
+ ids = node.get("window_ids")
1014
+ if isinstance(ids, list):
1015
+ for wid in ids:
1016
+ if isinstance(wid, int | float):
1017
+ window_ids.add(int(wid))
1018
+ for v in node.values():
1019
+ _collect(v)
1020
+ return
1021
+ if isinstance(node, list):
1022
+ for v in node:
1023
+ _collect(v)
1024
+
1025
+ _collect(prov_out)
1026
+ if window_ids:
1027
+ prov_out["window_ids"] = sorted(window_ids)
1028
+
1029
+ ab_section["provenance"] = prov_out
1030
+ else:
1031
+ ab_section["provenance"] = prov
936
1032
  if guard_metrics.get("ab_point_estimates"):
937
1033
  ab_section["point_estimates"] = guard_metrics["ab_point_estimates"]
938
1034
  if ab_section:
939
1035
  result["ab_test"] = ab_section
1036
+ if guard_policy:
1037
+ result["policy"] = guard_policy
940
1038
  return result
941
1039
 
942
1040