invarlock 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. invarlock/__init__.py +1 -1
  2. invarlock/_data/runtime/tiers.yaml +57 -30
  3. invarlock/adapters/__init__.py +1 -1
  4. invarlock/calibration/spectral_null.py +15 -10
  5. invarlock/calibration/variance_ve.py +0 -2
  6. invarlock/cli/commands/calibrate.py +6 -2
  7. invarlock/cli/commands/certify.py +58 -39
  8. invarlock/cli/commands/doctor.py +3 -1
  9. invarlock/cli/commands/explain_gates.py +57 -8
  10. invarlock/cli/commands/report.py +1 -1
  11. invarlock/cli/commands/run.py +159 -61
  12. invarlock/cli/commands/verify.py +78 -4
  13. invarlock/cli/config.py +21 -5
  14. invarlock/core/api.py +45 -5
  15. invarlock/core/auto_tuning.py +65 -20
  16. invarlock/core/contracts.py +7 -1
  17. invarlock/core/registry.py +2 -2
  18. invarlock/core/runner.py +314 -50
  19. invarlock/eval/bench.py +0 -13
  20. invarlock/eval/data.py +73 -283
  21. invarlock/eval/metrics.py +134 -4
  22. invarlock/eval/primary_metric.py +23 -0
  23. invarlock/eval/tail_stats.py +230 -0
  24. invarlock/guards/_estimators.py +154 -0
  25. invarlock/guards/policies.py +16 -6
  26. invarlock/guards/rmt.py +625 -544
  27. invarlock/guards/spectral.py +348 -110
  28. invarlock/guards/tier_config.py +32 -30
  29. invarlock/guards/variance.py +5 -29
  30. invarlock/guards_ref/rmt_ref.py +23 -23
  31. invarlock/model_profile.py +42 -15
  32. invarlock/reporting/certificate.py +225 -46
  33. invarlock/reporting/certificate_schema.py +2 -1
  34. invarlock/reporting/dataset_hashing.py +15 -2
  35. invarlock/reporting/guards_analysis.py +197 -274
  36. invarlock/reporting/normalizer.py +6 -0
  37. invarlock/reporting/policy_utils.py +38 -36
  38. invarlock/reporting/primary_metric_utils.py +71 -17
  39. invarlock/reporting/render.py +61 -0
  40. invarlock/reporting/report.py +1 -1
  41. invarlock/reporting/report_types.py +5 -2
  42. invarlock/reporting/validate.py +1 -18
  43. {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/METADATA +6 -6
  44. {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/RECORD +48 -46
  45. {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/WHEEL +0 -0
  46. {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/entry_points.txt +0 -0
  47. {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/licenses/LICENSE +0 -0
  48. {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/top_level.txt +0 -0
invarlock/core/runner.py CHANGED
@@ -18,7 +18,18 @@ from typing import Any
 
 import numpy as np
 
-from .api import Guard, ModelAdapter, ModelEdit, RunConfig, RunReport
+from invarlock.eval.tail_stats import evaluate_metric_tail
+
+from .api import (
+    EditLike,
+    Guard,
+    GuardWithContext,
+    GuardWithPrepare,
+    ModelAdapter,
+    ModelEdit,
+    RunConfig,
+    RunReport,
+)
 from .auto_tuning import resolve_tier_policies
 from .bootstrap import (
     compute_logloss_ci,
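Note: the new import block pulls in EditLike, GuardWithContext, and GuardWithPrepare alongside evaluate_metric_tail, and the hunks below swap hasattr probes for isinstance checks against those names. The actual definitions live in invarlock/core/api.py, which is not part of this diff; a minimal, purely illustrative sketch of runtime-checkable protocols that would make those isinstance checks behave like the old hasattr checks:

# Hypothetical sketch only; the real classes are defined in invarlock/core/api.py.
from typing import Any, Protocol, runtime_checkable

@runtime_checkable
class GuardWithContext(Protocol):
    def set_run_context(self, report: Any) -> None: ...

@runtime_checkable
class GuardWithPrepare(Protocol):
    def prepare(self, model: Any, adapter: Any, calibration_data: Any, policy: dict) -> Any: ...

Because runtime-checkable protocols only verify that the named methods exist, isinstance(guard, GuardWithContext) stays behaviorally close to hasattr(guard, "set_run_context") while giving the type checker something concrete to work with.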
@@ -112,7 +123,7 @@ class CoreRunner:
         self,
         model: Any,
         adapter: ModelAdapter,
-        edit: ModelEdit,
+        edit: ModelEdit | EditLike,
         guards: list[Guard],
         config: RunConfig,
         calibration_data: Any = None,
@@ -175,7 +186,7 @@ class CoreRunner:
         config.context["auto"] = dict(auto_config)
         try:
             report.context["auto"] = config.context["auto"]
-        except Exception:
+        except Exception:  # pragma: no cover - defensive context propagation
             pass
 
         report.status = RunStatus.RUNNING.value
@@ -303,10 +314,10 @@ class CoreRunner:
         self,
         model: Any,
         adapter: ModelAdapter,
-        edit: ModelEdit,
+        edit: ModelEdit | EditLike,
         model_desc: dict[str, Any],
         report: RunReport,
-        edit_config: dict[str, Any] | None = None,
+        edit_config: dict[str, Any] | None,
     ) -> dict[str, Any]:
         """Phase 2: Apply edit operation."""
         edit_label = "baseline" if edit.name == "baseline" else edit.name
@@ -388,7 +399,7 @@ class CoreRunner:
                 {"guard": guard.name, "policy": guard_policy},
             )
 
-            if hasattr(guard, "set_run_context"):
+            if isinstance(guard, GuardWithContext):
                 try:
                     guard.set_run_context(report)
                 except Exception as exc:
@@ -400,7 +411,7 @@ class CoreRunner:
                 )
 
             # Call prepare method if it exists (most guards need this)
-            if hasattr(guard, "prepare"):
+            if isinstance(guard, GuardWithPrepare):
                 prepare_result = guard.prepare(
                     model, adapter, calibration_data, guard_policy
                 )
@@ -454,7 +465,7 @@ class CoreRunner:
         for guard in guards:
             self._log_event("guard", "start", LogLevel.INFO, {"guard": guard.name})
 
-            if hasattr(guard, "set_run_context"):
+            if isinstance(guard, GuardWithContext):
                 try:
                     guard.set_run_context(report)
                 except Exception as exc:  # pragma: no cover - defensive
@@ -583,6 +594,116 @@ class CoreRunner:
         }
         eval_windows = {"preview": {}, "final": {}}
 
+        # Optional: compute primary metric tail evidence vs baseline when provided.
+        try:
+            pm = metrics.get("primary_metric", {}) if isinstance(metrics, dict) else {}
+            pm_kind = str(pm.get("kind", "")).lower() if isinstance(pm, dict) else ""
+            is_ppl_metric = pm_kind.startswith("ppl")
+
+            baseline_eval = {}
+            if (
+                is_ppl_metric
+                and config
+                and isinstance(config.context, dict)
+                and isinstance(config.context.get("baseline_eval_windows"), dict)
+            ):
+                baseline_eval = config.context.get("baseline_eval_windows") or {}
+
+            if is_ppl_metric and baseline_eval:
+                tier_policies = (
+                    report.meta.get("tier_policies", {})
+                    if isinstance(getattr(report, "meta", None), dict)
+                    else {}
+                )
+                metrics_policy = (
+                    tier_policies.get("metrics", {})
+                    if isinstance(tier_policies, dict)
+                    else {}
+                )
+                pm_tail_policy = (
+                    metrics_policy.get("pm_tail", {})
+                    if isinstance(metrics_policy, dict)
+                    else {}
+                )
+
+                run_final = (
+                    eval_windows.get("final", {})
+                    if isinstance(eval_windows, dict)
+                    else {}
+                )
+                base_final = (
+                    baseline_eval.get("final", {})
+                    if isinstance(baseline_eval, dict)
+                    else {}
+                )
+
+                deltas: list[float] = []
+                weights: list[float] = []
+                run_ids = (
+                    run_final.get("window_ids") if isinstance(run_final, dict) else None
+                )
+                run_ll = (
+                    run_final.get("logloss") if isinstance(run_final, dict) else None
+                )
+                run_tc = (
+                    run_final.get("token_counts")
+                    if isinstance(run_final, dict)
+                    else None
+                )
+                base_ids = (
+                    base_final.get("window_ids")
+                    if isinstance(base_final, dict)
+                    else None
+                )
+                base_ll = (
+                    base_final.get("logloss") if isinstance(base_final, dict) else None
+                )
+
+                if (
+                    isinstance(run_ids, list)
+                    and isinstance(run_ll, list)
+                    and isinstance(base_ids, list)
+                    and isinstance(base_ll, list)
+                ):
+                    base_map: dict[int, float] = {}
+                    for b_id, b_val in zip(base_ids, base_ll, strict=False):
+                        if isinstance(b_id, int | float) and isinstance(
+                            b_val, int | float
+                        ):
+                            base_map[int(b_id)] = float(b_val)
+                    for idx, (r_id, r_val) in enumerate(
+                        zip(run_ids, run_ll, strict=False)
+                    ):
+                        if not (
+                            isinstance(r_id, int | float)
+                            and isinstance(r_val, int | float)
+                        ):
+                            continue
+                        key = int(r_id)
+                        if key not in base_map:
+                            continue
+                        dv = float(r_val) - base_map[key]
+                        if math.isfinite(dv):
+                            deltas.append(float(dv))
+                            if isinstance(run_tc, list) and idx < len(run_tc):
+                                try:
+                                    wv = float(run_tc[idx])
+                                except Exception:
+                                    wv = 0.0
+                                weights.append(float(max(wv, 0.0)))
+
+                    tail_result = evaluate_metric_tail(
+                        deltas=deltas,
+                        weights=weights
+                        if (weights and len(weights) == len(deltas))
+                        else None,
+                        policy=pm_tail_policy if isinstance(pm_tail_policy, dict) else None,
+                    )
+                    tail_result["source"] = "paired_baseline.final"
+                    metrics["primary_metric_tail"] = tail_result
+        except Exception:  # pragma: no cover - best effort
+            pass
+
         policy_flags = self._resolve_policy_flags(config)
         eval_error = metrics.get("eval_error") if isinstance(metrics, dict) else None
         if eval_error:
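Note: the block above derives a per-window ΔlogNLL sample by matching run and baseline window IDs on the final eval window, then feeds the paired deltas (optionally token-count weighted) to evaluate_metric_tail together with the tier's pm_tail policy. A standalone sketch of that pairing step with made-up window data, just to show the shape of the inputs (only the fields visible in the diff are assumed: window_ids, logloss, token_counts):

import math

# Hypothetical example windows; in runner.py these come from eval_windows["final"]
# and config.context["baseline_eval_windows"]["final"].
run_final = {"window_ids": [0, 1, 2], "logloss": [2.11, 2.40, 2.05], "token_counts": [512, 512, 384]}
base_final = {"window_ids": [0, 1, 2], "logloss": [2.10, 2.38, 2.07]}

base_map = dict(zip(base_final["window_ids"], base_final["logloss"]))
deltas, weights = [], []
for idx, (w_id, loss) in enumerate(zip(run_final["window_ids"], run_final["logloss"])):
    if w_id not in base_map or not math.isfinite(loss - base_map[w_id]):
        continue
    deltas.append(loss - base_map[w_id])  # per-window ΔlogNLL vs baseline
    weights.append(max(float(run_final["token_counts"][idx]), 0.0))

# evaluate_metric_tail(deltas=deltas, weights=weights, policy=pm_tail_policy)
# would then score the tail of this paired distribution.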
@@ -834,8 +955,10 @@ class CoreRunner:
         pairing_reason = None
         preview_pair_stats = {"matched": 0, "expected": 0}
         final_pair_stats = {"matched": 0, "expected": 0}
+        paired_windows_attempted = 0
         preview_window_ids: list[int] = []
         final_window_ids: list[int] = []
+
         preview_tokens: list[list[int]] = []
         final_tokens: list[list[int]] = []
         preview_limit = min(preview_n, len(preview_data)) if preview_data else 0
@@ -876,6 +999,8 @@ class CoreRunner:
         # even if an exception occurs during the main compute block.
         delta_samples: list[float] = []
         delta_weights: list[float] = []
+        pm_invalid = False
+        degraded_reason: str | None = None
 
         try:
 
@@ -891,7 +1016,7 @@ class CoreRunner:
             max_batches: int,
             start_idx: int,
         ) -> dict[str, Any]:
-            nonlocal alignment_logged
+            nonlocal alignment_logged, eval_error
 
             total_tokens_local = 0
             actual_tokens_local = 0
@@ -927,7 +1052,9 @@ class CoreRunner:
             limit = _resolve_limit(batches, max_batches)
 
             for batch in batches[:limit]:
-                if max_batches > 0 and count >= max_batches:
+                if (
+                    max_batches > 0 and count >= max_batches
+                ):  # pragma: no cover - slicing already caps iteration
                     break
 
                 labels = None
@@ -1100,7 +1227,7 @@ class CoreRunner:
                         "zero_mask_batches": zero_mask_batches,
                         "requested": limit,
                     },
-                )
+                )  # pragma: no cover - requires debug tracing with zero batches
                 if resolved_loss_mode == "mlm":
                     error_msg = (
                         "MLM evaluation produced zero usable batches; "
@@ -1121,7 +1248,10 @@ class CoreRunner:
                         "zero_mask_batches": zero_mask_batches,
                     },
                 )
-                raise ValueError(error_msg)
+                eval_error = {
+                    "error": "mlm_missing_masks",
+                    "detail": error_msg,
+                }
                 return {
                     "ppl": float("nan"),
                     "total_tokens": total_tokens_local,
@@ -1167,8 +1297,42 @@ class CoreRunner:
                 final_data, final_limit, preview_summary["num_batches"]
             )
 
-            preview_log_losses = preview_summary["log_losses"]
-            final_log_losses = final_summary["log_losses"]
+            preview_raw_losses = preview_summary["log_losses"]
+            final_raw_losses = final_summary["log_losses"]
+            try:
+                paired_windows_attempted = min(
+                    len(preview_raw_losses), len(final_raw_losses)
+                )
+            except Exception:
+                paired_windows_attempted = 0
+
+            preview_log_losses = [
+                float(loss) for loss in preview_raw_losses if math.isfinite(loss)
+            ]
+            final_log_losses = [
+                float(loss) for loss in final_raw_losses if math.isfinite(loss)
+            ]
+            if len(preview_log_losses) != len(preview_raw_losses):
+                self._log_event(
+                    "eval",
+                    "non_finite_preview_losses_filtered",
+                    LogLevel.WARNING,
+                    {
+                        "total": len(preview_raw_losses),
+                        "filtered": len(preview_raw_losses) - len(preview_log_losses),
+                    },
+                )
+            if len(final_log_losses) != len(final_raw_losses):
+                self._log_event(
+                    "eval",
+                    "non_finite_final_losses_filtered",
+                    LogLevel.WARNING,
+                    {
+                        "total": len(final_raw_losses),
+                        "filtered": len(final_raw_losses) - len(final_log_losses),
+                    },
+                )
+
             preview_tokens_ct = preview_summary["total_tokens"]
             final_tokens_ct = final_summary["total_tokens"]
             preview_batches_ct = preview_summary["num_batches"]
@@ -1235,14 +1399,29 @@ class CoreRunner:
             delta_mean_log = final_mean_log - preview_mean_log
             pm_ratio = math.exp(delta_mean_log)
 
-            if not (math.isfinite(delta_mean_log) and math.isfinite(pm_ratio)):
-                raise RuntimeError("Invalid perplexity ratio or delta")
-
-            expected_ratio = math.exp(delta_mean_log)
-            if abs(pm_ratio - expected_ratio) > 1e-6:
-                raise RuntimeError(
-                    "Primary-metric ratio mismatch with exp(mean ΔlogNLL)"
+            pm_invalid = False
+            try:
+                if not (math.isfinite(delta_mean_log) and math.isfinite(pm_ratio)):
+                    raise RuntimeError("non_finite_primary_metric")
+
+                expected_ratio = math.exp(delta_mean_log)
+                if abs(pm_ratio - expected_ratio) > 1e-6:
+                    raise RuntimeError("primary_metric_ratio_mismatch")
+            except Exception as exc:
+                pm_invalid = True
+                self._log_event(
+                    "eval",
+                    "primary_metric_invalid",
+                    LogLevel.WARNING,
+                    {
+                        "pm_preview": float(pm_preview),
+                        "pm_final": float(pm_final),
+                        "delta_mean_log": float(delta_mean_log),
+                        "pm_ratio": float(pm_ratio),
+                        "error": str(exc),
+                    },
                 )
+                # Preserve downstream reporting; keep NaNs but mark degraded
 
             if bootstrap_enabled and preview_log_losses:
                 preview_log_ci = compute_logloss_ci(
@@ -1298,7 +1477,20 @@ class CoreRunner:
                     abs(r - e) > 1e-6
                     for r, e in zip(ratio_ci, expected_ratio_ci, strict=False)
                 ):
-                    raise RuntimeError("Ratio CI inconsistent with Δlog CI")
+                    pm_invalid = True
+                    self._log_event(
+                        "eval",
+                        "ratio_ci_inconsistent",
+                        LogLevel.WARNING,
+                        {
+                            "ratio_ci": ratio_ci,
+                            "expected_ratio_ci": expected_ratio_ci,
+                        },
+                    )
+                    ratio_ci = (
+                        float(expected_ratio_ci[0]),
+                        float(expected_ratio_ci[1]),
+                    )
             else:
                 delta_log_ci = (delta_mean_log, delta_mean_log)
                 ratio_ci = (pm_ratio, pm_ratio)
@@ -1335,19 +1527,60 @@ class CoreRunner:
                 degenerate_reason = "no_variation"
 
             if degenerate_delta:
+                pm_invalid = True
                 self._log_event(
                     "eval",
                     "degenerate_delta_samples",
-                    LogLevel.ERROR,
+                    LogLevel.WARNING,
                     {
                         "reason": degenerate_reason,
                         "sample_count": len(delta_samples),
                     },
                 )
-                if profile_label in {"ci", "release"}:
-                    raise RuntimeError(
-                        f"Degenerate paired ΔlogNLL distribution ({degenerate_reason})"
+
+            needs_pm_fallback = (not math.isfinite(pm_preview)) or (
+                not math.isfinite(pm_final)
+            )
+            needs_delta_fallback = (not math.isfinite(delta_mean_log)) or (
+                not math.isfinite(pm_ratio)
+            )
+
+            degraded_reason: str | None = None
+            if needs_pm_fallback:
+                degraded_reason = "non_finite_pm"
+            elif needs_delta_fallback:
+                degraded_reason = "non_finite_delta"
+            elif degenerate_reason:
+                degraded_reason = f"degenerate_delta:{degenerate_reason}"
+            elif pm_invalid:
+                degraded_reason = "primary_metric_invalid"
+
+            if needs_pm_fallback or needs_delta_fallback:
+                pm_invalid = True
+                pm_fallback = (
+                    pm_preview
+                    if math.isfinite(pm_preview) and pm_preview > 0
+                    else pm_final
+                )
+                if not (math.isfinite(pm_fallback) and pm_fallback > 0):
+                    pm_fallback = 1.0
+
+                if needs_pm_fallback:
+                    pm_preview = (
+                        pm_preview
+                        if math.isfinite(pm_preview) and pm_preview > 0
+                        else pm_fallback
+                    )
+                    pm_final = (
+                        pm_final
+                        if math.isfinite(pm_final) and pm_final > 0
+                        else pm_fallback
                     )
+                if needs_delta_fallback:
+                    if not math.isfinite(delta_mean_log):
+                        delta_mean_log = 0.0
+                    if not math.isfinite(pm_ratio):
+                        pm_ratio = 1.0
 
 
         def _hash_tokens(tokens: list[int]) -> bytes:
@@ -1371,10 +1604,14 @@ class CoreRunner:
             if not isinstance(dataset_cfg, dict):
                 return None
             seq_len_val = dataset_cfg.get("seq_len")
-            stride_val = dataset_cfg.get("stride", seq_len_val)
+            if seq_len_val is None:
+                return None
+            stride_raw = dataset_cfg.get("stride", seq_len_val)
+            if stride_raw is None:
+                return None
             try:
                 seq_len_f = float(seq_len_val)
-                stride_f = float(stride_val)
+                stride_f = float(stride_raw)
             except (TypeError, ValueError):
                 return None
             if not math.isfinite(seq_len_f) or seq_len_f <= 0:
@@ -1687,7 +1924,9 @@ class CoreRunner:
         except Exception:
             pass
 
-        paired_windows_count = len(delta_samples)
+        paired_windows_count = (
+            paired_windows_attempted if paired_windows_attempted else len(delta_samples)
+        )
         unweighted_delta_mean = (
            float(np.mean(delta_samples)) if delta_samples else float(delta_mean_log)
         )
@@ -1715,8 +1954,11 @@ class CoreRunner:
         metrics = {
             "primary_metric": {
                 "kind": pm_kind,
-                "preview": float(pm_preview),
-                "final": float(pm_final),
+                "preview": float(pm_preview) if math.isfinite(pm_preview) else None,
+                "final": float(pm_final) if math.isfinite(pm_final) else None,
+                "invalid": bool(pm_invalid),
+                "degraded": bool(pm_invalid or degraded_reason),
+                "degraded_reason": degraded_reason,
             },
             "logloss_preview": float(preview_mean_log),
             "logloss_final": float(final_mean_log),
@@ -2030,17 +2272,27 @@ class CoreRunner:
         except Exception:
             drift_ratio = None
 
+        spike_threshold = getattr(config, "spike_threshold", 2.0)
         if drift_ratio is None:
             is_catastrophic_spike = False
             metrics_acceptable = True
         else:
-            spike_threshold = getattr(config, "spike_threshold", 2.0)
             is_catastrophic_spike = drift_ratio > spike_threshold
             # Check if standard metrics are acceptable against configured max ratio
             metrics_acceptable = drift_ratio <= getattr(config, "max_pm_ratio", 2.0)
 
         # Determine rollback reason and status
         rollback_reason = None
+        tail_failed = False
+        try:
+            pm_tail = metrics.get("primary_metric_tail", {})
+            if isinstance(pm_tail, dict) and pm_tail:
+                mode = str(pm_tail.get("mode", "warn") or "warn").strip().lower()
+                evaluated = bool(pm_tail.get("evaluated", False))
+                passed = bool(pm_tail.get("passed", True))
+                tail_failed = bool(mode == "fail" and evaluated and (not passed))
+        except Exception:  # pragma: no cover
+            tail_failed = False
         if is_catastrophic_spike:
             rollback_reason = (
                 f"catastrophic_ppl_spike (ratio: {drift_ratio:.3f} > {spike_threshold})"
@@ -2057,6 +2309,9 @@ class CoreRunner:
                     "immediate_rollback": True,
                 },
             )
+        elif tail_failed:
+            rollback_reason = "primary_metric_tail_failed"
+            status = RunStatus.ROLLBACK.value
         elif (not all_guards_passed) or (not metrics_acceptable):
            # Match historical/test expectation string exactly
            rollback_reason = "guards_failed or metrics_unacceptable"
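Note: with the primary_metric_tail gate added, the rollback decision checks a catastrophic perplexity spike first, then a failing tail gate (only when the tail policy mode is "fail" and the tail was actually evaluated), and only then the combined guards/metrics condition. A hedged, standalone sketch of that precedence; the pm_tail fields ("mode", "evaluated", "passed") mirror those read in the hunk above, everything else is illustrative:

def rollback_reason_for(drift_ratio, spike_threshold, pm_tail, all_guards_passed, metrics_acceptable):
    # Tail gate only forces a rollback in "fail" mode, after evaluation, and on failure.
    tail_failed = (
        isinstance(pm_tail, dict)
        and str(pm_tail.get("mode", "warn")).lower() == "fail"
        and bool(pm_tail.get("evaluated", False))
        and not bool(pm_tail.get("passed", True))
    )
    if drift_ratio is not None and drift_ratio > spike_threshold:
        return f"catastrophic_ppl_spike (ratio: {drift_ratio:.3f} > {spike_threshold})"
    if tail_failed:
        return "primary_metric_tail_failed"
    if (not all_guards_passed) or (not metrics_acceptable):
        return "guards_failed or metrics_unacceptable"
    return None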
@@ -2185,20 +2440,27 @@
     ) -> dict[str, dict[str, Any]]:
         """Resolve tier-based guard policies from configuration."""
         # Use passed auto_config if available, otherwise extract from report meta
-        if auto_config is None:
-            config_meta = report.meta.get("config", {})
+        auto_cfg: dict[str, Any] | None = auto_config
+        if auto_cfg is None:
+            config_meta = report.meta.get("config") or {}
 
             # Try to get auto config from various possible locations
-            if hasattr(report, "auto_config"):
-                auto_config = report.auto_config
-            elif "auto" in config_meta:
-                auto_config = config_meta["auto"]
-            else:
+            auto_cfg = report.__dict__.get("auto_config")
+            if (
+                auto_cfg is None
+                and isinstance(config_meta, dict)
+                and "auto" in config_meta
+            ):
+                auto_cfg = config_meta["auto"]
+            elif auto_cfg is None:
                 # Fallback to default balanced tier
-                auto_config = {"tier": "balanced", "enabled": True}
+                auto_cfg = {"tier": "balanced", "enabled": True}
+
+        if not isinstance(auto_cfg, dict):
+            auto_cfg = {"tier": "balanced", "enabled": True}
 
         # Extract tier and edit name
-        tier = auto_config.get("tier", "balanced")
+        tier = auto_cfg.get("tier", "balanced")
         edit_name = None
         if hasattr(report, "edit") and report.edit:
             edit_name = report.edit.get("name")
@@ -2208,8 +2470,10 @@ class CoreRunner:
             edit_name = report.meta["edit_name"]
 
         # Get explicit guard overrides from config
-        config_meta = report.meta.get("config", {})
-        explicit_overrides = config_meta.get("guards", {})
+        config_meta = report.meta.get("config") or {}
+        explicit_overrides = (
+            config_meta.get("guards", {}) if isinstance(config_meta, dict) else {}
+        )
 
         try:
             # Resolve tier policies
@@ -2237,18 +2501,18 @@
     def _apply_guard_policy(self, guard: Guard, policy: dict[str, Any]) -> None:
         """Apply resolved policy parameters to a guard instance."""
         try:
+            guard_config = getattr(guard, "config", None)
+            guard_policy = getattr(guard, "policy", None)
+
            # Apply policy parameters to guard
             for param_name, param_value in policy.items():
                 if hasattr(guard, param_name):
                     setattr(guard, param_name, param_value)
-                elif hasattr(guard, "config") and isinstance(guard.config, dict):
-                    # Try to set in guard's config dict
-                    guard.config[param_name] = param_value
-                elif hasattr(guard, "policy") and isinstance(guard.policy, dict):
-                    # Try to set in guard's policy dict
-                    guard.policy[param_name] = param_value
+                elif isinstance(guard_config, dict):
+                    guard_config[param_name] = param_value
+                elif isinstance(guard_policy, dict):
+                    guard_policy[param_name] = param_value
                 else:
-                    # Last resort: add to guard as attribute
                     setattr(guard, param_name, param_value)
 
         except Exception as e:
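Note: the rewritten _apply_guard_policy resolves the guard's config and policy mappings once with getattr, then applies each parameter via direct attribute assignment, the config dict, the policy dict, or setattr as a last resort. A toy illustration of which branch each parameter takes (illustrative only; ToyGuard is not an invarlock class):

class ToyGuard:
    def __init__(self):
        self.threshold = 0.1   # existing attribute -> setattr path
        self.config = {}       # dict -> config-dict path
        self.policy = None     # not a dict -> skipped

guard = ToyGuard()
policy = {"threshold": 0.05, "deadband": 0.02, "mode": "warn"}
for name, value in policy.items():
    if hasattr(guard, name):
        setattr(guard, name, value)
    elif isinstance(getattr(guard, "config", None), dict):
        guard.config[name] = value
    elif isinstance(getattr(guard, "policy", None), dict):
        guard.policy[name] = value
    else:
        setattr(guard, name, value)
# guard.threshold == 0.05; guard.config == {"deadband": 0.02, "mode": "warn"}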
invarlock/eval/bench.py CHANGED
@@ -92,7 +92,6 @@ class BenchmarkConfig:
     epsilon: float | None = (
         None  # RMT deadband tolerance (None = use resolved deadband)
     )
-    strict: bool = False  # If True, sets epsilon = 0
     ppl_overhead_threshold: float = 0.01  # 1%
     guard_overhead_time_threshold: float = 0.15  # 15%
     guard_overhead_mem_threshold: float = 0.10  # 10%
@@ -104,10 +103,6 @@
         """Apply post-initialization logic."""
         self.output_dir = Path(self.output_dir)
 
-        # Handle strict mode
-        if self.strict:
-            self.epsilon = 0.0
-
 
 @dataclass
 class ScenarioResult:
@@ -1043,7 +1038,6 @@
     profile: str = "ci",
     output_dir: str | Path = "benchmarks",
     epsilon: float | None = None,
-    strict: bool = False,
     **kwargs,
 ) -> dict[str, Any]:
     """
@@ -1056,7 +1050,6 @@
         profile: "ci" (50/50 windows) or "release" (100/100 windows)
         output_dir: Directory to save results
         epsilon: Optional epsilon override
-        strict: If True, sets epsilon = 0
         **kwargs: Additional configuration options
 
     Returns:
@@ -1075,7 +1068,6 @@
         profile=profile,
         output_dir=Path(output_dir),
         epsilon=epsilon,
-        strict=strict,
         **kwargs,
     )
 
@@ -1384,7 +1376,6 @@ def _config_to_dict(config: BenchmarkConfig) -> dict[str, Any]:
         "stride": config.stride,
         "seed": config.seed,
         "epsilon": config.epsilon,
-        "strict": config.strict,
         "ppl_overhead_threshold": config.ppl_overhead_threshold,
         "guard_overhead_time_threshold": config.guard_overhead_time_threshold,
         "guard_overhead_mem_threshold": config.guard_overhead_mem_threshold,
@@ -1426,9 +1417,6 @@
         type=float,
         help="RMT outliers epsilon threshold (default: use resolved RMT deadband)",
     )
-    parser.add_argument(
-        "--strict", action="store_true", help="Set epsilon=0 (overrides --epsilon)"
-    )
 
     # Model and dataset configuration
     parser.add_argument(
@@ -1505,7 +1493,6 @@
         profile=args.profile,
         output_dir=args.out,
         epsilon=args.epsilon,
-        strict=args.strict,
         **kwargs,
     )
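Note: the strict flag (which forced epsilon = 0) is removed from BenchmarkConfig, run_guard_effect_benchmark, _config_to_dict, and the --strict CLI option. Callers that relied on strict mode would now pass an explicit epsilon instead. A hedged sketch, assuming the remaining required arguments to run_guard_effect_benchmark (not shown in this diff) are supplied unchanged:

from invarlock.eval.bench import run_guard_effect_benchmark

# Equivalent of the old strict=True: an explicit zero deadband tolerance.
results = run_guard_effect_benchmark(
    profile="ci",
    output_dir="benchmarks",
    epsilon=0.0,  # previously: strict=True
)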