invarlock-0.3.5-py3-none-any.whl → invarlock-0.3.6-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- invarlock/__init__.py +1 -1
- invarlock/_data/runtime/tiers.yaml +57 -30
- invarlock/adapters/__init__.py +1 -1
- invarlock/calibration/spectral_null.py +15 -10
- invarlock/calibration/variance_ve.py +0 -2
- invarlock/cli/commands/calibrate.py +6 -2
- invarlock/cli/commands/certify.py +58 -39
- invarlock/cli/commands/doctor.py +3 -1
- invarlock/cli/commands/explain_gates.py +57 -8
- invarlock/cli/commands/report.py +1 -1
- invarlock/cli/commands/run.py +159 -61
- invarlock/cli/commands/verify.py +78 -4
- invarlock/cli/config.py +21 -5
- invarlock/core/api.py +45 -5
- invarlock/core/auto_tuning.py +65 -20
- invarlock/core/contracts.py +7 -1
- invarlock/core/registry.py +2 -2
- invarlock/core/runner.py +314 -50
- invarlock/eval/bench.py +0 -13
- invarlock/eval/data.py +14 -28
- invarlock/eval/metrics.py +4 -1
- invarlock/eval/primary_metric.py +23 -0
- invarlock/eval/tail_stats.py +230 -0
- invarlock/guards/_estimators.py +154 -0
- invarlock/guards/policies.py +16 -6
- invarlock/guards/rmt.py +625 -544
- invarlock/guards/spectral.py +348 -110
- invarlock/guards/tier_config.py +32 -30
- invarlock/guards/variance.py +5 -29
- invarlock/guards_ref/rmt_ref.py +23 -23
- invarlock/model_profile.py +42 -15
- invarlock/reporting/certificate.py +225 -46
- invarlock/reporting/certificate_schema.py +2 -1
- invarlock/reporting/dataset_hashing.py +15 -2
- invarlock/reporting/guards_analysis.py +197 -274
- invarlock/reporting/normalizer.py +6 -0
- invarlock/reporting/policy_utils.py +38 -36
- invarlock/reporting/primary_metric_utils.py +71 -17
- invarlock/reporting/render.py +61 -0
- invarlock/reporting/report.py +1 -1
- invarlock/reporting/report_types.py +5 -2
- invarlock/reporting/validate.py +1 -18
- {invarlock-0.3.5.dist-info → invarlock-0.3.6.dist-info}/METADATA +6 -6
- {invarlock-0.3.5.dist-info → invarlock-0.3.6.dist-info}/RECORD +48 -46
- {invarlock-0.3.5.dist-info → invarlock-0.3.6.dist-info}/WHEEL +0 -0
- {invarlock-0.3.5.dist-info → invarlock-0.3.6.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.5.dist-info → invarlock-0.3.6.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.5.dist-info → invarlock-0.3.6.dist-info}/top_level.txt +0 -0
invarlock/core/runner.py
CHANGED
@@ -18,7 +18,18 @@ from typing import Any
 
 import numpy as np
 
-from .
+from invarlock.eval.tail_stats import evaluate_metric_tail
+
+from .api import (
+    EditLike,
+    Guard,
+    GuardWithContext,
+    GuardWithPrepare,
+    ModelAdapter,
+    ModelEdit,
+    RunConfig,
+    RunReport,
+)
 from .auto_tuning import resolve_tier_policies
 from .bootstrap import (
     compute_logloss_ci,
@@ -112,7 +123,7 @@ class CoreRunner:
         self,
         model: Any,
         adapter: ModelAdapter,
-        edit: ModelEdit,
+        edit: ModelEdit | EditLike,
         guards: list[Guard],
         config: RunConfig,
         calibration_data: Any = None,
@@ -175,7 +186,7 @@ class CoreRunner:
             config.context["auto"] = dict(auto_config)
             try:
                 report.context["auto"] = config.context["auto"]
-            except Exception:
+            except Exception:  # pragma: no cover - defensive context propagation
                 pass
 
         report.status = RunStatus.RUNNING.value
@@ -303,10 +314,10 @@ class CoreRunner:
         self,
         model: Any,
         adapter: ModelAdapter,
-        edit: ModelEdit,
+        edit: ModelEdit | EditLike,
         model_desc: dict[str, Any],
         report: RunReport,
-        edit_config: dict[str, Any] | None
+        edit_config: dict[str, Any] | None,
     ) -> dict[str, Any]:
         """Phase 2: Apply edit operation."""
         edit_label = "baseline" if edit.name == "baseline" else edit.name
@@ -388,7 +399,7 @@ class CoreRunner:
                 {"guard": guard.name, "policy": guard_policy},
             )
 
-            if
+            if isinstance(guard, GuardWithContext):
                 try:
                     guard.set_run_context(report)
                 except Exception as exc:
@@ -400,7 +411,7 @@ class CoreRunner:
             )
 
             # Call prepare method if it exists (most guards need this)
-            if
+            if isinstance(guard, GuardWithPrepare):
                 prepare_result = guard.prepare(
                     model, adapter, calibration_data, guard_policy
                 )
@@ -454,7 +465,7 @@ class CoreRunner:
         for guard in guards:
             self._log_event("guard", "start", LogLevel.INFO, {"guard": guard.name})
 
-            if
+            if isinstance(guard, GuardWithContext):
                 try:
                     guard.set_run_context(report)
                 except Exception as exc:  # pragma: no cover - defensive
@@ -583,6 +594,116 @@ class CoreRunner:
         }
         eval_windows = {"preview": {}, "final": {}}
 
+        # Optional: compute primary metric tail evidence vs baseline when provided.
+        try:
+            pm = metrics.get("primary_metric", {}) if isinstance(metrics, dict) else {}
+            pm_kind = str(pm.get("kind", "")).lower() if isinstance(pm, dict) else ""
+            is_ppl_metric = pm_kind.startswith("ppl")
+
+            baseline_eval = {}
+            if (
+                is_ppl_metric
+                and config
+                and isinstance(config.context, dict)
+                and isinstance(config.context.get("baseline_eval_windows"), dict)
+            ):
+                baseline_eval = config.context.get("baseline_eval_windows") or {}
+
+            if is_ppl_metric and baseline_eval:
+                tier_policies = (
+                    report.meta.get("tier_policies", {})
+                    if isinstance(getattr(report, "meta", None), dict)
+                    else {}
+                )
+                metrics_policy = (
+                    tier_policies.get("metrics", {})
+                    if isinstance(tier_policies, dict)
+                    else {}
+                )
+                pm_tail_policy = (
+                    metrics_policy.get("pm_tail", {})
+                    if isinstance(metrics_policy, dict)
+                    else {}
+                )
+
+                run_final = (
+                    eval_windows.get("final", {})
+                    if isinstance(eval_windows, dict)
+                    else {}
+                )
+                base_final = (
+                    baseline_eval.get("final", {})
+                    if isinstance(baseline_eval, dict)
+                    else {}
+                )
+
+                deltas: list[float] = []
+                weights: list[float] = []
+                run_ids = (
+                    run_final.get("window_ids") if isinstance(run_final, dict) else None
+                )
+                run_ll = (
+                    run_final.get("logloss") if isinstance(run_final, dict) else None
+                )
+                run_tc = (
+                    run_final.get("token_counts")
+                    if isinstance(run_final, dict)
+                    else None
+                )
+                base_ids = (
+                    base_final.get("window_ids")
+                    if isinstance(base_final, dict)
+                    else None
+                )
+                base_ll = (
+                    base_final.get("logloss") if isinstance(base_final, dict) else None
+                )
+
+                if (
+                    isinstance(run_ids, list)
+                    and isinstance(run_ll, list)
+                    and isinstance(base_ids, list)
+                    and isinstance(base_ll, list)
+                ):
+                    base_map: dict[int, float] = {}
+                    for b_id, b_val in zip(base_ids, base_ll, strict=False):
+                        if isinstance(b_id, int | float) and isinstance(
+                            b_val, int | float
+                        ):
+                            base_map[int(b_id)] = float(b_val)
+                    for idx, (r_id, r_val) in enumerate(
+                        zip(run_ids, run_ll, strict=False)
+                    ):
+                        if not (
+                            isinstance(r_id, int | float)
+                            and isinstance(r_val, int | float)
+                        ):
+                            continue
+                        key = int(r_id)
+                        if key not in base_map:
+                            continue
+                        dv = float(r_val) - base_map[key]
+                        if math.isfinite(dv):
+                            deltas.append(float(dv))
+                            if isinstance(run_tc, list) and idx < len(run_tc):
+                                try:
+                                    wv = float(run_tc[idx])
+                                except Exception:
+                                    wv = 0.0
+                                weights.append(float(max(wv, 0.0)))
+
+                tail_result = evaluate_metric_tail(
+                    deltas=deltas,
+                    weights=weights
+                    if (weights and len(weights) == len(deltas))
+                    else None,
+                    policy=pm_tail_policy if isinstance(pm_tail_policy, dict) else None,
+                )
+                tail_result["source"] = "paired_baseline.final"
+                metrics["primary_metric_tail"] = tail_result
+        except Exception:  # pragma: no cover - best effort
+            pass
+
         policy_flags = self._resolve_policy_flags(config)
         eval_error = metrics.get("eval_error") if isinstance(metrics, dict) else None
         if eval_error:
@@ -834,8 +955,10 @@ class CoreRunner:
         pairing_reason = None
         preview_pair_stats = {"matched": 0, "expected": 0}
         final_pair_stats = {"matched": 0, "expected": 0}
+        paired_windows_attempted = 0
         preview_window_ids: list[int] = []
         final_window_ids: list[int] = []
+
         preview_tokens: list[list[int]] = []
         final_tokens: list[list[int]] = []
         preview_limit = min(preview_n, len(preview_data)) if preview_data else 0
@@ -876,6 +999,8 @@ class CoreRunner:
         # even if an exception occurs during the main compute block.
         delta_samples: list[float] = []
         delta_weights: list[float] = []
+        pm_invalid = False
+        degraded_reason: str | None = None
 
         try:
 
@@ -891,7 +1016,7 @@ class CoreRunner:
             max_batches: int,
             start_idx: int,
         ) -> dict[str, Any]:
-            nonlocal alignment_logged
+            nonlocal alignment_logged, eval_error
 
             total_tokens_local = 0
             actual_tokens_local = 0
@@ -927,7 +1052,9 @@ class CoreRunner:
             limit = _resolve_limit(batches, max_batches)
 
             for batch in batches[:limit]:
-                if
+                if (
+                    max_batches > 0 and count >= max_batches
+                ):  # pragma: no cover - slicing already caps iteration
                     break
 
                 labels = None
@@ -1100,7 +1227,7 @@ class CoreRunner:
                         "zero_mask_batches": zero_mask_batches,
                         "requested": limit,
                     },
-                )
+                )  # pragma: no cover - requires debug tracing with zero batches
                 if resolved_loss_mode == "mlm":
                     error_msg = (
                         "MLM evaluation produced zero usable batches; "
@@ -1121,7 +1248,10 @@ class CoreRunner:
                             "zero_mask_batches": zero_mask_batches,
                         },
                     )
-
+                    eval_error = {
+                        "error": "mlm_missing_masks",
+                        "detail": error_msg,
+                    }
                     return {
                         "ppl": float("nan"),
                         "total_tokens": total_tokens_local,
@@ -1167,8 +1297,42 @@ class CoreRunner:
                 final_data, final_limit, preview_summary["num_batches"]
             )
 
-
-
+            preview_raw_losses = preview_summary["log_losses"]
+            final_raw_losses = final_summary["log_losses"]
+            try:
+                paired_windows_attempted = min(
+                    len(preview_raw_losses), len(final_raw_losses)
+                )
+            except Exception:
+                paired_windows_attempted = 0
+
+            preview_log_losses = [
+                float(loss) for loss in preview_raw_losses if math.isfinite(loss)
+            ]
+            final_log_losses = [
+                float(loss) for loss in final_raw_losses if math.isfinite(loss)
+            ]
+            if len(preview_log_losses) != len(preview_raw_losses):
+                self._log_event(
+                    "eval",
+                    "non_finite_preview_losses_filtered",
+                    LogLevel.WARNING,
+                    {
+                        "total": len(preview_raw_losses),
+                        "filtered": len(preview_raw_losses) - len(preview_log_losses),
+                    },
+                )
+            if len(final_log_losses) != len(final_raw_losses):
+                self._log_event(
+                    "eval",
+                    "non_finite_final_losses_filtered",
+                    LogLevel.WARNING,
+                    {
+                        "total": len(final_raw_losses),
+                        "filtered": len(final_raw_losses) - len(final_log_losses),
+                    },
+                )
+
             preview_tokens_ct = preview_summary["total_tokens"]
             final_tokens_ct = final_summary["total_tokens"]
             preview_batches_ct = preview_summary["num_batches"]
@@ -1235,14 +1399,29 @@ class CoreRunner:
             delta_mean_log = final_mean_log - preview_mean_log
             pm_ratio = math.exp(delta_mean_log)
 
-
-
-
-
-
-
-
+            pm_invalid = False
+            try:
+                if not (math.isfinite(delta_mean_log) and math.isfinite(pm_ratio)):
+                    raise RuntimeError("non_finite_primary_metric")
+
+                expected_ratio = math.exp(delta_mean_log)
+                if abs(pm_ratio - expected_ratio) > 1e-6:
+                    raise RuntimeError("primary_metric_ratio_mismatch")
+            except Exception as exc:
+                pm_invalid = True
+                self._log_event(
+                    "eval",
+                    "primary_metric_invalid",
+                    LogLevel.WARNING,
+                    {
+                        "pm_preview": float(pm_preview),
+                        "pm_final": float(pm_final),
+                        "delta_mean_log": float(delta_mean_log),
+                        "pm_ratio": float(pm_ratio),
+                        "error": str(exc),
+                    },
                 )
+                # Preserve downstream reporting; keep NaNs but mark degraded
 
             if bootstrap_enabled and preview_log_losses:
                 preview_log_ci = compute_logloss_ci(
@@ -1298,7 +1477,20 @@ class CoreRunner:
                     abs(r - e) > 1e-6
                     for r, e in zip(ratio_ci, expected_ratio_ci, strict=False)
                 ):
-
+                    pm_invalid = True
+                    self._log_event(
+                        "eval",
+                        "ratio_ci_inconsistent",
+                        LogLevel.WARNING,
+                        {
+                            "ratio_ci": ratio_ci,
+                            "expected_ratio_ci": expected_ratio_ci,
+                        },
+                    )
+                    ratio_ci = (
+                        float(expected_ratio_ci[0]),
+                        float(expected_ratio_ci[1]),
+                    )
             else:
                 delta_log_ci = (delta_mean_log, delta_mean_log)
                 ratio_ci = (pm_ratio, pm_ratio)
@@ -1335,19 +1527,60 @@ class CoreRunner:
                 degenerate_reason = "no_variation"
 
             if degenerate_delta:
+                pm_invalid = True
                 self._log_event(
                     "eval",
                     "degenerate_delta_samples",
-                    LogLevel.
+                    LogLevel.WARNING,
                     {
                         "reason": degenerate_reason,
                         "sample_count": len(delta_samples),
                     },
                 )
-
-
-
+
+            needs_pm_fallback = (not math.isfinite(pm_preview)) or (
+                not math.isfinite(pm_final)
+            )
+            needs_delta_fallback = (not math.isfinite(delta_mean_log)) or (
+                not math.isfinite(pm_ratio)
+            )
+
+            degraded_reason: str | None = None
+            if needs_pm_fallback:
+                degraded_reason = "non_finite_pm"
+            elif needs_delta_fallback:
+                degraded_reason = "non_finite_delta"
+            elif degenerate_reason:
+                degraded_reason = f"degenerate_delta:{degenerate_reason}"
+            elif pm_invalid:
+                degraded_reason = "primary_metric_invalid"
+
+            if needs_pm_fallback or needs_delta_fallback:
+                pm_invalid = True
+                pm_fallback = (
+                    pm_preview
+                    if math.isfinite(pm_preview) and pm_preview > 0
+                    else pm_final
+                )
+                if not (math.isfinite(pm_fallback) and pm_fallback > 0):
+                    pm_fallback = 1.0
+
+                if needs_pm_fallback:
+                    pm_preview = (
+                        pm_preview
+                        if math.isfinite(pm_preview) and pm_preview > 0
+                        else pm_fallback
+                    )
+                    pm_final = (
+                        pm_final
+                        if math.isfinite(pm_final) and pm_final > 0
+                        else pm_fallback
                    )
+                if needs_delta_fallback:
+                    if not math.isfinite(delta_mean_log):
+                        delta_mean_log = 0.0
+                    if not math.isfinite(pm_ratio):
+                        pm_ratio = 1.0
 
            def _hash_tokens(tokens: list[int]) -> bytes:
                if not tokens:
@@ -1371,10 +1604,14 @@ class CoreRunner:
         if not isinstance(dataset_cfg, dict):
             return None
         seq_len_val = dataset_cfg.get("seq_len")
-
+        if seq_len_val is None:
+            return None
+        stride_raw = dataset_cfg.get("stride", seq_len_val)
+        if stride_raw is None:
+            return None
         try:
             seq_len_f = float(seq_len_val)
-            stride_f = float(
+            stride_f = float(stride_raw)
         except (TypeError, ValueError):
             return None
         if not math.isfinite(seq_len_f) or seq_len_f <= 0:
@@ -1687,7 +1924,9 @@ class CoreRunner:
         except Exception:
             pass
 
-        paired_windows_count =
+        paired_windows_count = (
+            paired_windows_attempted if paired_windows_attempted else len(delta_samples)
+        )
         unweighted_delta_mean = (
             float(np.mean(delta_samples)) if delta_samples else float(delta_mean_log)
         )
@@ -1715,8 +1954,11 @@ class CoreRunner:
         metrics = {
             "primary_metric": {
                 "kind": pm_kind,
-                "preview": float(pm_preview),
-                "final": float(pm_final),
+                "preview": float(pm_preview) if math.isfinite(pm_preview) else None,
+                "final": float(pm_final) if math.isfinite(pm_final) else None,
+                "invalid": bool(pm_invalid),
+                "degraded": bool(pm_invalid or degraded_reason),
+                "degraded_reason": degraded_reason,
             },
             "logloss_preview": float(preview_mean_log),
             "logloss_final": float(final_mean_log),
@@ -2030,17 +2272,27 @@ class CoreRunner:
         except Exception:
             drift_ratio = None
 
+        spike_threshold = getattr(config, "spike_threshold", 2.0)
         if drift_ratio is None:
             is_catastrophic_spike = False
             metrics_acceptable = True
         else:
-            spike_threshold = getattr(config, "spike_threshold", 2.0)
             is_catastrophic_spike = drift_ratio > spike_threshold
             # Check if standard metrics are acceptable against configured max ratio
             metrics_acceptable = drift_ratio <= getattr(config, "max_pm_ratio", 2.0)
 
         # Determine rollback reason and status
         rollback_reason = None
+        tail_failed = False
+        try:
+            pm_tail = metrics.get("primary_metric_tail", {})
+            if isinstance(pm_tail, dict) and pm_tail:
+                mode = str(pm_tail.get("mode", "warn") or "warn").strip().lower()
+                evaluated = bool(pm_tail.get("evaluated", False))
+                passed = bool(pm_tail.get("passed", True))
+                tail_failed = bool(mode == "fail" and evaluated and (not passed))
+        except Exception:  # pragma: no cover
+            tail_failed = False
         if is_catastrophic_spike:
             rollback_reason = (
                 f"catastrophic_ppl_spike (ratio: {drift_ratio:.3f} > {spike_threshold})"
@@ -2057,6 +2309,9 @@ class CoreRunner:
                     "immediate_rollback": True,
                 },
            )
+        elif tail_failed:
+            rollback_reason = "primary_metric_tail_failed"
+            status = RunStatus.ROLLBACK.value
         elif (not all_guards_passed) or (not metrics_acceptable):
             # Match historical/test expectation string exactly
             rollback_reason = "guards_failed or metrics_unacceptable"
@@ -2185,20 +2440,27 @@ class CoreRunner:
     ) -> dict[str, dict[str, Any]]:
         """Resolve tier-based guard policies from configuration."""
         # Use passed auto_config if available, otherwise extract from report meta
-
-
+        auto_cfg: dict[str, Any] | None = auto_config
+        if auto_cfg is None:
+            config_meta = report.meta.get("config") or {}
 
             # Try to get auto config from various possible locations
-
-
-
-
-
+            auto_cfg = report.__dict__.get("auto_config")
+            if (
+                auto_cfg is None
+                and isinstance(config_meta, dict)
+                and "auto" in config_meta
+            ):
+                auto_cfg = config_meta["auto"]
+            elif auto_cfg is None:
                 # Fallback to default balanced tier
-
+                auto_cfg = {"tier": "balanced", "enabled": True}
+
+        if not isinstance(auto_cfg, dict):
+            auto_cfg = {"tier": "balanced", "enabled": True}
 
         # Extract tier and edit name
-        tier =
+        tier = auto_cfg.get("tier", "balanced")
         edit_name = None
         if hasattr(report, "edit") and report.edit:
             edit_name = report.edit.get("name")
@@ -2208,8 +2470,10 @@ class CoreRunner:
             edit_name = report.meta["edit_name"]
 
         # Get explicit guard overrides from config
-        config_meta = report.meta.get("config"
-        explicit_overrides =
+        config_meta = report.meta.get("config") or {}
+        explicit_overrides = (
+            config_meta.get("guards", {}) if isinstance(config_meta, dict) else {}
+        )
 
         try:
             # Resolve tier policies
@@ -2237,18 +2501,18 @@ class CoreRunner:
     def _apply_guard_policy(self, guard: Guard, policy: dict[str, Any]) -> None:
         """Apply resolved policy parameters to a guard instance."""
         try:
+            guard_config = getattr(guard, "config", None)
+            guard_policy = getattr(guard, "policy", None)
+
             # Apply policy parameters to guard
             for param_name, param_value in policy.items():
                 if hasattr(guard, param_name):
                     setattr(guard, param_name, param_value)
-                elif
-
-
-
-                    # Try to set in guard's policy dict
-                    guard.policy[param_name] = param_value
+                elif isinstance(guard_config, dict):
+                    guard_config[param_name] = param_value
+                elif isinstance(guard_policy, dict):
+                    guard_policy[param_name] = param_value
                 else:
-                    # Last resort: add to guard as attribute
                     setattr(guard, param_name, param_value)
 
         except Exception as e:
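For orientation, the new rollback path in runner.py keys off a `primary_metric_tail` entry produced by the new `invarlock.eval.tail_stats.evaluate_metric_tail` helper. The sketch below is not package code; it only restates the gating visible in the hunks above, and assumes the tail result is a dict carrying `mode`, `evaluated`, and `passed`, which are the fields the runner reads.

# Hypothetical sketch of the tail gating added above; not part of the wheel.
def tail_forces_rollback(tail_result: dict | None) -> bool:
    # Mirror the runner: only an evaluated, failing result under mode="fail"
    # yields rollback_reason = "primary_metric_tail_failed"; mode="warn" never does.
    if not isinstance(tail_result, dict) or not tail_result:
        return False
    mode = str(tail_result.get("mode", "warn") or "warn").strip().lower()
    evaluated = bool(tail_result.get("evaluated", False))
    passed = bool(tail_result.get("passed", True))
    return mode == "fail" and evaluated and not passed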
invarlock/eval/bench.py
CHANGED
@@ -92,7 +92,6 @@ class BenchmarkConfig:
     epsilon: float | None = (
         None  # RMT deadband tolerance (None = use resolved deadband)
     )
-    strict: bool = False  # If True, sets epsilon = 0
     ppl_overhead_threshold: float = 0.01  # 1%
     guard_overhead_time_threshold: float = 0.15  # 15%
     guard_overhead_mem_threshold: float = 0.10  # 10%
@@ -104,10 +103,6 @@ class BenchmarkConfig:
         """Apply post-initialization logic."""
         self.output_dir = Path(self.output_dir)
 
-        # Handle strict mode
-        if self.strict:
-            self.epsilon = 0.0
-
 
 @dataclass
 class ScenarioResult:
@@ -1043,7 +1038,6 @@ def run_guard_effect_benchmark(
     profile: str = "ci",
     output_dir: str | Path = "benchmarks",
     epsilon: float | None = None,
-    strict: bool = False,
     **kwargs,
 ) -> dict[str, Any]:
     """
@@ -1056,7 +1050,6 @@ def run_guard_effect_benchmark(
         profile: "ci" (50/50 windows) or "release" (100/100 windows)
         output_dir: Directory to save results
         epsilon: Optional epsilon override
-        strict: If True, sets epsilon = 0
         **kwargs: Additional configuration options
 
     Returns:
@@ -1075,7 +1068,6 @@ def run_guard_effect_benchmark(
         profile=profile,
         output_dir=Path(output_dir),
         epsilon=epsilon,
-        strict=strict,
         **kwargs,
     )
 
@@ -1384,7 +1376,6 @@ def _config_to_dict(config: BenchmarkConfig) -> dict[str, Any]:
         "stride": config.stride,
         "seed": config.seed,
         "epsilon": config.epsilon,
-        "strict": config.strict,
         "ppl_overhead_threshold": config.ppl_overhead_threshold,
         "guard_overhead_time_threshold": config.guard_overhead_time_threshold,
         "guard_overhead_mem_threshold": config.guard_overhead_mem_threshold,
@@ -1426,9 +1417,6 @@ def main():
         type=float,
         help="RMT outliers epsilon threshold (default: use resolved RMT deadband)",
     )
-    parser.add_argument(
-        "--strict", action="store_true", help="Set epsilon=0 (overrides --epsilon)"
-    )
 
     # Model and dataset configuration
     parser.add_argument(
@@ -1505,7 +1493,6 @@ def main():
         profile=args.profile,
         output_dir=args.out,
         epsilon=args.epsilon,
-        strict=args.strict,
         **kwargs,
     )
 
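The bench.py changes drop the `strict` switch everywhere it appeared (dataclass field, function argument, config dump, and the `--strict` CLI flag); since `strict=True` only forced `epsilon` to 0.0, the 0.3.6 equivalent is to pass the deadband explicitly. A hypothetical migration sketch, assuming only the `run_guard_effect_benchmark(profile=..., output_dir=..., epsilon=..., **kwargs)` signature shown in the diff:

from invarlock.eval.bench import run_guard_effect_benchmark

# 0.3.5 (removed): strict=True forced epsilon = 0.0 in BenchmarkConfig.__post_init__
# result = run_guard_effect_benchmark(profile="ci", strict=True)

# 0.3.6: request a zero RMT deadband explicitly
result = run_guard_effect_benchmark(profile="ci", epsilon=0.0)

On the command line, the removed `--strict` flag maps to passing `--epsilon 0` instead.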