invarlock-0.3.5-py3-none-any.whl → invarlock-0.3.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. invarlock/__init__.py +2 -2
  2. invarlock/_data/runtime/tiers.yaml +57 -30
  3. invarlock/adapters/__init__.py +11 -15
  4. invarlock/adapters/auto.py +35 -40
  5. invarlock/adapters/capabilities.py +2 -2
  6. invarlock/adapters/hf_causal.py +418 -0
  7. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  8. invarlock/adapters/hf_mixin.py +25 -4
  9. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  10. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  11. invarlock/calibration/spectral_null.py +15 -10
  12. invarlock/calibration/variance_ve.py +0 -2
  13. invarlock/cli/adapter_auto.py +31 -21
  14. invarlock/cli/app.py +73 -2
  15. invarlock/cli/commands/calibrate.py +6 -2
  16. invarlock/cli/commands/certify.py +651 -91
  17. invarlock/cli/commands/doctor.py +11 -11
  18. invarlock/cli/commands/explain_gates.py +57 -8
  19. invarlock/cli/commands/plugins.py +13 -9
  20. invarlock/cli/commands/report.py +233 -69
  21. invarlock/cli/commands/run.py +1066 -244
  22. invarlock/cli/commands/verify.py +154 -15
  23. invarlock/cli/config.py +22 -6
  24. invarlock/cli/doctor_helpers.py +4 -5
  25. invarlock/cli/output.py +193 -0
  26. invarlock/cli/provenance.py +1 -1
  27. invarlock/core/api.py +45 -5
  28. invarlock/core/auto_tuning.py +65 -20
  29. invarlock/core/bootstrap.py +1 -1
  30. invarlock/core/contracts.py +7 -1
  31. invarlock/core/registry.py +11 -13
  32. invarlock/core/runner.py +425 -75
  33. invarlock/edits/quant_rtn.py +65 -37
  34. invarlock/eval/bench.py +3 -16
  35. invarlock/eval/data.py +82 -51
  36. invarlock/eval/metrics.py +63 -2
  37. invarlock/eval/primary_metric.py +23 -0
  38. invarlock/eval/tail_stats.py +230 -0
  39. invarlock/eval/tasks/__init__.py +12 -0
  40. invarlock/eval/tasks/classification.py +48 -0
  41. invarlock/eval/tasks/qa.py +36 -0
  42. invarlock/eval/tasks/text_generation.py +102 -0
  43. invarlock/guards/_estimators.py +154 -0
  44. invarlock/guards/invariants.py +19 -10
  45. invarlock/guards/policies.py +16 -6
  46. invarlock/guards/rmt.py +627 -546
  47. invarlock/guards/spectral.py +348 -110
  48. invarlock/guards/tier_config.py +32 -30
  49. invarlock/guards/variance.py +7 -31
  50. invarlock/guards_ref/rmt_ref.py +23 -23
  51. invarlock/model_profile.py +90 -42
  52. invarlock/observability/health.py +6 -6
  53. invarlock/observability/metrics.py +108 -0
  54. invarlock/reporting/certificate.py +384 -55
  55. invarlock/reporting/certificate_schema.py +3 -2
  56. invarlock/reporting/dataset_hashing.py +15 -2
  57. invarlock/reporting/guards_analysis.py +350 -277
  58. invarlock/reporting/html.py +55 -5
  59. invarlock/reporting/normalizer.py +13 -0
  60. invarlock/reporting/policy_utils.py +38 -36
  61. invarlock/reporting/primary_metric_utils.py +71 -17
  62. invarlock/reporting/render.py +852 -431
  63. invarlock/reporting/report.py +40 -4
  64. invarlock/reporting/report_types.py +11 -3
  65. invarlock/reporting/telemetry.py +86 -0
  66. invarlock/reporting/validate.py +1 -18
  67. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/METADATA +27 -13
  68. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/RECORD +72 -65
  69. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
  70. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
  71. invarlock/adapters/hf_gpt2.py +0 -404
  72. invarlock/adapters/hf_llama.py +0 -487
  73. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
  74. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
invarlock/core/runner.py CHANGED
@@ -18,7 +18,23 @@ from typing import Any
 
 import numpy as np
 
-from .api import Guard, ModelAdapter, ModelEdit, RunConfig, RunReport
+from invarlock.eval.tail_stats import evaluate_metric_tail
+from invarlock.observability.metrics import (
+    capture_memory_snapshot,
+    reset_peak_memory_stats,
+    summarize_memory_snapshots,
+)
+
+from .api import (
+    EditLike,
+    Guard,
+    GuardWithContext,
+    GuardWithPrepare,
+    ModelAdapter,
+    ModelEdit,
+    RunConfig,
+    RunReport,
+)
 from .auto_tuning import resolve_tier_policies
 from .bootstrap import (
     compute_logloss_ci,
@@ -112,7 +128,7 @@ class CoreRunner:
         self,
         model: Any,
         adapter: ModelAdapter,
-        edit: ModelEdit,
+        edit: ModelEdit | EditLike,
         guards: list[Guard],
         config: RunConfig,
         calibration_data: Any = None,
@@ -175,10 +191,22 @@ class CoreRunner:
         config.context["auto"] = dict(auto_config)
         try:
             report.context["auto"] = config.context["auto"]
-        except Exception:
+        except Exception:  # pragma: no cover - defensive context propagation
             pass
 
         report.status = RunStatus.RUNNING.value
+        timings: dict[str, float] = {}
+        guard_timings: dict[str, float] = {}
+        memory_snapshots: list[dict[str, Any]] = []
+        total_start = time.perf_counter()
+
+        def _record_timing(key: str, start: float) -> None:
+            timings[key] = max(0.0, float(time.perf_counter() - start))
+
+        def _capture_memory(phase: str) -> None:
+            snapshot = capture_memory_snapshot(phase)
+            if snapshot:
+                memory_snapshots.append(snapshot)
 
         try:
             # Log start
@@ -194,40 +222,78 @@ class CoreRunner:
             )
 
             # Phase 1: Prepare (describe model, create checkpoint)
-            model_desc = self._prepare_phase(model, adapter, report)
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                model_desc = self._prepare_phase(model, adapter, report)
+            finally:
+                _record_timing("prepare", phase_start)
+                _capture_memory("prepare")
 
             # Phase 2: Prepare guards (must happen before edit)
-            self._prepare_guards_phase(
-                model,
-                adapter,
-                guards,
-                calibration_data,
-                report,
-                auto_config,
-                config,
-            )
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                self._prepare_guards_phase(
+                    model,
+                    adapter,
+                    guards,
+                    calibration_data,
+                    report,
+                    auto_config,
+                    config,
+                )
+            finally:
+                _record_timing("prepare_guards", phase_start)
+                _capture_memory("prepare_guards")
 
             # Phase 3: Apply edit
-            self._edit_phase(model, adapter, edit, model_desc, report, edit_config)
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                self._edit_phase(model, adapter, edit, model_desc, report, edit_config)
+            finally:
+                _record_timing("edit", phase_start)
+                _capture_memory("edit")
 
             # Phase 4: Run guards
-            guard_results = self._guard_phase(model, adapter, guards, report)
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                guard_results = self._guard_phase(
+                    model, adapter, guards, report, guard_timings=guard_timings
+                )
+            finally:
+                _record_timing("guards", phase_start)
+                _capture_memory("guards")
 
             # Phase 5: Evaluate final metrics
-            metrics = self._eval_phase(
-                model,
-                adapter,
-                calibration_data,
-                report,
-                preview_n,
-                final_n,
-                config,
-            )
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                metrics = self._eval_phase(
+                    model,
+                    adapter,
+                    calibration_data,
+                    report,
+                    preview_n,
+                    final_n,
+                    config,
+                )
+            finally:
+                _record_timing("eval", phase_start)
+                _capture_memory("eval")
 
             # Phase 6: Finalize or rollback
-            final_status = self._finalize_phase(
-                model, adapter, guard_results, metrics, config, report
-            )
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                final_status = self._finalize_phase(
+                    model, adapter, guard_results, metrics, config, report
+                )
+            finally:
+                _record_timing("finalize", phase_start)
+                _capture_memory("finalize")
 
             report.status = final_status
             report.meta["end_time"] = time.time()
@@ -249,6 +315,25 @@ class CoreRunner:
             return report
 
         finally:
+            _record_timing("total", total_start)
+            if not isinstance(report.metrics, dict):
+                report.metrics = {}
+            if timings:
+                report.metrics.setdefault("timings", {}).update(timings)
+            if guard_timings:
+                report.metrics["guard_timings"] = guard_timings
+            if memory_snapshots:
+                report.metrics["memory_snapshots"] = memory_snapshots
+                summary = summarize_memory_snapshots(memory_snapshots)
+                if summary:
+                    mem_peak = summary.get("memory_mb_peak")
+                    if isinstance(mem_peak, (int | float)):
+                        existing = report.metrics.get("memory_mb_peak")
+                        if isinstance(existing, (int | float)):
+                            summary["memory_mb_peak"] = max(
+                                float(existing), float(mem_peak)
+                            )
+                    report.metrics.update(summary)
             self._active_model = None
             self._active_adapter = None
             self._cleanup_services()
@@ -303,10 +388,10 @@ class CoreRunner:
         self,
         model: Any,
         adapter: ModelAdapter,
-        edit: ModelEdit,
+        edit: ModelEdit | EditLike,
         model_desc: dict[str, Any],
         report: RunReport,
-        edit_config: dict[str, Any] | None = None,
+        edit_config: dict[str, Any] | None,
     ) -> dict[str, Any]:
         """Phase 2: Apply edit operation."""
         edit_label = "baseline" if edit.name == "baseline" else edit.name
@@ -388,7 +473,7 @@ class CoreRunner:
                 {"guard": guard.name, "policy": guard_policy},
             )
 
-            if hasattr(guard, "set_run_context"):
+            if isinstance(guard, GuardWithContext):
                 try:
                     guard.set_run_context(report)
                 except Exception as exc:
@@ -400,7 +485,7 @@ class CoreRunner:
                 )
 
             # Call prepare method if it exists (most guards need this)
-            if hasattr(guard, "prepare"):
+            if isinstance(guard, GuardWithPrepare):
                 prepare_result = guard.prepare(
                     model, adapter, calibration_data, guard_policy
                 )
@@ -444,7 +529,13 @@ class CoreRunner:
         )
 
     def _guard_phase(
-        self, model: Any, adapter: ModelAdapter, guards: list[Guard], report: RunReport
+        self,
+        model: Any,
+        adapter: ModelAdapter,
+        guards: list[Guard],
+        report: RunReport,
+        *,
+        guard_timings: dict[str, float] | None = None,
     ) -> dict[str, dict[str, Any]]:
         """Phase 4: Run safety guards."""
         self._log_event("guards", "start", LogLevel.INFO, {"count": len(guards)})
@@ -453,8 +544,9 @@ class CoreRunner:
 
         for guard in guards:
             self._log_event("guard", "start", LogLevel.INFO, {"guard": guard.name})
+            guard_start = time.perf_counter()
 
-            if hasattr(guard, "set_run_context"):
+            if isinstance(guard, GuardWithContext):
                 try:
                     guard.set_run_context(report)
                 except Exception as exc:  # pragma: no cover - defensive
@@ -486,6 +578,11 @@ class CoreRunner:
                     LogLevel.ERROR,
                     {"guard": guard.name, "error": str(e)},
                 )
+            finally:
+                if guard_timings is not None:
+                    guard_timings[guard.name] = max(
+                        0.0, float(time.perf_counter() - guard_start)
+                    )
 
         report.guards = guard_results
 
@@ -583,6 +680,116 @@ class CoreRunner:
         }
         eval_windows = {"preview": {}, "final": {}}
 
+        # Optional: compute primary metric tail evidence vs baseline when provided.
+        try:
+            pm = metrics.get("primary_metric", {}) if isinstance(metrics, dict) else {}
+            pm_kind = str(pm.get("kind", "")).lower() if isinstance(pm, dict) else ""
+            is_ppl_metric = pm_kind.startswith("ppl")
+
+            baseline_eval = {}
+            if (
+                is_ppl_metric
+                and config
+                and isinstance(config.context, dict)
+                and isinstance(config.context.get("baseline_eval_windows"), dict)
+            ):
+                baseline_eval = config.context.get("baseline_eval_windows") or {}
+
+            if is_ppl_metric and baseline_eval:
+                tier_policies = (
+                    report.meta.get("tier_policies", {})
+                    if isinstance(getattr(report, "meta", None), dict)
+                    else {}
+                )
+                metrics_policy = (
+                    tier_policies.get("metrics", {})
+                    if isinstance(tier_policies, dict)
+                    else {}
+                )
+                pm_tail_policy = (
+                    metrics_policy.get("pm_tail", {})
+                    if isinstance(metrics_policy, dict)
+                    else {}
+                )
+
+                run_final = (
+                    eval_windows.get("final", {})
+                    if isinstance(eval_windows, dict)
+                    else {}
+                )
+                base_final = (
+                    baseline_eval.get("final", {})
+                    if isinstance(baseline_eval, dict)
+                    else {}
+                )
+
+                deltas: list[float] = []
+                weights: list[float] = []
+                run_ids = (
+                    run_final.get("window_ids") if isinstance(run_final, dict) else None
+                )
+                run_ll = (
+                    run_final.get("logloss") if isinstance(run_final, dict) else None
+                )
+                run_tc = (
+                    run_final.get("token_counts")
+                    if isinstance(run_final, dict)
+                    else None
+                )
+                base_ids = (
+                    base_final.get("window_ids")
+                    if isinstance(base_final, dict)
+                    else None
+                )
+                base_ll = (
+                    base_final.get("logloss") if isinstance(base_final, dict) else None
+                )
+
+                if (
+                    isinstance(run_ids, list)
+                    and isinstance(run_ll, list)
+                    and isinstance(base_ids, list)
+                    and isinstance(base_ll, list)
+                ):
+                    base_map: dict[int, float] = {}
+                    for b_id, b_val in zip(base_ids, base_ll, strict=False):
+                        if isinstance(b_id, int | float) and isinstance(
+                            b_val, int | float
+                        ):
+                            base_map[int(b_id)] = float(b_val)
+                    for idx, (r_id, r_val) in enumerate(
+                        zip(run_ids, run_ll, strict=False)
+                    ):
+                        if not (
+                            isinstance(r_id, int | float)
+                            and isinstance(r_val, int | float)
+                        ):
+                            continue
+                        key = int(r_id)
+                        if key not in base_map:
+                            continue
+                        dv = float(r_val) - base_map[key]
+                        if math.isfinite(dv):
+                            deltas.append(float(dv))
+                            if isinstance(run_tc, list) and idx < len(run_tc):
+                                try:
+                                    wv = float(run_tc[idx])
+                                except Exception:
+                                    wv = 0.0
+                                weights.append(float(max(wv, 0.0)))
+
+                tail_result = evaluate_metric_tail(
+                    deltas=deltas,
+                    weights=weights
+                    if (weights and len(weights) == len(deltas))
+                    else None,
+                    policy=pm_tail_policy if isinstance(pm_tail_policy, dict) else None,
+                )
+                tail_result["source"] = "paired_baseline.final"
+                metrics["primary_metric_tail"] = tail_result
+        except Exception:  # pragma: no cover - best effort
+            pass
+
 
         policy_flags = self._resolve_policy_flags(config)
         eval_error = metrics.get("eval_error") if isinstance(metrics, dict) else None
@@ -834,8 +1041,10 @@ class CoreRunner:
         pairing_reason = None
         preview_pair_stats = {"matched": 0, "expected": 0}
         final_pair_stats = {"matched": 0, "expected": 0}
+        paired_windows_attempted = 0
        preview_window_ids: list[int] = []
        final_window_ids: list[int] = []
+
        preview_tokens: list[list[int]] = []
        final_tokens: list[list[int]] = []
        preview_limit = min(preview_n, len(preview_data)) if preview_data else 0
@@ -876,6 +1085,8 @@ class CoreRunner:
         # even if an exception occurs during the main compute block.
         delta_samples: list[float] = []
         delta_weights: list[float] = []
+        pm_invalid = False
+        degraded_reason: str | None = None
 
         try:
 
@@ -891,7 +1102,7 @@ class CoreRunner:
             max_batches: int,
             start_idx: int,
         ) -> dict[str, Any]:
-            nonlocal alignment_logged
+            nonlocal alignment_logged, eval_error
 
            total_tokens_local = 0
            actual_tokens_local = 0
@@ -927,7 +1138,9 @@ class CoreRunner:
            limit = _resolve_limit(batches, max_batches)
 
            for batch in batches[:limit]:
-                if max_batches > 0 and count >= max_batches:
+                if (
+                    max_batches > 0 and count >= max_batches
+                ):  # pragma: no cover - slicing already caps iteration
                    break
 
                labels = None
@@ -1100,7 +1313,7 @@ class CoreRunner:
                        "zero_mask_batches": zero_mask_batches,
                        "requested": limit,
                    },
-                )
+                )  # pragma: no cover - requires debug tracing with zero batches
                if resolved_loss_mode == "mlm":
                    error_msg = (
                        "MLM evaluation produced zero usable batches; "
@@ -1121,7 +1334,10 @@ class CoreRunner:
                            "zero_mask_batches": zero_mask_batches,
                        },
                    )
-                    raise ValueError(error_msg)
+                    eval_error = {
+                        "error": "mlm_missing_masks",
+                        "detail": error_msg,
+                    }
                return {
                    "ppl": float("nan"),
                    "total_tokens": total_tokens_local,
@@ -1167,8 +1383,42 @@ class CoreRunner:
                final_data, final_limit, preview_summary["num_batches"]
            )
 
-            preview_log_losses = preview_summary["log_losses"]
-            final_log_losses = final_summary["log_losses"]
+            preview_raw_losses = preview_summary["log_losses"]
+            final_raw_losses = final_summary["log_losses"]
+            try:
+                paired_windows_attempted = min(
+                    len(preview_raw_losses), len(final_raw_losses)
+                )
+            except Exception:
+                paired_windows_attempted = 0
+
+            preview_log_losses = [
+                float(loss) for loss in preview_raw_losses if math.isfinite(loss)
+            ]
+            final_log_losses = [
+                float(loss) for loss in final_raw_losses if math.isfinite(loss)
+            ]
+            if len(preview_log_losses) != len(preview_raw_losses):
+                self._log_event(
+                    "eval",
+                    "non_finite_preview_losses_filtered",
+                    LogLevel.WARNING,
+                    {
+                        "total": len(preview_raw_losses),
+                        "filtered": len(preview_raw_losses) - len(preview_log_losses),
+                    },
+                )
+            if len(final_log_losses) != len(final_raw_losses):
+                self._log_event(
+                    "eval",
+                    "non_finite_final_losses_filtered",
+                    LogLevel.WARNING,
+                    {
+                        "total": len(final_raw_losses),
+                        "filtered": len(final_raw_losses) - len(final_log_losses),
+                    },
+                )
+
 
            preview_tokens_ct = preview_summary["total_tokens"]
            final_tokens_ct = final_summary["total_tokens"]
@@ -1235,14 +1485,29 @@ class CoreRunner:
            delta_mean_log = final_mean_log - preview_mean_log
            pm_ratio = math.exp(delta_mean_log)
 
-            if not (math.isfinite(delta_mean_log) and math.isfinite(pm_ratio)):
-                raise RuntimeError("Invalid perplexity ratio or delta")
-
-            expected_ratio = math.exp(delta_mean_log)
-            if abs(pm_ratio - expected_ratio) > 1e-6:
-                raise RuntimeError(
-                    "Primary-metric ratio mismatch with exp(mean ΔlogNLL)"
+            pm_invalid = False
+            try:
+                if not (math.isfinite(delta_mean_log) and math.isfinite(pm_ratio)):
+                    raise RuntimeError("non_finite_primary_metric")
+
+                expected_ratio = math.exp(delta_mean_log)
+                if abs(pm_ratio - expected_ratio) > 1e-6:
+                    raise RuntimeError("primary_metric_ratio_mismatch")
+            except Exception as exc:
+                pm_invalid = True
+                self._log_event(
+                    "eval",
+                    "primary_metric_invalid",
+                    LogLevel.WARNING,
+                    {
+                        "pm_preview": float(pm_preview),
+                        "pm_final": float(pm_final),
+                        "delta_mean_log": float(delta_mean_log),
+                        "pm_ratio": float(pm_ratio),
+                        "error": str(exc),
+                    },
                )
+                # Preserve downstream reporting; keep NaNs but mark degraded
 
 
            if bootstrap_enabled and preview_log_losses:
@@ -1298,7 +1563,20 @@ class CoreRunner:
                    abs(r - e) > 1e-6
                    for r, e in zip(ratio_ci, expected_ratio_ci, strict=False)
                ):
-                    raise RuntimeError("Ratio CI inconsistent with Δlog CI")
+                    pm_invalid = True
+                    self._log_event(
+                        "eval",
+                        "ratio_ci_inconsistent",
+                        LogLevel.WARNING,
+                        {
+                            "ratio_ci": ratio_ci,
+                            "expected_ratio_ci": expected_ratio_ci,
+                        },
+                    )
+                    ratio_ci = (
+                        float(expected_ratio_ci[0]),
+                        float(expected_ratio_ci[1]),
+                    )
            else:
                delta_log_ci = (delta_mean_log, delta_mean_log)
                ratio_ci = (pm_ratio, pm_ratio)
@@ -1335,19 +1613,60 @@ class CoreRunner:
                degenerate_reason = "no_variation"
 
            if degenerate_delta:
+                pm_invalid = True
                self._log_event(
                    "eval",
                    "degenerate_delta_samples",
-                    LogLevel.ERROR,
+                    LogLevel.WARNING,
                    {
                        "reason": degenerate_reason,
                        "sample_count": len(delta_samples),
                    },
                )
-                if profile_label in {"ci", "release"}:
-                    raise RuntimeError(
-                        f"Degenerate paired ΔlogNLL distribution ({degenerate_reason})"
+
+            needs_pm_fallback = (not math.isfinite(pm_preview)) or (
+                not math.isfinite(pm_final)
+            )
+            needs_delta_fallback = (not math.isfinite(delta_mean_log)) or (
+                not math.isfinite(pm_ratio)
+            )
+
+            degraded_reason: str | None = None
+            if needs_pm_fallback:
+                degraded_reason = "non_finite_pm"
+            elif needs_delta_fallback:
+                degraded_reason = "non_finite_delta"
+            elif degenerate_reason:
+                degraded_reason = f"degenerate_delta:{degenerate_reason}"
+            elif pm_invalid:
+                degraded_reason = "primary_metric_invalid"
+
+            if needs_pm_fallback or needs_delta_fallback:
+                pm_invalid = True
+                pm_fallback = (
+                    pm_preview
+                    if math.isfinite(pm_preview) and pm_preview > 0
+                    else pm_final
+                )
+                if not (math.isfinite(pm_fallback) and pm_fallback > 0):
+                    pm_fallback = 1.0
+
+                if needs_pm_fallback:
+                    pm_preview = (
+                        pm_preview
+                        if math.isfinite(pm_preview) and pm_preview > 0
+                        else pm_fallback
                    )
+                    pm_final = (
+                        pm_final
+                        if math.isfinite(pm_final) and pm_final > 0
+                        else pm_fallback
+                    )
+                if needs_delta_fallback:
+                    if not math.isfinite(delta_mean_log):
+                        delta_mean_log = 0.0
+                    if not math.isfinite(pm_ratio):
+                        pm_ratio = 1.0
 
            def _hash_tokens(tokens: list[int]) -> bytes:
                if not tokens:
@@ -1371,10 +1690,14 @@ class CoreRunner:
            if not isinstance(dataset_cfg, dict):
                return None
            seq_len_val = dataset_cfg.get("seq_len")
-            stride_val = dataset_cfg.get("stride", seq_len_val)
+            if seq_len_val is None:
+                return None
+            stride_raw = dataset_cfg.get("stride", seq_len_val)
+            if stride_raw is None:
+                return None
            try:
                seq_len_f = float(seq_len_val)
-                stride_f = float(stride_val)
+                stride_f = float(stride_raw)
            except (TypeError, ValueError):
                return None
            if not math.isfinite(seq_len_f) or seq_len_f <= 0:
@@ -1687,7 +2010,9 @@ class CoreRunner:
        except Exception:
            pass
 
-        paired_windows_count = len(delta_samples)
+        paired_windows_count = (
+            paired_windows_attempted if paired_windows_attempted else len(delta_samples)
+        )
        unweighted_delta_mean = (
            float(np.mean(delta_samples)) if delta_samples else float(delta_mean_log)
        )
@@ -1715,8 +2040,11 @@ class CoreRunner:
        metrics = {
            "primary_metric": {
                "kind": pm_kind,
-                "preview": float(pm_preview),
-                "final": float(pm_final),
+                "preview": float(pm_preview) if math.isfinite(pm_preview) else None,
+                "final": float(pm_final) if math.isfinite(pm_final) else None,
+                "invalid": bool(pm_invalid),
+                "degraded": bool(pm_invalid or degraded_reason),
+                "degraded_reason": degraded_reason,
            },
            "logloss_preview": float(preview_mean_log),
            "logloss_final": float(final_mean_log),
@@ -2030,17 +2358,27 @@ class CoreRunner:
        except Exception:
            drift_ratio = None
 
+        spike_threshold = getattr(config, "spike_threshold", 2.0)
        if drift_ratio is None:
            is_catastrophic_spike = False
            metrics_acceptable = True
        else:
-            spike_threshold = getattr(config, "spike_threshold", 2.0)
            is_catastrophic_spike = drift_ratio > spike_threshold
            # Check if standard metrics are acceptable against configured max ratio
            metrics_acceptable = drift_ratio <= getattr(config, "max_pm_ratio", 2.0)
 
        # Determine rollback reason and status
        rollback_reason = None
+        tail_failed = False
+        try:
+            pm_tail = metrics.get("primary_metric_tail", {})
+            if isinstance(pm_tail, dict) and pm_tail:
+                mode = str(pm_tail.get("mode", "warn") or "warn").strip().lower()
+                evaluated = bool(pm_tail.get("evaluated", False))
+                passed = bool(pm_tail.get("passed", True))
+                tail_failed = bool(mode == "fail" and evaluated and (not passed))
+        except Exception:  # pragma: no cover
+            tail_failed = False
        if is_catastrophic_spike:
            rollback_reason = (
                f"catastrophic_ppl_spike (ratio: {drift_ratio:.3f} > {spike_threshold})"
@@ -2057,6 +2395,9 @@ class CoreRunner:
                    "immediate_rollback": True,
                },
            )
+        elif tail_failed:
+            rollback_reason = "primary_metric_tail_failed"
+            status = RunStatus.ROLLBACK.value
        elif (not all_guards_passed) or (not metrics_acceptable):
            # Match historical/test expectation string exactly
            rollback_reason = "guards_failed or metrics_unacceptable"
@@ -2185,20 +2526,27 @@ class CoreRunner:
    ) -> dict[str, dict[str, Any]]:
        """Resolve tier-based guard policies from configuration."""
        # Use passed auto_config if available, otherwise extract from report meta
-        if auto_config is None:
-            config_meta = report.meta.get("config", {})
+        auto_cfg: dict[str, Any] | None = auto_config
+        if auto_cfg is None:
+            config_meta = report.meta.get("config") or {}
 
            # Try to get auto config from various possible locations
-            if hasattr(report, "auto_config"):
-                auto_config = report.auto_config
-            elif "auto" in config_meta:
-                auto_config = config_meta["auto"]
-            else:
+            auto_cfg = report.__dict__.get("auto_config")
+            if (
+                auto_cfg is None
+                and isinstance(config_meta, dict)
+                and "auto" in config_meta
+            ):
+                auto_cfg = config_meta["auto"]
+            elif auto_cfg is None:
                # Fallback to default balanced tier
-                auto_config = {"tier": "balanced", "enabled": True}
+                auto_cfg = {"tier": "balanced", "enabled": True}
+
+        if not isinstance(auto_cfg, dict):
+            auto_cfg = {"tier": "balanced", "enabled": True}
 
        # Extract tier and edit name
-        tier = auto_config.get("tier", "balanced")
+        tier = auto_cfg.get("tier", "balanced")
        edit_name = None
        if hasattr(report, "edit") and report.edit:
            edit_name = report.edit.get("name")
@@ -2208,8 +2556,10 @@ class CoreRunner:
            edit_name = report.meta["edit_name"]
 
        # Get explicit guard overrides from config
-        config_meta = report.meta.get("config", {})
-        explicit_overrides = config_meta.get("guards", {})
+        config_meta = report.meta.get("config") or {}
+        explicit_overrides = (
+            config_meta.get("guards", {}) if isinstance(config_meta, dict) else {}
+        )
 
        try:
            # Resolve tier policies
@@ -2237,18 +2587,18 @@ class CoreRunner:
    def _apply_guard_policy(self, guard: Guard, policy: dict[str, Any]) -> None:
        """Apply resolved policy parameters to a guard instance."""
        try:
+            guard_config = getattr(guard, "config", None)
+            guard_policy = getattr(guard, "policy", None)
+
            # Apply policy parameters to guard
            for param_name, param_value in policy.items():
                if hasattr(guard, param_name):
                    setattr(guard, param_name, param_value)
-                elif hasattr(guard, "config") and isinstance(guard.config, dict):
-                    # Try to set in guard's config dict
-                    guard.config[param_name] = param_value
-                elif hasattr(guard, "policy") and isinstance(guard.policy, dict):
-                    # Try to set in guard's policy dict
-                    guard.policy[param_name] = param_value
+                elif isinstance(guard_config, dict):
+                    guard_config[param_name] = param_value
+                elif isinstance(guard_policy, dict):
+                    guard_policy[param_name] = param_value
                else:
-                    # Last resort: add to guard as attribute
                    setattr(guard, param_name, param_value)
 
        except Exception as e: