invarlock 0.3.5-py3-none-any.whl → 0.3.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +2 -2
- invarlock/_data/runtime/tiers.yaml +57 -30
- invarlock/adapters/__init__.py +11 -15
- invarlock/adapters/auto.py +35 -40
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_mixin.py +25 -4
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/calibration/spectral_null.py +15 -10
- invarlock/calibration/variance_ve.py +0 -2
- invarlock/cli/adapter_auto.py +31 -21
- invarlock/cli/app.py +73 -2
- invarlock/cli/commands/calibrate.py +6 -2
- invarlock/cli/commands/certify.py +651 -91
- invarlock/cli/commands/doctor.py +11 -11
- invarlock/cli/commands/explain_gates.py +57 -8
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +233 -69
- invarlock/cli/commands/run.py +1066 -244
- invarlock/cli/commands/verify.py +154 -15
- invarlock/cli/config.py +22 -6
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +1 -1
- invarlock/core/api.py +45 -5
- invarlock/core/auto_tuning.py +65 -20
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/contracts.py +7 -1
- invarlock/core/registry.py +11 -13
- invarlock/core/runner.py +425 -75
- invarlock/edits/quant_rtn.py +65 -37
- invarlock/eval/bench.py +3 -16
- invarlock/eval/data.py +82 -51
- invarlock/eval/metrics.py +63 -2
- invarlock/eval/primary_metric.py +23 -0
- invarlock/eval/tail_stats.py +230 -0
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/_estimators.py +154 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/policies.py +16 -6
- invarlock/guards/rmt.py +627 -546
- invarlock/guards/spectral.py +348 -110
- invarlock/guards/tier_config.py +32 -30
- invarlock/guards/variance.py +7 -31
- invarlock/guards_ref/rmt_ref.py +23 -23
- invarlock/model_profile.py +90 -42
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/reporting/certificate.py +384 -55
- invarlock/reporting/certificate_schema.py +3 -2
- invarlock/reporting/dataset_hashing.py +15 -2
- invarlock/reporting/guards_analysis.py +350 -277
- invarlock/reporting/html.py +55 -5
- invarlock/reporting/normalizer.py +13 -0
- invarlock/reporting/policy_utils.py +38 -36
- invarlock/reporting/primary_metric_utils.py +71 -17
- invarlock/reporting/render.py +852 -431
- invarlock/reporting/report.py +40 -4
- invarlock/reporting/report_types.py +11 -3
- invarlock/reporting/telemetry.py +86 -0
- invarlock/reporting/validate.py +1 -18
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/METADATA +27 -13
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/RECORD +72 -65
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
invarlock/core/runner.py
CHANGED
@@ -18,7 +18,23 @@ from typing import Any
 
 import numpy as np
 
-from .…
+from invarlock.eval.tail_stats import evaluate_metric_tail
+from invarlock.observability.metrics import (
+    capture_memory_snapshot,
+    reset_peak_memory_stats,
+    summarize_memory_snapshots,
+)
+
+from .api import (
+    EditLike,
+    Guard,
+    GuardWithContext,
+    GuardWithPrepare,
+    ModelAdapter,
+    ModelEdit,
+    RunConfig,
+    RunReport,
+)
 from .auto_tuning import resolve_tier_policies
 from .bootstrap import (
     compute_logloss_ci,
@@ -112,7 +128,7 @@ class CoreRunner:
         self,
         model: Any,
         adapter: ModelAdapter,
-        edit: ModelEdit,
+        edit: ModelEdit | EditLike,
         guards: list[Guard],
         config: RunConfig,
         calibration_data: Any = None,
@@ -175,10 +191,22 @@ class CoreRunner:
         config.context["auto"] = dict(auto_config)
         try:
             report.context["auto"] = config.context["auto"]
-        except Exception:
+        except Exception:  # pragma: no cover - defensive context propagation
             pass
 
         report.status = RunStatus.RUNNING.value
+        timings: dict[str, float] = {}
+        guard_timings: dict[str, float] = {}
+        memory_snapshots: list[dict[str, Any]] = []
+        total_start = time.perf_counter()
+
+        def _record_timing(key: str, start: float) -> None:
+            timings[key] = max(0.0, float(time.perf_counter() - start))
+
+        def _capture_memory(phase: str) -> None:
+            snapshot = capture_memory_snapshot(phase)
+            if snapshot:
+                memory_snapshots.append(snapshot)
 
         try:
             # Log start
@@ -194,40 +222,78 @@ class CoreRunner:
             )
 
             # Phase 1: Prepare (describe model, create checkpoint)
-            …
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                model_desc = self._prepare_phase(model, adapter, report)
+            finally:
+                _record_timing("prepare", phase_start)
+                _capture_memory("prepare")
 
             # Phase 2: Prepare guards (must happen before edit)
-            … (9 lines)
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                self._prepare_guards_phase(
+                    model,
+                    adapter,
+                    guards,
+                    calibration_data,
+                    report,
+                    auto_config,
+                    config,
+                )
+            finally:
+                _record_timing("prepare_guards", phase_start)
+                _capture_memory("prepare_guards")
 
             # Phase 3: Apply edit
-            …
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                self._edit_phase(model, adapter, edit, model_desc, report, edit_config)
+            finally:
+                _record_timing("edit", phase_start)
+                _capture_memory("edit")
 
             # Phase 4: Run guards
-            …
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                guard_results = self._guard_phase(
+                    model, adapter, guards, report, guard_timings=guard_timings
+                )
+            finally:
+                _record_timing("guards", phase_start)
+                _capture_memory("guards")
 
             # Phase 5: Evaluate final metrics
-            … (9 lines)
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                metrics = self._eval_phase(
+                    model,
+                    adapter,
+                    calibration_data,
+                    report,
+                    preview_n,
+                    final_n,
+                    config,
+                )
+            finally:
+                _record_timing("eval", phase_start)
+                _capture_memory("eval")
 
             # Phase 6: Finalize or rollback
-            … (3 lines)
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                final_status = self._finalize_phase(
+                    model, adapter, guard_results, metrics, config, report
+                )
+            finally:
+                _record_timing("finalize", phase_start)
+                _capture_memory("finalize")
 
             report.status = final_status
             report.meta["end_time"] = time.time()
@@ -249,6 +315,25 @@ class CoreRunner:
             return report
 
         finally:
+            _record_timing("total", total_start)
+            if not isinstance(report.metrics, dict):
+                report.metrics = {}
+            if timings:
+                report.metrics.setdefault("timings", {}).update(timings)
+            if guard_timings:
+                report.metrics["guard_timings"] = guard_timings
+            if memory_snapshots:
+                report.metrics["memory_snapshots"] = memory_snapshots
+                summary = summarize_memory_snapshots(memory_snapshots)
+                if summary:
+                    mem_peak = summary.get("memory_mb_peak")
+                    if isinstance(mem_peak, (int | float)):
+                        existing = report.metrics.get("memory_mb_peak")
+                        if isinstance(existing, (int | float)):
+                            summary["memory_mb_peak"] = max(
+                                float(existing), float(mem_peak)
+                            )
+                    report.metrics.update(summary)
             self._active_model = None
             self._active_adapter = None
             self._cleanup_services()
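Note: the finally-block above folds the new instrumentation into report.metrics under keys that appear in this diff ("timings", "guard_timings", "memory_snapshots", "memory_mb_peak"). A rough, self-contained sketch of that bookkeeping pattern, using stand-in helpers rather than invarlock.observability.metrics (the values and the stand-ins are illustrative only):

# Illustrative only: mimics the timing/memory bookkeeping added above.
import time

timings: dict = {}
memory_snapshots: list = []

def record_timing(key: str, start: float) -> None:
    # Clamp to zero so clock oddities never produce negative durations.
    timings[key] = max(0.0, time.perf_counter() - start)

def capture_memory(phase: str) -> None:
    # Stand-in for capture_memory_snapshot(); a real snapshot would report MB.
    memory_snapshots.append({"phase": phase, "memory_mb_peak": 0.0})

start = time.perf_counter()
try:
    sum(range(1_000_000))  # placeholder for a pipeline phase
finally:
    record_timing("eval", start)
    capture_memory("eval")

report_metrics: dict = {}
report_metrics.setdefault("timings", {}).update(timings)
if memory_snapshots:
    report_metrics["memory_snapshots"] = memory_snapshots
print(report_metrics)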
@@ -303,10 +388,10 @@ class CoreRunner:
         self,
         model: Any,
         adapter: ModelAdapter,
-        edit: ModelEdit,
+        edit: ModelEdit | EditLike,
         model_desc: dict[str, Any],
         report: RunReport,
-        edit_config: dict[str, Any] | None
+        edit_config: dict[str, Any] | None,
     ) -> dict[str, Any]:
         """Phase 2: Apply edit operation."""
         edit_label = "baseline" if edit.name == "baseline" else edit.name
@@ -388,7 +473,7 @@ class CoreRunner:
                 {"guard": guard.name, "policy": guard_policy},
             )
 
-            if …
+            if isinstance(guard, GuardWithContext):
                 try:
                     guard.set_run_context(report)
                 except Exception as exc:
@@ -400,7 +485,7 @@ class CoreRunner:
                 )
 
             # Call prepare method if it exists (most guards need this)
-            if …
+            if isinstance(guard, GuardWithPrepare):
                 prepare_result = guard.prepare(
                     model, adapter, calibration_data, guard_policy
                 )
@@ -444,7 +529,13 @@ class CoreRunner:
         )
 
     def _guard_phase(
-        self, …
+        self,
+        model: Any,
+        adapter: ModelAdapter,
+        guards: list[Guard],
+        report: RunReport,
+        *,
+        guard_timings: dict[str, float] | None = None,
     ) -> dict[str, dict[str, Any]]:
         """Phase 4: Run safety guards."""
         self._log_event("guards", "start", LogLevel.INFO, {"count": len(guards)})
@@ -453,8 +544,9 @@ class CoreRunner:
 
         for guard in guards:
             self._log_event("guard", "start", LogLevel.INFO, {"guard": guard.name})
+            guard_start = time.perf_counter()
 
-            if …
+            if isinstance(guard, GuardWithContext):
                 try:
                     guard.set_run_context(report)
                 except Exception as exc:  # pragma: no cover - defensive
@@ -486,6 +578,11 @@ class CoreRunner:
                     LogLevel.ERROR,
                     {"guard": guard.name, "error": str(e)},
                 )
+            finally:
+                if guard_timings is not None:
+                    guard_timings[guard.name] = max(
+                        0.0, float(time.perf_counter() - guard_start)
+                    )
 
         report.guards = guard_results
 
@@ -583,6 +680,116 @@ class CoreRunner:
         }
         eval_windows = {"preview": {}, "final": {}}
 
+        # Optional: compute primary metric tail evidence vs baseline when provided.
+        try:
+            pm = metrics.get("primary_metric", {}) if isinstance(metrics, dict) else {}
+            pm_kind = str(pm.get("kind", "")).lower() if isinstance(pm, dict) else ""
+            is_ppl_metric = pm_kind.startswith("ppl")
+
+            baseline_eval = {}
+            if (
+                is_ppl_metric
+                and config
+                and isinstance(config.context, dict)
+                and isinstance(config.context.get("baseline_eval_windows"), dict)
+            ):
+                baseline_eval = config.context.get("baseline_eval_windows") or {}
+
+            if is_ppl_metric and baseline_eval:
+                tier_policies = (
+                    report.meta.get("tier_policies", {})
+                    if isinstance(getattr(report, "meta", None), dict)
+                    else {}
+                )
+                metrics_policy = (
+                    tier_policies.get("metrics", {})
+                    if isinstance(tier_policies, dict)
+                    else {}
+                )
+                pm_tail_policy = (
+                    metrics_policy.get("pm_tail", {})
+                    if isinstance(metrics_policy, dict)
+                    else {}
+                )
+
+                run_final = (
+                    eval_windows.get("final", {})
+                    if isinstance(eval_windows, dict)
+                    else {}
+                )
+                base_final = (
+                    baseline_eval.get("final", {})
+                    if isinstance(baseline_eval, dict)
+                    else {}
+                )
+
+                deltas: list[float] = []
+                weights: list[float] = []
+                run_ids = (
+                    run_final.get("window_ids") if isinstance(run_final, dict) else None
+                )
+                run_ll = (
+                    run_final.get("logloss") if isinstance(run_final, dict) else None
+                )
+                run_tc = (
+                    run_final.get("token_counts")
+                    if isinstance(run_final, dict)
+                    else None
+                )
+                base_ids = (
+                    base_final.get("window_ids")
+                    if isinstance(base_final, dict)
+                    else None
+                )
+                base_ll = (
+                    base_final.get("logloss") if isinstance(base_final, dict) else None
+                )
+
+                if (
+                    isinstance(run_ids, list)
+                    and isinstance(run_ll, list)
+                    and isinstance(base_ids, list)
+                    and isinstance(base_ll, list)
+                ):
+                    base_map: dict[int, float] = {}
+                    for b_id, b_val in zip(base_ids, base_ll, strict=False):
+                        if isinstance(b_id, int | float) and isinstance(
+                            b_val, int | float
+                        ):
+                            base_map[int(b_id)] = float(b_val)
+                    for idx, (r_id, r_val) in enumerate(
+                        zip(run_ids, run_ll, strict=False)
+                    ):
+                        if not (
+                            isinstance(r_id, int | float)
+                            and isinstance(r_val, int | float)
+                        ):
+                            continue
+                        key = int(r_id)
+                        if key not in base_map:
+                            continue
+                        dv = float(r_val) - base_map[key]
+                        if math.isfinite(dv):
+                            deltas.append(float(dv))
+                            if isinstance(run_tc, list) and idx < len(run_tc):
+                                try:
+                                    wv = float(run_tc[idx])
+                                except Exception:
+                                    wv = 0.0
+                                weights.append(float(max(wv, 0.0)))
+
+                tail_result = evaluate_metric_tail(
+                    deltas=deltas,
+                    weights=weights
+                    if (weights and len(weights) == len(deltas))
+                    else None,
+                    policy=pm_tail_policy if isinstance(pm_tail_policy, dict) else None,
+                )
+                tail_result["source"] = "paired_baseline.final"
+                metrics["primary_metric_tail"] = tail_result
+        except Exception:  # pragma: no cover - best effort
+            pass
+
         policy_flags = self._resolve_policy_flags(config)
         eval_error = metrics.get("eval_error") if isinstance(metrics, dict) else None
         if eval_error:
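Note: the block above pairs run and baseline windows by window_id, takes per-window logloss deltas, and hands them (optionally token-weighted) to evaluate_metric_tail. A minimal sketch of that pairing step with made-up window data; _tail_stub is a stand-in, the real evaluator lives in invarlock/eval/tail_stats.py:

# Illustrative pairing of per-window loglosses against a baseline run.
import math

run_final = {"window_ids": [0, 1, 2], "logloss": [2.31, 2.40, 2.28], "token_counts": [512, 512, 480]}
base_final = {"window_ids": [0, 1, 2], "logloss": [2.30, 2.35, 2.29]}

base_map = {int(i): float(v) for i, v in zip(base_final["window_ids"], base_final["logloss"])}

deltas, weights = [], []
for idx, (w_id, loss) in enumerate(zip(run_final["window_ids"], run_final["logloss"])):
    if int(w_id) not in base_map:
        continue
    dv = float(loss) - base_map[int(w_id)]
    if math.isfinite(dv):
        deltas.append(dv)
        weights.append(max(float(run_final["token_counts"][idx]), 0.0))

def _tail_stub(deltas, weights=None, policy=None):
    # Stand-in: the real evaluate_metric_tail applies the configured pm_tail policy.
    worst = max(deltas) if deltas else 0.0
    return {"evaluated": bool(deltas), "worst_delta": worst, "passed": worst < 0.10}

print(_tail_stub(deltas, weights))  # worst_delta ≈ 0.05 here, so passed is True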
@@ -834,8 +1041,10 @@ class CoreRunner:
         pairing_reason = None
         preview_pair_stats = {"matched": 0, "expected": 0}
         final_pair_stats = {"matched": 0, "expected": 0}
+        paired_windows_attempted = 0
         preview_window_ids: list[int] = []
         final_window_ids: list[int] = []
+
         preview_tokens: list[list[int]] = []
         final_tokens: list[list[int]] = []
         preview_limit = min(preview_n, len(preview_data)) if preview_data else 0
@@ -876,6 +1085,8 @@ class CoreRunner:
         # even if an exception occurs during the main compute block.
         delta_samples: list[float] = []
         delta_weights: list[float] = []
+        pm_invalid = False
+        degraded_reason: str | None = None
 
         try:
 
@@ -891,7 +1102,7 @@ class CoreRunner:
             max_batches: int,
             start_idx: int,
         ) -> dict[str, Any]:
-            nonlocal alignment_logged
+            nonlocal alignment_logged, eval_error
 
             total_tokens_local = 0
             actual_tokens_local = 0
@@ -927,7 +1138,9 @@ class CoreRunner:
             limit = _resolve_limit(batches, max_batches)
 
             for batch in batches[:limit]:
-                if …
+                if (
+                    max_batches > 0 and count >= max_batches
+                ):  # pragma: no cover - slicing already caps iteration
                     break
 
                 labels = None
@@ -1100,7 +1313,7 @@ class CoreRunner:
                         "zero_mask_batches": zero_mask_batches,
                         "requested": limit,
                     },
-                )
+                )  # pragma: no cover - requires debug tracing with zero batches
                 if resolved_loss_mode == "mlm":
                     error_msg = (
                         "MLM evaluation produced zero usable batches; "
@@ -1121,7 +1334,10 @@ class CoreRunner:
                             "zero_mask_batches": zero_mask_batches,
                         },
                     )
-                    …
+                    eval_error = {
+                        "error": "mlm_missing_masks",
+                        "detail": error_msg,
+                    }
                     return {
                         "ppl": float("nan"),
                         "total_tokens": total_tokens_local,
@@ -1167,8 +1383,42 @@ class CoreRunner:
                 final_data, final_limit, preview_summary["num_batches"]
             )
 
-            … (2 lines)
+            preview_raw_losses = preview_summary["log_losses"]
+            final_raw_losses = final_summary["log_losses"]
+            try:
+                paired_windows_attempted = min(
+                    len(preview_raw_losses), len(final_raw_losses)
+                )
+            except Exception:
+                paired_windows_attempted = 0
+
+            preview_log_losses = [
+                float(loss) for loss in preview_raw_losses if math.isfinite(loss)
+            ]
+            final_log_losses = [
+                float(loss) for loss in final_raw_losses if math.isfinite(loss)
+            ]
+            if len(preview_log_losses) != len(preview_raw_losses):
+                self._log_event(
+                    "eval",
+                    "non_finite_preview_losses_filtered",
+                    LogLevel.WARNING,
+                    {
+                        "total": len(preview_raw_losses),
+                        "filtered": len(preview_raw_losses) - len(preview_log_losses),
+                    },
+                )
+            if len(final_log_losses) != len(final_raw_losses):
+                self._log_event(
+                    "eval",
+                    "non_finite_final_losses_filtered",
+                    LogLevel.WARNING,
+                    {
+                        "total": len(final_raw_losses),
+                        "filtered": len(final_raw_losses) - len(final_log_losses),
+                    },
+                )
+
             preview_tokens_ct = preview_summary["total_tokens"]
             final_tokens_ct = final_summary["total_tokens"]
             preview_batches_ct = preview_summary["num_batches"]
@@ -1235,14 +1485,29 @@ class CoreRunner:
             delta_mean_log = final_mean_log - preview_mean_log
             pm_ratio = math.exp(delta_mean_log)
 
-            … (7 lines)
+            pm_invalid = False
+            try:
+                if not (math.isfinite(delta_mean_log) and math.isfinite(pm_ratio)):
+                    raise RuntimeError("non_finite_primary_metric")
+
+                expected_ratio = math.exp(delta_mean_log)
+                if abs(pm_ratio - expected_ratio) > 1e-6:
+                    raise RuntimeError("primary_metric_ratio_mismatch")
+            except Exception as exc:
+                pm_invalid = True
+                self._log_event(
+                    "eval",
+                    "primary_metric_invalid",
+                    LogLevel.WARNING,
+                    {
+                        "pm_preview": float(pm_preview),
+                        "pm_final": float(pm_final),
+                        "delta_mean_log": float(delta_mean_log),
+                        "pm_ratio": float(pm_ratio),
+                        "error": str(exc),
+                    },
                 )
+                # Preserve downstream reporting; keep NaNs but mark degraded
 
             if bootstrap_enabled and preview_log_losses:
                 preview_log_ci = compute_logloss_ci(
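Note: the consistency check above only re-asserts the identity the surrounding code already relies on, namely that for a perplexity-style metric the ratio equals exp(final mean logloss − preview mean logloss). A worked example with assumed numbers:

# Assumed loglosses, just to show the identity being re-checked above.
import math

preview_mean_log = 2.30  # mean logloss over preview windows (assumed)
final_mean_log = 2.35    # mean logloss over final windows (assumed)

delta_mean_log = final_mean_log - preview_mean_log  # ≈ 0.05
pm_ratio = math.exp(delta_mean_log)                 # ≈ 1.0513

# The guard in the diff flags pm_invalid when this drifts past 1e-6.
assert abs(pm_ratio - math.exp(delta_mean_log)) <= 1e-6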
@@ -1298,7 +1563,20 @@ class CoreRunner:
                     abs(r - e) > 1e-6
                     for r, e in zip(ratio_ci, expected_ratio_ci, strict=False)
                 ):
-                    …
+                    pm_invalid = True
+                    self._log_event(
+                        "eval",
+                        "ratio_ci_inconsistent",
+                        LogLevel.WARNING,
+                        {
+                            "ratio_ci": ratio_ci,
+                            "expected_ratio_ci": expected_ratio_ci,
+                        },
+                    )
+                    ratio_ci = (
+                        float(expected_ratio_ci[0]),
+                        float(expected_ratio_ci[1]),
+                    )
             else:
                 delta_log_ci = (delta_mean_log, delta_mean_log)
                 ratio_ci = (pm_ratio, pm_ratio)
@@ -1335,19 +1613,60 @@ class CoreRunner:
                 degenerate_reason = "no_variation"
 
             if degenerate_delta:
+                pm_invalid = True
                 self._log_event(
                     "eval",
                     "degenerate_delta_samples",
-                    LogLevel.…
+                    LogLevel.WARNING,
                     {
                         "reason": degenerate_reason,
                         "sample_count": len(delta_samples),
                     },
                 )
-            … (3 lines)
+
+            needs_pm_fallback = (not math.isfinite(pm_preview)) or (
+                not math.isfinite(pm_final)
+            )
+            needs_delta_fallback = (not math.isfinite(delta_mean_log)) or (
+                not math.isfinite(pm_ratio)
+            )
+
+            degraded_reason: str | None = None
+            if needs_pm_fallback:
+                degraded_reason = "non_finite_pm"
+            elif needs_delta_fallback:
+                degraded_reason = "non_finite_delta"
+            elif degenerate_reason:
+                degraded_reason = f"degenerate_delta:{degenerate_reason}"
+            elif pm_invalid:
+                degraded_reason = "primary_metric_invalid"
+
+            if needs_pm_fallback or needs_delta_fallback:
+                pm_invalid = True
+                pm_fallback = (
+                    pm_preview
+                    if math.isfinite(pm_preview) and pm_preview > 0
+                    else pm_final
+                )
+                if not (math.isfinite(pm_fallback) and pm_fallback > 0):
+                    pm_fallback = 1.0
+
+                if needs_pm_fallback:
+                    pm_preview = (
+                        pm_preview
+                        if math.isfinite(pm_preview) and pm_preview > 0
+                        else pm_fallback
                     )
+                    pm_final = (
+                        pm_final
+                        if math.isfinite(pm_final) and pm_final > 0
+                        else pm_fallback
+                    )
+                if needs_delta_fallback:
+                    if not math.isfinite(delta_mean_log):
+                        delta_mean_log = 0.0
+                    if not math.isfinite(pm_ratio):
+                        pm_ratio = 1.0
 
         def _hash_tokens(tokens: list[int]) -> bytes:
             if not tokens:
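Note: the fallback block above assigns degraded_reason by a first-match precedence. A compact restatement with placeholder values; only the reason strings come from the diff:

# Precedence of degraded_reason as ordered in the diff (first match wins).
import math

def degraded_reason_for(pm_preview, pm_final, delta_mean_log, pm_ratio,
                        degenerate_reason=None, pm_invalid=False):
    if not (math.isfinite(pm_preview) and math.isfinite(pm_final)):
        return "non_finite_pm"
    if not (math.isfinite(delta_mean_log) and math.isfinite(pm_ratio)):
        return "non_finite_delta"
    if degenerate_reason:
        return f"degenerate_delta:{degenerate_reason}"
    if pm_invalid:
        return "primary_metric_invalid"
    return None

print(degraded_reason_for(float("nan"), 12.1, 0.02, 1.02))  # non_finite_pm
print(degraded_reason_for(11.9, 12.1, 0.02, 1.02, degenerate_reason="no_variation"))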
@@ -1371,10 +1690,14 @@ class CoreRunner:
         if not isinstance(dataset_cfg, dict):
             return None
         seq_len_val = dataset_cfg.get("seq_len")
-        …
+        if seq_len_val is None:
+            return None
+        stride_raw = dataset_cfg.get("stride", seq_len_val)
+        if stride_raw is None:
+            return None
         try:
             seq_len_f = float(seq_len_val)
-            stride_f = float(…
+            stride_f = float(stride_raw)
         except (TypeError, ValueError):
             return None
         if not math.isfinite(seq_len_f) or seq_len_f <= 0:
@@ -1687,7 +2010,9 @@ class CoreRunner:
         except Exception:
             pass
 
-        paired_windows_count = …
+        paired_windows_count = (
+            paired_windows_attempted if paired_windows_attempted else len(delta_samples)
+        )
         unweighted_delta_mean = (
             float(np.mean(delta_samples)) if delta_samples else float(delta_mean_log)
         )
@@ -1715,8 +2040,11 @@ class CoreRunner:
         metrics = {
             "primary_metric": {
                 "kind": pm_kind,
-                "preview": float(pm_preview),
-                "final": float(pm_final),
+                "preview": float(pm_preview) if math.isfinite(pm_preview) else None,
+                "final": float(pm_final) if math.isfinite(pm_final) else None,
+                "invalid": bool(pm_invalid),
+                "degraded": bool(pm_invalid or degraded_reason),
+                "degraded_reason": degraded_reason,
             },
             "logloss_preview": float(preview_mean_log),
             "logloss_final": float(final_mean_log),
@@ -2030,17 +2358,27 @@ class CoreRunner:
         except Exception:
             drift_ratio = None
 
+        spike_threshold = getattr(config, "spike_threshold", 2.0)
         if drift_ratio is None:
             is_catastrophic_spike = False
             metrics_acceptable = True
         else:
-            spike_threshold = getattr(config, "spike_threshold", 2.0)
             is_catastrophic_spike = drift_ratio > spike_threshold
             # Check if standard metrics are acceptable against configured max ratio
             metrics_acceptable = drift_ratio <= getattr(config, "max_pm_ratio", 2.0)
 
         # Determine rollback reason and status
         rollback_reason = None
+        tail_failed = False
+        try:
+            pm_tail = metrics.get("primary_metric_tail", {})
+            if isinstance(pm_tail, dict) and pm_tail:
+                mode = str(pm_tail.get("mode", "warn") or "warn").strip().lower()
+                evaluated = bool(pm_tail.get("evaluated", False))
+                passed = bool(pm_tail.get("passed", True))
+                tail_failed = bool(mode == "fail" and evaluated and (not passed))
+        except Exception:  # pragma: no cover
+            tail_failed = False
         if is_catastrophic_spike:
             rollback_reason = (
                 f"catastrophic_ppl_spike (ratio: {drift_ratio:.3f} > {spike_threshold})"
@@ -2057,6 +2395,9 @@ class CoreRunner:
                     "immediate_rollback": True,
                 },
             )
+        elif tail_failed:
+            rollback_reason = "primary_metric_tail_failed"
+            status = RunStatus.ROLLBACK.value
         elif (not all_guards_passed) or (not metrics_acceptable):
             # Match historical/test expectation string exactly
             rollback_reason = "guards_failed or metrics_unacceptable"
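Note: per the two hunks above, a primary_metric_tail entry only forces rollback when its policy mode is "fail", it was actually evaluated, and it did not pass. A hypothetical payload that would trip it (field names follow the diff; the payload itself is invented):

# Hypothetical primary_metric_tail payload.
pm_tail = {"mode": "fail", "evaluated": True, "passed": False}

mode = str(pm_tail.get("mode", "warn") or "warn").strip().lower()
tail_failed = mode == "fail" and bool(pm_tail.get("evaluated", False)) and not bool(pm_tail.get("passed", True))

rollback_reason = "primary_metric_tail_failed" if tail_failed else None
print(rollback_reason)  # primary_metric_tail_failed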
@@ -2185,20 +2526,27 @@ class CoreRunner:
     ) -> dict[str, dict[str, Any]]:
         """Resolve tier-based guard policies from configuration."""
         # Use passed auto_config if available, otherwise extract from report meta
-        … (2 lines)
+        auto_cfg: dict[str, Any] | None = auto_config
+        if auto_cfg is None:
+            config_meta = report.meta.get("config") or {}
 
             # Try to get auto config from various possible locations
-            … (5 lines)
+            auto_cfg = report.__dict__.get("auto_config")
+            if (
+                auto_cfg is None
+                and isinstance(config_meta, dict)
+                and "auto" in config_meta
+            ):
+                auto_cfg = config_meta["auto"]
+            elif auto_cfg is None:
                 # Fallback to default balanced tier
-                …
+                auto_cfg = {"tier": "balanced", "enabled": True}
+
+        if not isinstance(auto_cfg, dict):
+            auto_cfg = {"tier": "balanced", "enabled": True}
 
         # Extract tier and edit name
-        tier = …
+        tier = auto_cfg.get("tier", "balanced")
         edit_name = None
         if hasattr(report, "edit") and report.edit:
             edit_name = report.edit.get("name")
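Note: the resolution order above can be read as: explicit auto_config argument, then report.auto_config, then the "auto" entry of the config recorded in report.meta, then a balanced default. A small stand-alone sketch of that chain; the dict shapes are assumptions, only the lookup order mirrors the diff:

def resolve_auto_cfg(auto_config=None, report_auto=None, config_meta=None):
    # Mirrors the fallback chain shown in the hunk above.
    auto_cfg = auto_config
    if auto_cfg is None:
        auto_cfg = report_auto
        if auto_cfg is None and isinstance(config_meta, dict) and "auto" in config_meta:
            auto_cfg = config_meta["auto"]
        elif auto_cfg is None:
            auto_cfg = {"tier": "balanced", "enabled": True}
    if not isinstance(auto_cfg, dict):
        auto_cfg = {"tier": "balanced", "enabled": True}
    return auto_cfg

print(resolve_auto_cfg(config_meta={"auto": {"tier": "conservative"}}))  # {'tier': 'conservative'}
print(resolve_auto_cfg())                                                # balanced default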
@@ -2208,8 +2556,10 @@ class CoreRunner:
             edit_name = report.meta["edit_name"]
 
         # Get explicit guard overrides from config
-        config_meta = report.meta.get("config"…
-        explicit_overrides = …
+        config_meta = report.meta.get("config") or {}
+        explicit_overrides = (
+            config_meta.get("guards", {}) if isinstance(config_meta, dict) else {}
+        )
 
         try:
             # Resolve tier policies
@@ -2237,18 +2587,18 @@ class CoreRunner:
     def _apply_guard_policy(self, guard: Guard, policy: dict[str, Any]) -> None:
         """Apply resolved policy parameters to a guard instance."""
         try:
+            guard_config = getattr(guard, "config", None)
+            guard_policy = getattr(guard, "policy", None)
+
             # Apply policy parameters to guard
             for param_name, param_value in policy.items():
                 if hasattr(guard, param_name):
                     setattr(guard, param_name, param_value)
-                elif …
-                … (3 lines)
-                    # Try to set in guard's policy dict
-                    guard.policy[param_name] = param_value
+                elif isinstance(guard_config, dict):
+                    guard_config[param_name] = param_value
+                elif isinstance(guard_policy, dict):
+                    guard_policy[param_name] = param_value
                 else:
-                    # Last resort: add to guard as attribute
                     setattr(guard, param_name, param_value)
 
         except Exception as e:
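Note: applied policy parameters above land, in order, on an existing guard attribute, then guard.config if it is a dict, then guard.policy if it is a dict, and finally as a new attribute. A minimal illustration with a toy guard object (not an invarlock class):

# Toy guard showing the fallback order used by _apply_guard_policy above.
class ToyGuard:
    def __init__(self) -> None:
        self.name = "toy"
        self.threshold = 0.5  # existing attribute -> overwritten directly
        self.config = {}      # dict -> receives unknown params next
        self.policy = None    # not a dict here, so it is skipped

def apply_policy(guard, policy):
    guard_config = getattr(guard, "config", None)
    guard_policy = getattr(guard, "policy", None)
    for name, value in policy.items():
        if hasattr(guard, name):
            setattr(guard, name, value)
        elif isinstance(guard_config, dict):
            guard_config[name] = value
        elif isinstance(guard_policy, dict):
            guard_policy[name] = value
        else:
            setattr(guard, name, value)

g = ToyGuard()
apply_policy(g, {"threshold": 0.8, "window": 32})
print(g.threshold, g.config)  # 0.8 {'window': 32}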