invarlock-0.3.1-py3-none-any.whl → invarlock-0.3.3-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that public registry.
- invarlock/__init__.py +1 -1
- invarlock/_data/runtime/tiers.yaml +61 -0
- invarlock/adapters/hf_loading.py +97 -0
- invarlock/calibration/__init__.py +6 -0
- invarlock/calibration/spectral_null.py +301 -0
- invarlock/calibration/variance_ve.py +154 -0
- invarlock/cli/app.py +15 -0
- invarlock/cli/commands/calibrate.py +576 -0
- invarlock/cli/commands/doctor.py +9 -3
- invarlock/cli/commands/explain_gates.py +53 -9
- invarlock/cli/commands/plugins.py +12 -2
- invarlock/cli/commands/run.py +181 -79
- invarlock/cli/commands/verify.py +40 -0
- invarlock/cli/config.py +11 -1
- invarlock/cli/determinism.py +252 -0
- invarlock/core/auto_tuning.py +215 -17
- invarlock/core/bootstrap.py +137 -5
- invarlock/core/registry.py +9 -4
- invarlock/core/runner.py +305 -35
- invarlock/eval/bench.py +467 -141
- invarlock/eval/bench_regression.py +12 -0
- invarlock/eval/bootstrap.py +3 -1
- invarlock/eval/data.py +29 -7
- invarlock/eval/primary_metric.py +20 -5
- invarlock/guards/rmt.py +536 -46
- invarlock/guards/spectral.py +217 -10
- invarlock/guards/variance.py +124 -42
- invarlock/reporting/certificate.py +476 -45
- invarlock/reporting/certificate_schema.py +4 -1
- invarlock/reporting/guards_analysis.py +108 -10
- invarlock/reporting/normalizer.py +24 -1
- invarlock/reporting/policy_utils.py +97 -15
- invarlock/reporting/primary_metric_utils.py +17 -0
- invarlock/reporting/validate.py +10 -10
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/METADATA +12 -10
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/RECORD +40 -33
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/WHEEL +0 -0
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/top_level.txt +0 -0
invarlock/eval/bench.py
CHANGED
@@ -116,6 +116,7 @@ class ScenarioResult:
     config: ScenarioConfig
     bare_result: RunResult | None = None
     guarded_result: RunResult | None = None
+    artifacts: dict[str, Any] = field(default_factory=dict)
     metrics: dict[str, Any] = field(default_factory=dict)
     gates: dict[str, bool] = field(default_factory=dict)
     skipped: bool = False
@@ -269,6 +270,7 @@ class MetricsAggregator:
     def extract_core_metrics(report: RunReport) -> dict[str, float]:
         """Extract core metrics from a RunReport (primary_metric-first)."""
         metrics = report.get("metrics", {}) or {}
+        meta = report.get("meta", {}) or {}
         pm = metrics.get("primary_metric", {}) if isinstance(metrics, dict) else {}
         pm_preview = float("nan")
         pm_final = float("nan")
@@ -281,29 +283,78 @@ class MetricsAggregator:
         except Exception:
             pm_preview = float("nan")
             pm_final = float("nan")
+        duration_s = float("nan")
+        try:
+            if isinstance(meta, dict):
+                dur = meta.get("duration_s", meta.get("duration"))
+                if isinstance(dur, int | float):
+                    duration_s = float(dur)
+        except Exception:
+            duration_s = float("nan")
         return {
             "primary_metric_preview": pm_preview,
             "primary_metric_final": pm_final,
             "latency_ms_per_tok": metrics.get("latency_ms_per_tok", float("nan")),
             "memory_mb_peak": metrics.get("memory_mb_peak", float("nan")),
+            "duration_s": duration_s,
         }

     @staticmethod
     def extract_guard_metrics(report: RunReport) -> dict[str, Any]:
         """Extract guard-specific metrics from a RunReport."""
-        guard_metrics = {}
+        guard_metrics: dict[str, Any] = {}
+
+        # Prefer structured guard reports when available
+        guards = report.get("guards", [])
+        if isinstance(guards, list):
+            for guard in guards:
+                if not isinstance(guard, dict):
+                    continue
+                name = str(guard.get("name", "")).lower()
+                metrics = (
+                    guard.get("metrics", {})
+                    if isinstance(guard.get("metrics"), dict)
+                    else {}
+                )
+                violations = guard.get("violations", [])
+                if name == "rmt":
+                    for key in ("outliers_total", "rmt_outliers", "layers_flagged"):
+                        val = metrics.get(key)
+                        if isinstance(val, int | float):
+                            guard_metrics["rmt_outliers"] = int(val)
+                            break
+                if name == "invariants":
+                    val = metrics.get("violations_found")
+                    if isinstance(val, int | float):
+                        guard_metrics["tying_violations_post"] = int(val)
+                    elif isinstance(violations, list):
+                        guard_metrics["tying_violations_post"] = len(violations)

         # Extract RMT outliers
-
-
+        if "rmt_outliers" not in guard_metrics:
+            rmt_metrics = report.get("metrics", {}).get("rmt", {})
+            if isinstance(rmt_metrics, dict):
+                guard_metrics["rmt_outliers"] = int(rmt_metrics.get("outliers", 0) or 0)
+            else:
+                guard_metrics["rmt_outliers"] = 0

         # Extract invariant violations
-
-
+        if "tying_violations_post" not in guard_metrics:
+            invariant_metrics = report.get("metrics", {}).get("invariants", {})
+            if isinstance(invariant_metrics, dict):
+                guard_metrics["tying_violations_post"] = int(
+                    invariant_metrics.get("violations", 0) or 0
+                )
+            else:
+                guard_metrics["tying_violations_post"] = 0

         # Check if rollback occurred (catastrophic spike)
-
-
+        flags = report.get("flags", {}) or {}
+        meta = report.get("meta", {}) or {}
+        guard_metrics["catastrophic_spike"] = bool(
+            (flags.get("guard_recovered") if isinstance(flags, dict) else False)
+            or (meta.get("guard_recovered") if isinstance(meta, dict) else False)
+            or (meta.get("rollback_reason") if isinstance(meta, dict) else False)
        )

         return guard_metrics
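The reworked extract_guard_metrics prefers the structured per-guard entries and only falls back to the legacy metrics blocks when those entries are absent. A minimal sketch of that behaviour on a hand-built report (hypothetical values; it assumes MetricsAggregator is importable from invarlock.eval.bench and that the reconstruction above is faithful):

    from invarlock.eval.bench import MetricsAggregator

    # Hypothetical report shaped like the structured guard entries the new code prefers.
    report = {
        "metrics": {},
        "flags": {},
        "meta": {},
        "guards": [
            {"name": "rmt", "metrics": {"outliers_total": 3}, "violations": []},
            {"name": "invariants", "metrics": {"violations_found": 0}, "violations": []},
        ],
    }

    # Expected: outliers taken from the structured "rmt" entry, tying violations from
    # the "invariants" entry, and no catastrophic spike since no rollback flags are set.
    # -> {"rmt_outliers": 3, "tying_violations_post": 0, "catastrophic_spike": False}
    print(MetricsAggregator.extract_guard_metrics(report))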
@@ -342,6 +393,8 @@ class MetricsAggregator:
             "latency_guarded": guarded_metrics.get(
                 "latency_ms_per_tok", float("nan")
             ),
+            "duration_bare_s": bare_metrics.get("duration_s", float("nan")),
+            "duration_guarded_s": guarded_metrics.get("duration_s", float("nan")),
             "mem_bare": bare_metrics.get("memory_mb_peak", float("nan")),
             "mem_guarded": guarded_metrics.get("memory_mb_peak", float("nan")),
         }
@@ -355,17 +408,30 @@ class MetricsAggregator:
         else:
             comparison["primary_metric_overhead"] = float("nan")

-
-
+        # Prefer end-to-end pipeline duration when available; fall back to per-token latency
+        duration_bare = comparison.get("duration_bare_s", float("nan"))
+        duration_guarded = comparison.get("duration_guarded_s", float("nan"))
         if (
-
-            and
+            isinstance(duration_bare, int | float)
+            and isinstance(duration_guarded, int | float)
+            and not (math.isnan(duration_bare) or math.isnan(duration_guarded))
+            and float(duration_bare) > 0
         ):
             comparison["guard_overhead_time"] = (
-
-            ) /
+                float(duration_guarded) - float(duration_bare)
+            ) / float(duration_bare)
         else:
-            comparison["
+            latency_bare = comparison["latency_bare"]
+            latency_guarded = comparison["latency_guarded"]
+            if (
+                not (math.isnan(latency_bare) or math.isnan(latency_guarded))
+                and latency_bare > 0
+            ):
+                comparison["guard_overhead_time"] = (
+                    latency_guarded - latency_bare
+                ) / latency_bare
+            else:
+                comparison["guard_overhead_time"] = float("nan")

         mem_bare = comparison["mem_bare"]
         mem_guarded = comparison["mem_guarded"]
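The new overhead logic reduces to a single relative formula, applied first to end-to-end durations and, when those are missing, to per-token latency. A standalone restatement of just that arithmetic (not the library's API), assuming NaN marks an unavailable measurement:

    import math


    def relative_overhead(bare: float, guarded: float) -> float:
        """(guarded - bare) / bare, or NaN when the bare measurement is unusable."""
        if math.isnan(bare) or math.isnan(guarded) or bare <= 0:
            return float("nan")
        return (guarded - bare) / bare


    # Preferred source: wall-clock durations; fallback: latency_ms_per_tok.
    print(relative_overhead(12.0, 13.2))          # ~0.10, i.e. ~10% guard overhead
    print(relative_overhead(float("nan"), 13.2))  # nan -> caller falls back to latency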
@@ -506,141 +572,333 @@ def execute_single_run(
     scenario: ScenarioConfig,
     run_type: str,
     output_dir: Path,
+    *,
+    runtime: dict[str, Any] | None = None,
 ) -> RunResult:
     """Execute a single benchmark run and return results."""
     try:
-        #
-
+        # Deferred imports: heavy deps only when executing real pipeline
+        from invarlock.core.api import RunConfig as _RunConfig
+        from invarlock.core.auto_tuning import get_tier_policies as _get_tier_policies
+        from invarlock.core.registry import get_registry as _get_registry
+        from invarlock.core.runner import CoreRunner as _CoreRunner
+        from invarlock.eval.data import get_provider as _get_provider
+        from invarlock.guards.rmt import capture_baseline_mp_stats as _capture_mp_stats
+        from invarlock.guards.rmt import rmt_detect as _rmt_detect
+        from invarlock.model_profile import detect_model_profile as _detect_profile
+
+        def _ensure_dir(path: Path) -> None:
+            path.mkdir(parents=True, exist_ok=True)
+
+        def _write_json(path: Path, payload: dict[str, Any]) -> None:
+            path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+
+        if runtime is None:
+            runtime = {}
+
+        # Resolve shared runtime resources (tokenizer/windows/model snapshot) when absent.
+        adapter = runtime.get("adapter")
+        model = runtime.get("model")
+        baseline_snapshot = runtime.get("baseline_snapshot")
+        pairing_schedule = runtime.get("pairing_schedule")
+        calibration_data = runtime.get("calibration_data")
+        tokenizer_hash = runtime.get("tokenizer_hash")
+        split = runtime.get("split", "validation")
+        dataset_name = runtime.get("dataset_name")
+
+        if not isinstance(dataset_name, str) or not dataset_name:
+            dataset_name = str(
+                run_config.get("dataset", {}).get("provider", "wikitext2")
+            )

-        #
-
+        # Tokenizer + pairing schedule
+        if not (
+            isinstance(pairing_schedule, dict) and isinstance(calibration_data, list)
+        ):
+            profile = _detect_profile(scenario.model_id, adapter=scenario.adapter)
+            tokenizer, tokenizer_hash = profile.make_tokenizer()
+            provider_kwargs: dict[str, Any] = {}
+            if scenario.device != "auto" and dataset_name == "wikitext2":
+                provider_kwargs["device_hint"] = str(scenario.device)
+            provider = _get_provider(dataset_name, **provider_kwargs)
+            preview_window, final_window = provider.windows(
+                tokenizer=tokenizer,
+                seq_len=scenario.seq_len,
+                stride=scenario.stride,
+                preview_n=scenario.preview_n or 0,
+                final_n=scenario.final_n or 0,
+                seed=scenario.seed,
+                split=split,
+            )
+            prev_ids = list(range(len(preview_window.input_ids)))
+            fin_ids = list(
+                range(
+                    len(preview_window.input_ids),
+                    len(preview_window.input_ids) + len(final_window.input_ids),
+                )
+            )
+            pairing_schedule = {
+                "preview": {
+                    "window_ids": prev_ids,
+                    "input_ids": preview_window.input_ids,
+                    "attention_masks": preview_window.attention_masks,
+                },
+                "final": {
+                    "window_ids": fin_ids,
+                    "input_ids": final_window.input_ids,
+                    "attention_masks": final_window.attention_masks,
+                },
+            }
+            calibration_data = []
+            for idx, (input_ids, attention_mask) in enumerate(
+                zip(
+                    preview_window.input_ids,
+                    preview_window.attention_masks,
+                    strict=False,
+                )
+            ):
+                calibration_data.append(
+                    {
+                        "input_ids": input_ids,
+                        "attention_mask": attention_mask,
+                        "window_id": f"preview::{idx}",
+                    }
+                )
+            for idx, (input_ids, attention_mask) in enumerate(
+                zip(final_window.input_ids, final_window.attention_masks, strict=False)
+            ):
+                calibration_data.append(
+                    {
+                        "input_ids": input_ids,
+                        "attention_mask": attention_mask,
+                        "window_id": f"final::{idx}",
+                    }
+                )
+            runtime["pairing_schedule"] = pairing_schedule
+            runtime["calibration_data"] = calibration_data
+            runtime["tokenizer_hash"] = tokenizer_hash
+            runtime["split"] = split
+            runtime["dataset_name"] = dataset_name
+
+        # Adapter/model snapshot
+        if adapter is None or model is None or baseline_snapshot is None:
+            registry = _get_registry()
+            adapter = registry.get_adapter(scenario.adapter)
+            model = adapter.load_model(scenario.model_id, device=scenario.device)
+            baseline_snapshot = adapter.snapshot(model)
+            runtime["adapter"] = adapter
+            runtime["model"] = model
+            runtime["baseline_snapshot"] = baseline_snapshot
+
+        # Baseline RMT stats (used to compute comparable outlier counts for bare vs guarded)
+        rmt_baseline_mp_stats = runtime.get("rmt_baseline_mp_stats")
+        rmt_baseline_sigmas = runtime.get("rmt_baseline_sigmas")
+        if not isinstance(rmt_baseline_mp_stats, dict) or not isinstance(
+            rmt_baseline_sigmas, dict
+        ):
+            adapter.restore(model, baseline_snapshot)
+            rmt_baseline_mp_stats = _capture_mp_stats(model)
+            rmt_baseline_sigmas = {
+                name: float(stats.get("sigma_base", 0.0) or 0.0)
+                for name, stats in rmt_baseline_mp_stats.items()
+                if isinstance(stats, dict)
+            }
+            runtime["rmt_baseline_mp_stats"] = rmt_baseline_mp_stats
+            runtime["rmt_baseline_sigmas"] = rmt_baseline_sigmas

-
-
-
-        report["meta"]["device"] = run_config["model"]["device"]
-        report["meta"]["ts"] = datetime.now().isoformat()
-        report["meta"]["seed"] = run_config["dataset"]["seed"]
-
-        # Fill in dataset config
-        report["data"]["dataset"] = run_config["dataset"]["provider"]
-        report["data"]["seq_len"] = run_config["dataset"]["seq_len"]
-        report["data"]["stride"] = run_config["dataset"]["stride"]
-        report["data"]["preview_n"] = run_config["dataset"]["preview_n"]
-        report["data"]["final_n"] = run_config["dataset"]["final_n"]
-
-        # Fill in edit info
-        report["edit"]["name"] = scenario.edit
-        report["edit"]["plan_digest"] = (
-            f"mock_digest_{scenario.edit}_{scenario.tier}_{scenario.probes}"
+        tier_policies = _get_tier_policies()
+        tier_policy = tier_policies.get(
+            scenario.tier, tier_policies.get("balanced", {})
         )
+        rmt_policy = tier_policy.get("rmt", {}) if isinstance(tier_policy, dict) else {}
+        rmt_margin = float(rmt_policy.get("margin", 1.5) or 1.5)
+        rmt_deadband = float(rmt_policy.get("deadband", 0.10) or 0.10)
+
+        # Restore baseline model for this run
+        adapter.restore(model, baseline_snapshot)
+
+        run_dir = output_dir / run_type
+        _ensure_dir(run_dir)
+        event_path = run_dir / "events.jsonl"
+
+        # Core objects
+        registry = _get_registry()
+        edit_op = registry.get_edit(scenario.edit)
+
+        guards: list[Any] = []
+        auto_config = None
+        if run_type == "guarded":
+            for guard_name in ("invariants", "spectral", "rmt", "variance"):
+                try:
+                    guards.append(registry.get_guard(guard_name))
+                except Exception:
+                    continue
+            auto_config = {
+                "tier": scenario.tier,
+                "probes": scenario.probes,
+                "enabled": True,
+            }

-        #
-
-
-
-
-
-
-
+        # Wire run context for pairing verification
+        run_context = {
+            "profile": scenario.profile,
+            "dataset": {"provider": dataset_name, "seed": scenario.seed},
+            "pairing_baseline": pairing_schedule,
+            "eval": {"loss": {"resolved_type": "causal"}},
+            "run_id": f"{scenario.edit}-{scenario.tier}-p{scenario.probes}-{run_type}",
+        }
+
+        spike_threshold = float(
+            run_config.get("eval", {}).get("spike_threshold", 2.0) or 2.0
+        )
+        cfg = _RunConfig(
+            device=scenario.device,
+            max_pm_ratio=spike_threshold,
+            spike_threshold=spike_threshold,
+            event_path=event_path,
+            context=run_context,
+        )
+
+        runner = _CoreRunner()
+        core_report = runner.execute(
+            model=model,
+            adapter=adapter,
+            edit=edit_op,
+            guards=guards,
+            config=cfg,
+            calibration_data=calibration_data,
+            auto_config=auto_config,
+            edit_config=run_config.get("edit", {}).get("plan", {}),
+            preview_n=scenario.preview_n,
+            final_n=scenario.final_n,
+        )
+
+        # Convert to evaluation RunReport (dict) for downstream tooling
+        report = create_empty_report()
+        report["meta"].update(
+            {
+                "model_id": scenario.model_id,
+                "adapter": scenario.adapter,
+                "device": str(scenario.device),
+                "commit": "",
+                "seed": scenario.seed,
+                "ts": datetime.now().isoformat(),
             }
-
-
-
-
-
-        )
-
-
-
-
-
-            scenario.
-
-
-            scenario.
-        ) # Small improvement with probes
-
-        base_ppl = 45.0 + (hash(f"{scenario.edit}_{scenario.tier}") % 100) / 100.0
-        report["metrics"]["primary_metric"] = {
-            "kind": "perplexity",
-            "preview": base_ppl * tier_factor,
-            "final": base_ppl * tier_factor * probe_factor,
+        )
+        if tokenizer_hash:
+            report["meta"]["tokenizer_hash"] = tokenizer_hash
+        dur = core_report.meta.get("duration") if hasattr(core_report, "meta") else None
+        if isinstance(dur, int | float):
+            report["meta"]["duration_s"] = float(dur)
+
+        report["data"].update(
+            {
+                "dataset": dataset_name,
+                "split": split,
+                "seq_len": scenario.seq_len,
+                "stride": scenario.stride,
+                "preview_n": int(scenario.preview_n or 0),
+                "final_n": int(scenario.final_n or 0),
             }
+        )

-
-
-
-            "
-
-
-
-
-
-
-
-
-
-
-
-        report["metrics"]["rmt"] = {
-            "outliers": max(
-                0,
-                2
-                + (hash(scenario.edit) % 3)
-                - (1 if scenario.tier == "conservative" else 0),
-            )
+        edit_meta = core_report.edit if hasattr(core_report, "edit") else {}
+        plan_digest = ""
+        try:
+            if isinstance(edit_meta, dict):
+                plan_digest = str(edit_meta.get("plan_digest", ""))
+        except Exception:
+            plan_digest = ""
+        report["edit"].update(
+            {
+                "name": scenario.edit,
+                "plan_digest": plan_digest,
+                "deltas": (
+                    edit_meta.get("deltas", report["edit"]["deltas"])
+                    if isinstance(edit_meta, dict)
+                    else report["edit"]["deltas"]
+                ),
             }
-
+        )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            "
-            "
-
-
-
-
-
-
-
-
-            },
-            "metrics": {
-                "outliers": report["metrics"]["rmt"]["outliers"],
-                "mp_fit": 0.95,
-            },
-            "actions": ["validated"],
-            "violations": [],
-            },
-        ]
+        # Transfer metrics
+        if hasattr(core_report, "metrics") and isinstance(core_report.metrics, dict):
+            report["metrics"].update(core_report.metrics)
+
+        if hasattr(core_report, "evaluation_windows") and isinstance(
+            core_report.evaluation_windows, dict
+        ):
+            report["evaluation_windows"] = core_report.evaluation_windows
+
+        # Transfer guards
+        if hasattr(core_report, "guards") and isinstance(core_report.guards, dict):
+            for name, guard_result in core_report.guards.items():
+                if not isinstance(guard_result, dict):
+                    continue
+                report["guards"].append(
+                    {
+                        "name": name,
+                        "passed": guard_result.get("passed"),
+                        "action": guard_result.get("action"),
+                        "policy": guard_result.get("policy", {}),
+                        "metrics": guard_result.get("metrics", {}),
+                        "actions": guard_result.get("actions", []),
+                        "violations": guard_result.get("violations", []),
+                        "warnings": guard_result.get("warnings", []),
+                        "errors": guard_result.get("errors", []),
+                        "details": guard_result.get("details", {}),
+                    }
+                )

-        #
-
-
+        # Compute comparable RMT outliers for both bare and guarded models
+        try:
+            detection = _rmt_detect(
+                model=model,
+                threshold=rmt_margin,
+                detect_only=True,
+                baseline_sigmas=rmt_baseline_sigmas,
+                baseline_mp_stats=rmt_baseline_mp_stats,
+                deadband=rmt_deadband,
+            )
+            report["metrics"].setdefault("rmt", {})
+            if isinstance(report["metrics"].get("rmt"), dict):
+                report["metrics"]["rmt"]["outliers"] = int(
+                    detection.get("n_layers_flagged", 0) or 0
+                )
+        except Exception:
+            pass
+
+        # Flags and artifacts
+        status = getattr(core_report, "status", "")
+        rollback_reason = (
+            core_report.meta.get("rollback_reason")
+            if hasattr(core_report, "meta") and isinstance(core_report.meta, dict)
+            else None
         )
-        report["
-
+        report["flags"].update(
+            {
+                "guard_recovered": bool(
+                    (
+                        hasattr(core_report, "meta")
+                        and core_report.meta.get("guard_recovered")
+                    )
+                    or str(status).lower() == "rollback"
+                ),
+                "rollback_reason": rollback_reason,
+            }
        )
+        report["artifacts"].update(
+            {
+                "events_path": str(event_path),
+                "logs_path": "",
+                "checkpoint_path": None,
+                "report_path": str(run_dir / "report.json"),
+            }
+        )
+        _write_json(Path(report["artifacts"]["report_path"]), report)

-
+        success = str(status).lower() != "failed"
+        return RunResult(run_type=run_type, report=report, success=success)

     except Exception as e:
         logger.error(f"Run failed for {scenario.edit} ({run_type}): {e}")
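execute_single_run now threads a per-scenario runtime cache through both runs, so the guarded pass reuses the tokenizer windows, model snapshot, and RMT baselines that the bare pass already paid for. The keys below are the ones the diff reads and writes back; the values are placeholders for illustration, not real objects:

    runtime = {
        "adapter": None,                 # adapter resolved from the registry
        "model": None,                   # loaded model instance
        "baseline_snapshot": None,       # weights snapshot restored before each run
        "pairing_schedule": None,        # preview/final window ids, inputs, masks
        "calibration_data": None,        # per-window dicts fed to CoreRunner.execute
        "tokenizer_hash": None,          # recorded into report["meta"]
        "split": "validation",           # dataset split used for windowing
        "dataset_name": "wikitext2",     # provider name, defaulted from run_config
        "rmt_baseline_mp_stats": None,   # output of capture_baseline_mp_stats
        "rmt_baseline_sigmas": None,     # per-layer sigma_base values derived from it
    }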
@@ -671,21 +929,70 @@ def execute_scenario(
     config_manager = ConfigurationManager()
     metrics_aggregator = MetricsAggregator()

+    # Scenario-scoped artifact directory
+    scenario_slug = f"{scenario.edit}__{scenario.tier}__p{scenario.probes}"
+    scenario_dir = output_dir / "scenarios" / scenario_slug
+    scenario_dir.mkdir(parents=True, exist_ok=True)
+
+    runtime: dict[str, Any] = {"dataset_name": config.dataset}
+
     # Run bare configuration
     logger.debug(f"Running bare configuration for {scenario.edit}")
     bare_config = config_manager.create_bare_config(scenario)
-
+    try:
+        bare_config.setdefault("dataset", {})["provider"] = config.dataset
+    except Exception:
+        pass
+    bare_result = execute_single_run(
+        bare_config, scenario, "bare", scenario_dir, runtime=runtime
+    )

     # Run guarded configuration
     logger.debug(f"Running guarded configuration for {scenario.edit}")
     guarded_config = config_manager.create_guarded_config(scenario)
-
-
-
-
-
+    try:
+        guarded_config.setdefault("dataset", {})["provider"] = config.dataset
+    except Exception:
+        pass
+    guarded_result = execute_single_run(
+        guarded_config, scenario, "guarded", scenario_dir, runtime=runtime
     )

+    artifacts: dict[str, Any] = {"scenario_dir": str(scenario_dir)}
+    pairing_schedule = runtime.get("pairing_schedule")
+    if isinstance(pairing_schedule, dict):
+        pairing_path = scenario_dir / "pairing_schedule.json"
+        pairing_path.write_text(
+            json.dumps(pairing_schedule, indent=2), encoding="utf-8"
+        )
+        artifacts["pairing_schedule"] = str(pairing_path)
+    try:
+        if bare_result and bare_result.report:
+            artifacts["bare_report"] = bare_result.report.get("artifacts", {}).get(
+                "report_path"
+            )
+    except Exception:
+        pass
+    try:
+        if guarded_result and guarded_result.report:
+            artifacts["guarded_report"] = guarded_result.report.get(
+                "artifacts", {}
+            ).get("report_path")
+    except Exception:
+        pass
+
+    # Generate certificate artifact when both runs produced reports
+    try:
+        if bare_result.success and guarded_result.success:
+            from invarlock.reporting.certificate import make_certificate
+
+            cert = make_certificate(guarded_result.report, bare_result.report)
+            cert_path = scenario_dir / "certificate.json"
+            cert_path.write_text(json.dumps(cert, indent=2), encoding="utf-8")
+            artifacts["certificate"] = str(cert_path)
+    except Exception as exc:
+        logger.warning(f"Certificate generation failed for {scenario_slug}: {exc}")
+
     # Resolve epsilon from runtime or use config
     epsilon_used = config.epsilon
     if epsilon_used is None and guarded_result.success:
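Putting the new paths together, each scenario's artifacts land under a slug-named directory below the benchmark output directory. A small sketch of the expected layout (the slug components are hypothetical placeholders):

    from pathlib import Path

    # Hypothetical values for illustration only.
    output_dir, edit, tier, probes = Path("bench_out"), "some_edit", "balanced", 8

    scenario_dir = output_dir / "scenarios" / f"{edit}__{tier}__p{probes}"
    expected = {
        "scenario_dir": scenario_dir,
        "bare_report": scenario_dir / "bare" / "report.json",        # plus bare/events.jsonl
        "guarded_report": scenario_dir / "guarded" / "report.json",  # plus guarded/events.jsonl
        "pairing_schedule": scenario_dir / "pairing_schedule.json",  # written when a schedule was built
        "certificate": scenario_dir / "certificate.json",            # only when both runs succeed
    }
    for name, path in expected.items():
        print(f"{name}: {path}")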
@@ -693,8 +1000,24 @@ def execute_scenario(
     elif epsilon_used is None:
         epsilon_used = 0.10 # Default fallback

-    #
-
+    # Compute comparison metrics and validate gates.
+    comparison_metrics = metrics_aggregator.compute_comparison_metrics(
+        bare_result, guarded_result
+    )
+    if not (bare_result.success and guarded_result.success):
+        # Treat execution failures as a hard FAIL: benchmarks are only meaningful
+        # when both paired runs complete.
+        comparison_metrics = {
+            "error_bare": bare_result.error_message,
+            "error_guarded": guarded_result.error_message,
+        }
+        gates = dict.fromkeys(
+            ("spike", "tying", "rmt", "quality", "time", "mem"), False
+        )
+    else:
+        gates = ValidationGates.validate_all_gates(
+            comparison_metrics, config, epsilon_used
+        )

     # Mock probes_used based on scenario.probes (in real implementation, this would come from auto-tuner)
     probes_used = min(
@@ -705,6 +1028,7 @@ def execute_scenario(
         config=scenario,
         bare_result=bare_result,
         guarded_result=guarded_result,
+        artifacts=artifacts,
         metrics=comparison_metrics,
         gates=gates,
         probes_used=probes_used,
@@ -843,6 +1167,7 @@ def _summary_to_step14_json(summary: BenchmarkSummary) -> dict[str, Any]:
             "probes_used": result.probes_used,
             "skip": result.skipped,
             "skip_reason": result.skip_reason,
+            "artifacts": result.artifacts,
         }

         if not result.skipped and result.metrics:
@@ -1033,6 +1358,7 @@ def _scenario_result_to_dict(result: ScenarioResult) -> dict[str, Any]:
         "probes_used": result.probes_used,
         "skipped": result.skipped,
         "skip_reason": result.skip_reason,
+        "artifacts": result.artifacts,
         "metrics": result.metrics,
         "gates": result.gates,
         "epsilon_used": result.epsilon_used,