invarlock-0.3.1-py3-none-any.whl → invarlock-0.3.3-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. invarlock/__init__.py +1 -1
  2. invarlock/_data/runtime/tiers.yaml +61 -0
  3. invarlock/adapters/hf_loading.py +97 -0
  4. invarlock/calibration/__init__.py +6 -0
  5. invarlock/calibration/spectral_null.py +301 -0
  6. invarlock/calibration/variance_ve.py +154 -0
  7. invarlock/cli/app.py +15 -0
  8. invarlock/cli/commands/calibrate.py +576 -0
  9. invarlock/cli/commands/doctor.py +9 -3
  10. invarlock/cli/commands/explain_gates.py +53 -9
  11. invarlock/cli/commands/plugins.py +12 -2
  12. invarlock/cli/commands/run.py +181 -79
  13. invarlock/cli/commands/verify.py +40 -0
  14. invarlock/cli/config.py +11 -1
  15. invarlock/cli/determinism.py +252 -0
  16. invarlock/core/auto_tuning.py +215 -17
  17. invarlock/core/bootstrap.py +137 -5
  18. invarlock/core/registry.py +9 -4
  19. invarlock/core/runner.py +305 -35
  20. invarlock/eval/bench.py +467 -141
  21. invarlock/eval/bench_regression.py +12 -0
  22. invarlock/eval/bootstrap.py +3 -1
  23. invarlock/eval/data.py +29 -7
  24. invarlock/eval/primary_metric.py +20 -5
  25. invarlock/guards/rmt.py +536 -46
  26. invarlock/guards/spectral.py +217 -10
  27. invarlock/guards/variance.py +124 -42
  28. invarlock/reporting/certificate.py +476 -45
  29. invarlock/reporting/certificate_schema.py +4 -1
  30. invarlock/reporting/guards_analysis.py +108 -10
  31. invarlock/reporting/normalizer.py +24 -1
  32. invarlock/reporting/policy_utils.py +97 -15
  33. invarlock/reporting/primary_metric_utils.py +17 -0
  34. invarlock/reporting/validate.py +10 -10
  35. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/METADATA +12 -10
  36. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/RECORD +40 -33
  37. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/WHEEL +0 -0
  38. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/entry_points.txt +0 -0
  39. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/licenses/LICENSE +0 -0
  40. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/top_level.txt +0 -0
invarlock/eval/bench.py CHANGED
@@ -116,6 +116,7 @@ class ScenarioResult:
  config: ScenarioConfig
  bare_result: RunResult | None = None
  guarded_result: RunResult | None = None
+ artifacts: dict[str, Any] = field(default_factory=dict)
  metrics: dict[str, Any] = field(default_factory=dict)
  gates: dict[str, bool] = field(default_factory=dict)
  skipped: bool = False
@@ -269,6 +270,7 @@ class MetricsAggregator:
  def extract_core_metrics(report: RunReport) -> dict[str, float]:
  """Extract core metrics from a RunReport (primary_metric-first)."""
  metrics = report.get("metrics", {}) or {}
+ meta = report.get("meta", {}) or {}
  pm = metrics.get("primary_metric", {}) if isinstance(metrics, dict) else {}
  pm_preview = float("nan")
  pm_final = float("nan")
@@ -281,29 +283,78 @@ class MetricsAggregator:
  except Exception:
  pm_preview = float("nan")
  pm_final = float("nan")
+ duration_s = float("nan")
+ try:
+ if isinstance(meta, dict):
+ dur = meta.get("duration_s", meta.get("duration"))
+ if isinstance(dur, int | float):
+ duration_s = float(dur)
+ except Exception:
+ duration_s = float("nan")
  return {
  "primary_metric_preview": pm_preview,
  "primary_metric_final": pm_final,
  "latency_ms_per_tok": metrics.get("latency_ms_per_tok", float("nan")),
  "memory_mb_peak": metrics.get("memory_mb_peak", float("nan")),
+ "duration_s": duration_s,
  }

  @staticmethod
  def extract_guard_metrics(report: RunReport) -> dict[str, Any]:
  """Extract guard-specific metrics from a RunReport."""
- guard_metrics = {}
+ guard_metrics: dict[str, Any] = {}
+
+ # Prefer structured guard reports when available
+ guards = report.get("guards", [])
+ if isinstance(guards, list):
+ for guard in guards:
+ if not isinstance(guard, dict):
+ continue
+ name = str(guard.get("name", "")).lower()
+ metrics = (
+ guard.get("metrics", {})
+ if isinstance(guard.get("metrics"), dict)
+ else {}
+ )
+ violations = guard.get("violations", [])
+ if name == "rmt":
+ for key in ("outliers_total", "rmt_outliers", "layers_flagged"):
+ val = metrics.get(key)
+ if isinstance(val, int | float):
+ guard_metrics["rmt_outliers"] = int(val)
+ break
+ if name == "invariants":
+ val = metrics.get("violations_found")
+ if isinstance(val, int | float):
+ guard_metrics["tying_violations_post"] = int(val)
+ elif isinstance(violations, list):
+ guard_metrics["tying_violations_post"] = len(violations)

  # Extract RMT outliers
- rmt_metrics = report.get("metrics", {}).get("rmt", {})
- guard_metrics["rmt_outliers"] = rmt_metrics.get("outliers", 0)
+ if "rmt_outliers" not in guard_metrics:
+ rmt_metrics = report.get("metrics", {}).get("rmt", {})
+ if isinstance(rmt_metrics, dict):
+ guard_metrics["rmt_outliers"] = int(rmt_metrics.get("outliers", 0) or 0)
+ else:
+ guard_metrics["rmt_outliers"] = 0

  # Extract invariant violations
- invariant_metrics = report.get("metrics", {}).get("invariants", {})
- guard_metrics["tying_violations_post"] = invariant_metrics.get("violations", 0)
+ if "tying_violations_post" not in guard_metrics:
+ invariant_metrics = report.get("metrics", {}).get("invariants", {})
+ if isinstance(invariant_metrics, dict):
+ guard_metrics["tying_violations_post"] = int(
+ invariant_metrics.get("violations", 0) or 0
+ )
+ else:
+ guard_metrics["tying_violations_post"] = 0

  # Check if rollback occurred (catastrophic spike)
- guard_metrics["catastrophic_spike"] = report.get("flags", {}).get(
- "guard_recovered", False
+ flags = report.get("flags", {}) or {}
+ meta = report.get("meta", {}) or {}
+ guard_metrics["catastrophic_spike"] = bool(
+ (flags.get("guard_recovered") if isinstance(flags, dict) else False)
+ or (meta.get("guard_recovered") if isinstance(meta, dict) else False)
+ or (meta.get("rollback_reason") if isinstance(meta, dict) else False)
  )

  return guard_metrics
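The rewritten extract_guard_metrics above changes the lookup order: structured entries under report["guards"] take precedence, and the legacy report["metrics"]["rmt"] / report["metrics"]["invariants"] dicts are only consulted when no structured entry supplied a value. A minimal standalone sketch of that precedence (the helper and the toy report below are illustrative, not part of the package):

    # Illustrative reimplementation of the fallback order in the hunk above;
    # the toy `report` dict is made up for demonstration.
    def rmt_outliers(report: dict) -> int:
        for guard in report.get("guards", []) or []:
            if isinstance(guard, dict) and str(guard.get("name", "")).lower() == "rmt":
                metrics = guard.get("metrics") or {}
                for key in ("outliers_total", "rmt_outliers", "layers_flagged"):
                    if isinstance(metrics.get(key), (int, float)):
                        return int(metrics[key])  # structured guard report wins
        rmt = (report.get("metrics") or {}).get("rmt") or {}  # legacy fallback
        return int(rmt.get("outliers", 0) or 0) if isinstance(rmt, dict) else 0

    report = {"guards": [{"name": "rmt", "metrics": {"outliers_total": 3}}],
              "metrics": {"rmt": {"outliers": 7}}}
    print(rmt_outliers(report))  # -> 3, the structured entry is preferred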
@@ -342,6 +393,8 @@ class MetricsAggregator:
  "latency_guarded": guarded_metrics.get(
  "latency_ms_per_tok", float("nan")
  ),
+ "duration_bare_s": bare_metrics.get("duration_s", float("nan")),
+ "duration_guarded_s": guarded_metrics.get("duration_s", float("nan")),
  "mem_bare": bare_metrics.get("memory_mb_peak", float("nan")),
  "mem_guarded": guarded_metrics.get("memory_mb_peak", float("nan")),
  }
@@ -355,17 +408,30 @@ class MetricsAggregator:
  else:
  comparison["primary_metric_overhead"] = float("nan")

- latency_bare = comparison["latency_bare"]
- latency_guarded = comparison["latency_guarded"]
+ # Prefer end-to-end pipeline duration when available; fall back to per-token latency
+ duration_bare = comparison.get("duration_bare_s", float("nan"))
+ duration_guarded = comparison.get("duration_guarded_s", float("nan"))
  if (
- not (math.isnan(latency_bare) or math.isnan(latency_guarded))
- and latency_bare > 0
+ isinstance(duration_bare, int | float)
+ and isinstance(duration_guarded, int | float)
+ and not (math.isnan(duration_bare) or math.isnan(duration_guarded))
+ and float(duration_bare) > 0
  ):
  comparison["guard_overhead_time"] = (
- latency_guarded - latency_bare
- ) / latency_bare
+ float(duration_guarded) - float(duration_bare)
+ ) / float(duration_bare)
  else:
- comparison["guard_overhead_time"] = float("nan")
+ latency_bare = comparison["latency_bare"]
+ latency_guarded = comparison["latency_guarded"]
+ if (
+ not (math.isnan(latency_bare) or math.isnan(latency_guarded))
+ and latency_bare > 0
+ ):
+ comparison["guard_overhead_time"] = (
+ latency_guarded - latency_bare
+ ) / latency_bare
+ else:
+ comparison["guard_overhead_time"] = float("nan")

  mem_bare = comparison["mem_bare"]
  mem_guarded = comparison["mem_guarded"]
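The comparison hunk above now derives guard_overhead_time from end-to-end run durations when both are present and positive, and only falls back to per-token latency otherwise. A standalone sketch of that decision rule with made-up numbers (not the packaged MetricsAggregator):

    import math

    # Illustration of the duration-first overhead rule shown in the hunk above.
    def guard_overhead_time(dur_bare, dur_guarded, lat_bare, lat_guarded):
        if (isinstance(dur_bare, (int, float)) and isinstance(dur_guarded, (int, float))
                and not (math.isnan(dur_bare) or math.isnan(dur_guarded)) and dur_bare > 0):
            return (dur_guarded - dur_bare) / dur_bare      # prefer pipeline duration
        if not (math.isnan(lat_bare) or math.isnan(lat_guarded)) and lat_bare > 0:
            return (lat_guarded - lat_bare) / lat_bare      # fall back to latency
        return float("nan")

    print(guard_overhead_time(10.0, 10.8, 12.0, 13.0))          # 0.08 (durations used)
    print(guard_overhead_time(float("nan"), 10.8, 12.0, 13.0))  # ~0.083 (latency fallback)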
@@ -506,141 +572,333 @@ def execute_single_run(
  scenario: ScenarioConfig,
  run_type: str,
  output_dir: Path,
+ *,
+ runtime: dict[str, Any] | None = None,
  ) -> RunResult:
  """Execute a single benchmark run and return results."""
  try:
- # For now, create a mock run since we don't have the full pipeline
- # In real implementation, this would call the actual InvarLock pipeline
+ # Deferred imports: heavy deps only when executing real pipeline
+ from invarlock.core.api import RunConfig as _RunConfig
+ from invarlock.core.auto_tuning import get_tier_policies as _get_tier_policies
+ from invarlock.core.registry import get_registry as _get_registry
+ from invarlock.core.runner import CoreRunner as _CoreRunner
+ from invarlock.eval.data import get_provider as _get_provider
+ from invarlock.guards.rmt import capture_baseline_mp_stats as _capture_mp_stats
+ from invarlock.guards.rmt import rmt_detect as _rmt_detect
+ from invarlock.model_profile import detect_model_profile as _detect_profile
+
+ def _ensure_dir(path: Path) -> None:
+ path.mkdir(parents=True, exist_ok=True)
+
+ def _write_json(path: Path, payload: dict[str, Any]) -> None:
+ path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+
+ if runtime is None:
+ runtime = {}
+
+ # Resolve shared runtime resources (tokenizer/windows/model snapshot) when absent.
+ adapter = runtime.get("adapter")
+ model = runtime.get("model")
+ baseline_snapshot = runtime.get("baseline_snapshot")
+ pairing_schedule = runtime.get("pairing_schedule")
+ calibration_data = runtime.get("calibration_data")
+ tokenizer_hash = runtime.get("tokenizer_hash")
+ split = runtime.get("split", "validation")
+ dataset_name = runtime.get("dataset_name")
+
+ if not isinstance(dataset_name, str) or not dataset_name:
+ dataset_name = str(
+ run_config.get("dataset", {}).get("provider", "wikitext2")
+ )

- # Create a mock RunReport with realistic values
- report = create_empty_report()
+ # Tokenizer + pairing schedule
+ if not (
+ isinstance(pairing_schedule, dict) and isinstance(calibration_data, list)
+ ):
+ profile = _detect_profile(scenario.model_id, adapter=scenario.adapter)
+ tokenizer, tokenizer_hash = profile.make_tokenizer()
+ provider_kwargs: dict[str, Any] = {}
+ if scenario.device != "auto" and dataset_name == "wikitext2":
+ provider_kwargs["device_hint"] = str(scenario.device)
+ provider = _get_provider(dataset_name, **provider_kwargs)
+ preview_window, final_window = provider.windows(
+ tokenizer=tokenizer,
+ seq_len=scenario.seq_len,
+ stride=scenario.stride,
+ preview_n=scenario.preview_n or 0,
+ final_n=scenario.final_n or 0,
+ seed=scenario.seed,
+ split=split,
+ )
+ prev_ids = list(range(len(preview_window.input_ids)))
+ fin_ids = list(
+ range(
+ len(preview_window.input_ids),
+ len(preview_window.input_ids) + len(final_window.input_ids),
+ )
+ )
+ pairing_schedule = {
+ "preview": {
+ "window_ids": prev_ids,
+ "input_ids": preview_window.input_ids,
+ "attention_masks": preview_window.attention_masks,
+ },
+ "final": {
+ "window_ids": fin_ids,
+ "input_ids": final_window.input_ids,
+ "attention_masks": final_window.attention_masks,
+ },
+ }
+ calibration_data = []
+ for idx, (input_ids, attention_mask) in enumerate(
+ zip(
+ preview_window.input_ids,
+ preview_window.attention_masks,
+ strict=False,
+ )
+ ):
+ calibration_data.append(
+ {
+ "input_ids": input_ids,
+ "attention_mask": attention_mask,
+ "window_id": f"preview::{idx}",
+ }
+ )
+ for idx, (input_ids, attention_mask) in enumerate(
+ zip(final_window.input_ids, final_window.attention_masks, strict=False)
+ ):
+ calibration_data.append(
+ {
+ "input_ids": input_ids,
+ "attention_mask": attention_mask,
+ "window_id": f"final::{idx}",
+ }
+ )
+ runtime["pairing_schedule"] = pairing_schedule
+ runtime["calibration_data"] = calibration_data
+ runtime["tokenizer_hash"] = tokenizer_hash
+ runtime["split"] = split
+ runtime["dataset_name"] = dataset_name
+
+ # Adapter/model snapshot
+ if adapter is None or model is None or baseline_snapshot is None:
+ registry = _get_registry()
+ adapter = registry.get_adapter(scenario.adapter)
+ model = adapter.load_model(scenario.model_id, device=scenario.device)
+ baseline_snapshot = adapter.snapshot(model)
+ runtime["adapter"] = adapter
+ runtime["model"] = model
+ runtime["baseline_snapshot"] = baseline_snapshot
+
+ # Baseline RMT stats (used to compute comparable outlier counts for bare vs guarded)
+ rmt_baseline_mp_stats = runtime.get("rmt_baseline_mp_stats")
+ rmt_baseline_sigmas = runtime.get("rmt_baseline_sigmas")
+ if not isinstance(rmt_baseline_mp_stats, dict) or not isinstance(
+ rmt_baseline_sigmas, dict
+ ):
+ adapter.restore(model, baseline_snapshot)
+ rmt_baseline_mp_stats = _capture_mp_stats(model)
+ rmt_baseline_sigmas = {
+ name: float(stats.get("sigma_base", 0.0) or 0.0)
+ for name, stats in rmt_baseline_mp_stats.items()
+ if isinstance(stats, dict)
+ }
+ runtime["rmt_baseline_mp_stats"] = rmt_baseline_mp_stats
+ runtime["rmt_baseline_sigmas"] = rmt_baseline_sigmas

- # Fill in metadata
- report["meta"]["model_id"] = run_config["model"]["id"]
- report["meta"]["adapter"] = run_config["model"]["adapter"]
- report["meta"]["device"] = run_config["model"]["device"]
- report["meta"]["ts"] = datetime.now().isoformat()
- report["meta"]["seed"] = run_config["dataset"]["seed"]
-
- # Fill in dataset config
- report["data"]["dataset"] = run_config["dataset"]["provider"]
- report["data"]["seq_len"] = run_config["dataset"]["seq_len"]
- report["data"]["stride"] = run_config["dataset"]["stride"]
- report["data"]["preview_n"] = run_config["dataset"]["preview_n"]
- report["data"]["final_n"] = run_config["dataset"]["final_n"]
-
- # Fill in edit info
- report["edit"]["name"] = scenario.edit
- report["edit"]["plan_digest"] = (
- f"mock_digest_{scenario.edit}_{scenario.tier}_{scenario.probes}"
+ tier_policies = _get_tier_policies()
+ tier_policy = tier_policies.get(
+ scenario.tier, tier_policies.get("balanced", {})
  )
+ rmt_policy = tier_policy.get("rmt", {}) if isinstance(tier_policy, dict) else {}
+ rmt_margin = float(rmt_policy.get("margin", 1.5) or 1.5)
+ rmt_deadband = float(rmt_policy.get("deadband", 0.10) or 0.10)
+
+ # Restore baseline model for this run
+ adapter.restore(model, baseline_snapshot)
+
+ run_dir = output_dir / run_type
+ _ensure_dir(run_dir)
+ event_path = run_dir / "events.jsonl"
+
+ # Core objects
+ registry = _get_registry()
+ edit_op = registry.get_edit(scenario.edit)
+
+ guards: list[Any] = []
+ auto_config = None
+ if run_type == "guarded":
+ for guard_name in ("invariants", "spectral", "rmt", "variance"):
+ try:
+ guards.append(registry.get_guard(guard_name))
+ except Exception:
+ continue
+ auto_config = {
+ "tier": scenario.tier,
+ "probes": scenario.probes,
+ "enabled": True,
+ }

- # Mock realistic metrics based on run type and scenario
- if run_type == "bare":
- # Bare runs: no guard overhead, potentially higher PM (ppl-like)
- base_ppl = 45.0 + (hash(f"{scenario.edit}_{scenario.tier}") % 100) / 100.0
- report["metrics"]["primary_metric"] = {
- "kind": "perplexity",
- "preview": base_ppl,
- "final": base_ppl + 1.0,
+ # Wire run context for pairing verification
+ run_context = {
+ "profile": scenario.profile,
+ "dataset": {"provider": dataset_name, "seed": scenario.seed},
+ "pairing_baseline": pairing_schedule,
+ "eval": {"loss": {"resolved_type": "causal"}},
+ "run_id": f"{scenario.edit}-{scenario.tier}-p{scenario.probes}-{run_type}",
+ }
+
+ spike_threshold = float(
+ run_config.get("eval", {}).get("spike_threshold", 2.0) or 2.0
+ )
+ cfg = _RunConfig(
+ device=scenario.device,
+ max_pm_ratio=spike_threshold,
+ spike_threshold=spike_threshold,
+ event_path=event_path,
+ context=run_context,
+ )
+
+ runner = _CoreRunner()
+ core_report = runner.execute(
+ model=model,
+ adapter=adapter,
+ edit=edit_op,
+ guards=guards,
+ config=cfg,
+ calibration_data=calibration_data,
+ auto_config=auto_config,
+ edit_config=run_config.get("edit", {}).get("plan", {}),
+ preview_n=scenario.preview_n,
+ final_n=scenario.final_n,
+ )
+
+ # Convert to evaluation RunReport (dict) for downstream tooling
+ report = create_empty_report()
+ report["meta"].update(
+ {
+ "model_id": scenario.model_id,
+ "adapter": scenario.adapter,
+ "device": str(scenario.device),
+ "commit": "",
+ "seed": scenario.seed,
+ "ts": datetime.now().isoformat(),
  }
- report["metrics"]["latency_ms_per_tok"] = (
- 12.0 + (hash(scenario.tier) % 20) / 10.0
- )
- report["metrics"]["memory_mb_peak"] = 2000.0 + (
- hash(str(scenario.probes)) % 200
- )
- report["metrics"]["rmt"] = {"outliers": 2 + (hash(scenario.edit) % 3)}
- report["metrics"]["invariants"] = {"violations": 0}
- else:
- # Guarded runs: guard overhead, better stability, varies by tier
- tier_factor = {"conservative": 0.95, "balanced": 0.97, "aggressive": 0.99}[
- scenario.tier
- ]
- probe_factor = 1.0 - (
- scenario.probes * 0.01
- ) # Small improvement with probes
-
- base_ppl = 45.0 + (hash(f"{scenario.edit}_{scenario.tier}") % 100) / 100.0
- report["metrics"]["primary_metric"] = {
- "kind": "perplexity",
- "preview": base_ppl * tier_factor,
- "final": base_ppl * tier_factor * probe_factor,
+ )
+ if tokenizer_hash:
+ report["meta"]["tokenizer_hash"] = tokenizer_hash
+ dur = core_report.meta.get("duration") if hasattr(core_report, "meta") else None
+ if isinstance(dur, int | float):
+ report["meta"]["duration_s"] = float(dur)
+
+ report["data"].update(
+ {
+ "dataset": dataset_name,
+ "split": split,
+ "seq_len": scenario.seq_len,
+ "stride": scenario.stride,
+ "preview_n": int(scenario.preview_n or 0),
+ "final_n": int(scenario.final_n or 0),
  }
+ )

- # Guard overhead varies by tier
- time_overhead = {
- "conservative": 0.12,
- "balanced": 0.08,
- "aggressive": 0.05,
- }[scenario.tier]
- mem_overhead = {"conservative": 0.08, "balanced": 0.06, "aggressive": 0.04}[
- scenario.tier
- ]
-
- report["metrics"]["latency_ms_per_tok"] = (
- 12.0 + (hash(scenario.tier) % 20) / 10.0
- ) * (1 + time_overhead)
- report["metrics"]["memory_mb_peak"] = (
- 2000.0 + (hash(str(scenario.probes)) % 200)
- ) * (1 + mem_overhead)
- report["metrics"]["rmt"] = {
- "outliers": max(
- 0,
- 2
- + (hash(scenario.edit) % 3)
- - (1 if scenario.tier == "conservative" else 0),
- )
+ edit_meta = core_report.edit if hasattr(core_report, "edit") else {}
+ plan_digest = ""
+ try:
+ if isinstance(edit_meta, dict):
+ plan_digest = str(edit_meta.get("plan_digest", ""))
+ except Exception:
+ plan_digest = ""
+ report["edit"].update(
+ {
+ "name": scenario.edit,
+ "plan_digest": plan_digest,
+ "deltas": (
+ edit_meta.get("deltas", report["edit"]["deltas"])
+ if isinstance(edit_meta, dict)
+ else report["edit"]["deltas"]
+ ),
  }
- report["metrics"]["invariants"] = {"violations": 0}
+ )

- # Mock guard reports for guarded runs
- report["guards"] = [
- {
- "name": "invariants",
- "policy": {"mode": "enforce"},
- "metrics": {"checks": 5, "violations": 0},
- "actions": ["validated"],
- "violations": [],
- },
- {
- "name": "spectral",
- "policy": {
- "sigma_quantile": tier_factor,
- "scope": "ffn",
- "deadband": 0.10,
- },
- "metrics": {
- "max_sigma": 1.2,
- "corrections": 1 if scenario.tier == "conservative" else 0,
- },
- "actions": ["monitored"],
- "violations": [],
- },
- {
- "name": "rmt",
- "policy": {
- "deadband": 0.05 if scenario.tier == "conservative" else 0.10,
- "margin": 1.5,
- },
- "metrics": {
- "outliers": report["metrics"]["rmt"]["outliers"],
- "mp_fit": 0.95,
- },
- "actions": ["validated"],
- "violations": [],
- },
- ]
+ # Transfer metrics
+ if hasattr(core_report, "metrics") and isinstance(core_report.metrics, dict):
+ report["metrics"].update(core_report.metrics)
+
+ if hasattr(core_report, "evaluation_windows") and isinstance(
+ core_report.evaluation_windows, dict
+ ):
+ report["evaluation_windows"] = core_report.evaluation_windows
+
+ # Transfer guards
+ if hasattr(core_report, "guards") and isinstance(core_report.guards, dict):
+ for name, guard_result in core_report.guards.items():
+ if not isinstance(guard_result, dict):
+ continue
+ report["guards"].append(
+ {
+ "name": name,
+ "passed": guard_result.get("passed"),
+ "action": guard_result.get("action"),
+ "policy": guard_result.get("policy", {}),
+ "metrics": guard_result.get("metrics", {}),
+ "actions": guard_result.get("actions", []),
+ "violations": guard_result.get("violations", []),
+ "warnings": guard_result.get("warnings", []),
+ "errors": guard_result.get("errors", []),
+ "details": guard_result.get("details", {}),
+ }
+ )

- # Mock artifacts
- report["artifacts"]["events_path"] = (
- f"mock_events_{scenario.edit}_{scenario.tier}_{scenario.probes}_{run_type}.jsonl"
+ # Compute comparable RMT outliers for both bare and guarded models
+ try:
+ detection = _rmt_detect(
+ model=model,
+ threshold=rmt_margin,
+ detect_only=True,
+ baseline_sigmas=rmt_baseline_sigmas,
+ baseline_mp_stats=rmt_baseline_mp_stats,
+ deadband=rmt_deadband,
+ )
+ report["metrics"].setdefault("rmt", {})
+ if isinstance(report["metrics"].get("rmt"), dict):
+ report["metrics"]["rmt"]["outliers"] = int(
+ detection.get("n_layers_flagged", 0) or 0
+ )
+ except Exception:
+ pass
+
+ # Flags and artifacts
+ status = getattr(core_report, "status", "")
+ rollback_reason = (
+ core_report.meta.get("rollback_reason")
+ if hasattr(core_report, "meta") and isinstance(core_report.meta, dict)
+ else None
  )
- report["artifacts"]["logs_path"] = (
- f"mock_logs_{scenario.edit}_{scenario.tier}_{scenario.probes}_{run_type}.txt"
+ report["flags"].update(
+ {
+ "guard_recovered": bool(
+ (
+ hasattr(core_report, "meta")
+ and core_report.meta.get("guard_recovered")
+ )
+ or str(status).lower() == "rollback"
+ ),
+ "rollback_reason": rollback_reason,
+ }
  )
+ report["artifacts"].update(
+ {
+ "events_path": str(event_path),
+ "logs_path": "",
+ "checkpoint_path": None,
+ "report_path": str(run_dir / "report.json"),
+ }
+ )
+ _write_json(Path(report["artifacts"]["report_path"]), report)

- return RunResult(run_type=run_type, report=report, success=True)
+ success = str(status).lower() != "failed"
+ return RunResult(run_type=run_type, report=report, success=success)

  except Exception as e:
  logger.error(f"Run failed for {scenario.edit} ({run_type}): {e}")
@@ -671,21 +929,70 @@ def execute_scenario(
  config_manager = ConfigurationManager()
  metrics_aggregator = MetricsAggregator()

+ # Scenario-scoped artifact directory
+ scenario_slug = f"{scenario.edit}__{scenario.tier}__p{scenario.probes}"
+ scenario_dir = output_dir / "scenarios" / scenario_slug
+ scenario_dir.mkdir(parents=True, exist_ok=True)
+
+ runtime: dict[str, Any] = {"dataset_name": config.dataset}
+
  # Run bare configuration
  logger.debug(f"Running bare configuration for {scenario.edit}")
  bare_config = config_manager.create_bare_config(scenario)
- bare_result = execute_single_run(bare_config, scenario, "bare", output_dir)
+ try:
+ bare_config.setdefault("dataset", {})["provider"] = config.dataset
+ except Exception:
+ pass
+ bare_result = execute_single_run(
+ bare_config, scenario, "bare", scenario_dir, runtime=runtime
+ )

  # Run guarded configuration
  logger.debug(f"Running guarded configuration for {scenario.edit}")
  guarded_config = config_manager.create_guarded_config(scenario)
- guarded_result = execute_single_run(guarded_config, scenario, "guarded", output_dir)
-
- # Compute comparison metrics
- comparison_metrics = metrics_aggregator.compute_comparison_metrics(
- bare_result, guarded_result
+ try:
+ guarded_config.setdefault("dataset", {})["provider"] = config.dataset
+ except Exception:
+ pass
+ guarded_result = execute_single_run(
+ guarded_config, scenario, "guarded", scenario_dir, runtime=runtime
  )

+ artifacts: dict[str, Any] = {"scenario_dir": str(scenario_dir)}
+ pairing_schedule = runtime.get("pairing_schedule")
+ if isinstance(pairing_schedule, dict):
+ pairing_path = scenario_dir / "pairing_schedule.json"
+ pairing_path.write_text(
+ json.dumps(pairing_schedule, indent=2), encoding="utf-8"
+ )
+ artifacts["pairing_schedule"] = str(pairing_path)
+ try:
+ if bare_result and bare_result.report:
+ artifacts["bare_report"] = bare_result.report.get("artifacts", {}).get(
+ "report_path"
+ )
+ except Exception:
+ pass
+ try:
+ if guarded_result and guarded_result.report:
+ artifacts["guarded_report"] = guarded_result.report.get(
+ "artifacts", {}
+ ).get("report_path")
+ except Exception:
+ pass
+
+ # Generate certificate artifact when both runs produced reports
+ try:
+ if bare_result.success and guarded_result.success:
+ from invarlock.reporting.certificate import make_certificate
+
+ cert = make_certificate(guarded_result.report, bare_result.report)
+ cert_path = scenario_dir / "certificate.json"
+ cert_path.write_text(json.dumps(cert, indent=2), encoding="utf-8")
+ artifacts["certificate"] = str(cert_path)
+ except Exception as exc:
+ logger.warning(f"Certificate generation failed for {scenario_slug}: {exc}")
+
  # Resolve epsilon from runtime or use config
  epsilon_used = config.epsilon
  if epsilon_used is None and guarded_result.success:
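Assuming the paths assembled in the hunk above (scenario_dir, the per-run run_dir, and the pairing/certificate writes), each scenario would lay its artifacts out roughly like this; the layout is inferred from the diff, not documented by the package:

    <output_dir>/scenarios/<edit>__<tier>__p<probes>/
        pairing_schedule.json      # written when a pairing schedule was resolved
        certificate.json           # only when both paired runs succeed
        bare/
            events.jsonl
            report.json
        guarded/
            events.jsonl
            report.json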
@@ -693,8 +1000,24 @@ def execute_scenario(
  elif epsilon_used is None:
  epsilon_used = 0.10 # Default fallback

- # Validate gates
- gates = ValidationGates.validate_all_gates(comparison_metrics, config, epsilon_used)
+ # Compute comparison metrics and validate gates.
+ comparison_metrics = metrics_aggregator.compute_comparison_metrics(
+ bare_result, guarded_result
+ )
+ if not (bare_result.success and guarded_result.success):
+ # Treat execution failures as a hard FAIL: benchmarks are only meaningful
+ # when both paired runs complete.
+ comparison_metrics = {
+ "error_bare": bare_result.error_message,
+ "error_guarded": guarded_result.error_message,
+ }
+ gates = dict.fromkeys(
+ ("spike", "tying", "rmt", "quality", "time", "mem"), False
+ )
+ else:
+ gates = ValidationGates.validate_all_gates(
+ comparison_metrics, config, epsilon_used
+ )

  # Mock probes_used based on scenario.probes (in real implementation, this would come from auto-tuner)
  probes_used = min(
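With the reordered hunk above, gate validation is fail-closed: if either paired run did not complete, the comparison dict is replaced by the two error messages and every gate is reported False instead of calling ValidationGates. A toy illustration of that branch (Result is a stand-in dataclass, not the package's RunResult):

    from dataclasses import dataclass

    @dataclass
    class Result:                      # stand-in for illustration only
        success: bool
        error_message: str | None = None

    def gates_for(bare: Result, guarded: Result) -> dict[str, bool]:
        if not (bare.success and guarded.success):
            # fail closed: the benchmark only means something when both runs complete
            return dict.fromkeys(("spike", "tying", "rmt", "quality", "time", "mem"), False)
        return {}  # the real code calls ValidationGates.validate_all_gates(...) here

    print(gates_for(Result(True), Result(False, "guarded run crashed")))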
@@ -705,6 +1028,7 @@ def execute_scenario(
  config=scenario,
  bare_result=bare_result,
  guarded_result=guarded_result,
+ artifacts=artifacts,
  metrics=comparison_metrics,
  gates=gates,
  probes_used=probes_used,
@@ -843,6 +1167,7 @@ def _summary_to_step14_json(summary: BenchmarkSummary) -> dict[str, Any]:
  "probes_used": result.probes_used,
  "skip": result.skipped,
  "skip_reason": result.skip_reason,
+ "artifacts": result.artifacts,
  }

  if not result.skipped and result.metrics:
@@ -1033,6 +1358,7 @@ def _scenario_result_to_dict(result: ScenarioResult) -> dict[str, Any]:
  "probes_used": result.probes_used,
  "skipped": result.skipped,
  "skip_reason": result.skip_reason,
+ "artifacts": result.artifacts,
  "metrics": result.metrics,
  "gates": result.gates,
  "epsilon_used": result.epsilon_used,