invarlock 0.3.5-py3-none-any.whl → 0.3.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +2 -2
- invarlock/_data/runtime/tiers.yaml +57 -30
- invarlock/adapters/__init__.py +11 -15
- invarlock/adapters/auto.py +35 -40
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_mixin.py +25 -4
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/calibration/spectral_null.py +15 -10
- invarlock/calibration/variance_ve.py +0 -2
- invarlock/cli/adapter_auto.py +31 -21
- invarlock/cli/app.py +73 -2
- invarlock/cli/commands/calibrate.py +6 -2
- invarlock/cli/commands/certify.py +651 -91
- invarlock/cli/commands/doctor.py +11 -11
- invarlock/cli/commands/explain_gates.py +57 -8
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +233 -69
- invarlock/cli/commands/run.py +1066 -244
- invarlock/cli/commands/verify.py +154 -15
- invarlock/cli/config.py +22 -6
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +1 -1
- invarlock/core/api.py +45 -5
- invarlock/core/auto_tuning.py +65 -20
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/contracts.py +7 -1
- invarlock/core/registry.py +11 -13
- invarlock/core/runner.py +425 -75
- invarlock/edits/quant_rtn.py +65 -37
- invarlock/eval/bench.py +3 -16
- invarlock/eval/data.py +82 -51
- invarlock/eval/metrics.py +63 -2
- invarlock/eval/primary_metric.py +23 -0
- invarlock/eval/tail_stats.py +230 -0
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/_estimators.py +154 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/policies.py +16 -6
- invarlock/guards/rmt.py +627 -546
- invarlock/guards/spectral.py +348 -110
- invarlock/guards/tier_config.py +32 -30
- invarlock/guards/variance.py +7 -31
- invarlock/guards_ref/rmt_ref.py +23 -23
- invarlock/model_profile.py +90 -42
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/reporting/certificate.py +384 -55
- invarlock/reporting/certificate_schema.py +3 -2
- invarlock/reporting/dataset_hashing.py +15 -2
- invarlock/reporting/guards_analysis.py +350 -277
- invarlock/reporting/html.py +55 -5
- invarlock/reporting/normalizer.py +13 -0
- invarlock/reporting/policy_utils.py +38 -36
- invarlock/reporting/primary_metric_utils.py +71 -17
- invarlock/reporting/render.py +852 -431
- invarlock/reporting/report.py +40 -4
- invarlock/reporting/report_types.py +11 -3
- invarlock/reporting/telemetry.py +86 -0
- invarlock/reporting/validate.py +1 -18
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/METADATA +27 -13
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/RECORD +72 -65
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
invarlock/core/runner.py
CHANGED
@@ -18,7 +18,23 @@ from typing import Any
 
 import numpy as np
 
-from .…
+from invarlock.eval.tail_stats import evaluate_metric_tail
+from invarlock.observability.metrics import (
+    capture_memory_snapshot,
+    reset_peak_memory_stats,
+    summarize_memory_snapshots,
+)
+
+from .api import (
+    EditLike,
+    Guard,
+    GuardWithContext,
+    GuardWithPrepare,
+    ModelAdapter,
+    ModelEdit,
+    RunConfig,
+    RunReport,
+)
 from .auto_tuning import resolve_tier_policies
 from .bootstrap import (
     compute_logloss_ci,
@@ -112,7 +128,7 @@ class CoreRunner:
         self,
         model: Any,
         adapter: ModelAdapter,
-        edit: ModelEdit,
+        edit: ModelEdit | EditLike,
         guards: list[Guard],
         config: RunConfig,
         calibration_data: Any = None,
@@ -175,10 +191,22 @@ class CoreRunner:
         config.context["auto"] = dict(auto_config)
         try:
             report.context["auto"] = config.context["auto"]
-        except Exception:
+        except Exception:  # pragma: no cover - defensive context propagation
             pass
 
         report.status = RunStatus.RUNNING.value
+        timings: dict[str, float] = {}
+        guard_timings: dict[str, float] = {}
+        memory_snapshots: list[dict[str, Any]] = []
+        total_start = time.perf_counter()
+
+        def _record_timing(key: str, start: float) -> None:
+            timings[key] = max(0.0, float(time.perf_counter() - start))
+
+        def _capture_memory(phase: str) -> None:
+            snapshot = capture_memory_snapshot(phase)
+            if snapshot:
+                memory_snapshots.append(snapshot)
 
         try:
             # Log start
@@ -194,40 +222,78 @@ class CoreRunner:
             )
 
             # Phase 1: Prepare (describe model, create checkpoint)
-            …
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                model_desc = self._prepare_phase(model, adapter, report)
+            finally:
+                _record_timing("prepare", phase_start)
+                _capture_memory("prepare")
 
             # Phase 2: Prepare guards (must happen before edit)
-            … (9 lines)
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                self._prepare_guards_phase(
+                    model,
+                    adapter,
+                    guards,
+                    calibration_data,
+                    report,
+                    auto_config,
+                    config,
+                )
+            finally:
+                _record_timing("prepare_guards", phase_start)
+                _capture_memory("prepare_guards")
 
             # Phase 3: Apply edit
-            …
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                self._edit_phase(model, adapter, edit, model_desc, report, edit_config)
+            finally:
+                _record_timing("edit", phase_start)
+                _capture_memory("edit")
 
             # Phase 4: Run guards
-            …
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                guard_results = self._guard_phase(
+                    model, adapter, guards, report, guard_timings=guard_timings
+                )
+            finally:
+                _record_timing("guards", phase_start)
+                _capture_memory("guards")
 
             # Phase 5: Evaluate final metrics
-            … (9 lines)
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                metrics = self._eval_phase(
+                    model,
+                    adapter,
+                    calibration_data,
+                    report,
+                    preview_n,
+                    final_n,
+                    config,
+                )
+            finally:
+                _record_timing("eval", phase_start)
+                _capture_memory("eval")
 
             # Phase 6: Finalize or rollback
-            … (3 lines)
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                final_status = self._finalize_phase(
+                    model, adapter, guard_results, metrics, config, report
+                )
+            finally:
+                _record_timing("finalize", phase_start)
+                _capture_memory("finalize")
 
             report.status = final_status
             report.meta["end_time"] = time.time()
@@ -249,6 +315,25 @@ class CoreRunner:
             return report
 
         finally:
+            _record_timing("total", total_start)
+            if not isinstance(report.metrics, dict):
+                report.metrics = {}
+            if timings:
+                report.metrics.setdefault("timings", {}).update(timings)
+            if guard_timings:
+                report.metrics["guard_timings"] = guard_timings
+            if memory_snapshots:
+                report.metrics["memory_snapshots"] = memory_snapshots
+                summary = summarize_memory_snapshots(memory_snapshots)
+                if summary:
+                    mem_peak = summary.get("memory_mb_peak")
+                    if isinstance(mem_peak, (int | float)):
+                        existing = report.metrics.get("memory_mb_peak")
+                        if isinstance(existing, (int | float)):
+                            summary["memory_mb_peak"] = max(
+                                float(existing), float(mem_peak)
+                            )
+                    report.metrics.update(summary)
             self._active_model = None
             self._active_adapter = None
             self._cleanup_services()
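Note: the finally-block above folds the new instrumentation into report.metrics under keys that appear in this diff ("timings", "guard_timings", "memory_snapshots", "memory_mb_peak"). A rough, self-contained sketch of that bookkeeping pattern, using stand-in helpers rather than invarlock.observability.metrics (the values and the stand-ins are illustrative only):

# Illustrative only: mimics the timing/memory bookkeeping added above.
import time

timings: dict = {}
memory_snapshots: list = []

def record_timing(key: str, start: float) -> None:
    # Clamp to zero so clock oddities never produce negative durations.
    timings[key] = max(0.0, time.perf_counter() - start)

def capture_memory(phase: str) -> None:
    # Stand-in for capture_memory_snapshot(); a real snapshot would report MB.
    memory_snapshots.append({"phase": phase, "memory_mb_peak": 0.0})

start = time.perf_counter()
try:
    sum(range(1_000_000))  # placeholder for a pipeline phase
finally:
    record_timing("eval", start)
    capture_memory("eval")

report_metrics: dict = {}
report_metrics.setdefault("timings", {}).update(timings)
if memory_snapshots:
    report_metrics["memory_snapshots"] = memory_snapshots
print(report_metrics)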
@@ -303,10 +388,10 @@ class CoreRunner:
         self,
         model: Any,
         adapter: ModelAdapter,
-        edit: ModelEdit,
+        edit: ModelEdit | EditLike,
         model_desc: dict[str, Any],
         report: RunReport,
-        edit_config: dict[str, Any] | None
+        edit_config: dict[str, Any] | None,
     ) -> dict[str, Any]:
         """Phase 2: Apply edit operation."""
         edit_label = "baseline" if edit.name == "baseline" else edit.name
@@ -388,7 +473,7 @@ class CoreRunner:
                 {"guard": guard.name, "policy": guard_policy},
             )
 
-            if …
+            if isinstance(guard, GuardWithContext):
                 try:
                     guard.set_run_context(report)
                 except Exception as exc:
@@ -400,7 +485,7 @@ class CoreRunner:
                 )
 
             # Call prepare method if it exists (most guards need this)
-            if …
+            if isinstance(guard, GuardWithPrepare):
                 prepare_result = guard.prepare(
                     model, adapter, calibration_data, guard_policy
                 )
@@ -444,7 +529,13 @@ class CoreRunner:
         )
 
     def _guard_phase(
-        self, …
+        self,
+        model: Any,
+        adapter: ModelAdapter,
+        guards: list[Guard],
+        report: RunReport,
+        *,
+        guard_timings: dict[str, float] | None = None,
     ) -> dict[str, dict[str, Any]]:
         """Phase 4: Run safety guards."""
         self._log_event("guards", "start", LogLevel.INFO, {"count": len(guards)})
@@ -453,8 +544,9 @@ class CoreRunner:
 
         for guard in guards:
             self._log_event("guard", "start", LogLevel.INFO, {"guard": guard.name})
+            guard_start = time.perf_counter()
 
-            if …
+            if isinstance(guard, GuardWithContext):
                 try:
                     guard.set_run_context(report)
                 except Exception as exc:  # pragma: no cover - defensive
@@ -486,6 +578,11 @@ class CoreRunner:
                     LogLevel.ERROR,
                     {"guard": guard.name, "error": str(e)},
                 )
+            finally:
+                if guard_timings is not None:
+                    guard_timings[guard.name] = max(
+                        0.0, float(time.perf_counter() - guard_start)
+                    )
 
         report.guards = guard_results
 
@@ -583,6 +680,116 @@ class CoreRunner:
         }
         eval_windows = {"preview": {}, "final": {}}
 
+        # Optional: compute primary metric tail evidence vs baseline when provided.
+        try:
+            pm = metrics.get("primary_metric", {}) if isinstance(metrics, dict) else {}
+            pm_kind = str(pm.get("kind", "")).lower() if isinstance(pm, dict) else ""
+            is_ppl_metric = pm_kind.startswith("ppl")
+
+            baseline_eval = {}
+            if (
+                is_ppl_metric
+                and config
+                and isinstance(config.context, dict)
+                and isinstance(config.context.get("baseline_eval_windows"), dict)
+            ):
+                baseline_eval = config.context.get("baseline_eval_windows") or {}
+
+            if is_ppl_metric and baseline_eval:
+                tier_policies = (
+                    report.meta.get("tier_policies", {})
+                    if isinstance(getattr(report, "meta", None), dict)
+                    else {}
+                )
+                metrics_policy = (
+                    tier_policies.get("metrics", {})
+                    if isinstance(tier_policies, dict)
+                    else {}
+                )
+                pm_tail_policy = (
+                    metrics_policy.get("pm_tail", {})
+                    if isinstance(metrics_policy, dict)
+                    else {}
+                )
+
+                run_final = (
+                    eval_windows.get("final", {})
+                    if isinstance(eval_windows, dict)
+                    else {}
+                )
+                base_final = (
+                    baseline_eval.get("final", {})
+                    if isinstance(baseline_eval, dict)
+                    else {}
+                )
+
+                deltas: list[float] = []
+                weights: list[float] = []
+                run_ids = (
+                    run_final.get("window_ids") if isinstance(run_final, dict) else None
+                )
+                run_ll = (
+                    run_final.get("logloss") if isinstance(run_final, dict) else None
+                )
+                run_tc = (
+                    run_final.get("token_counts")
+                    if isinstance(run_final, dict)
+                    else None
+                )
+                base_ids = (
+                    base_final.get("window_ids")
+                    if isinstance(base_final, dict)
+                    else None
+                )
+                base_ll = (
+                    base_final.get("logloss") if isinstance(base_final, dict) else None
+                )
+
+                if (
+                    isinstance(run_ids, list)
+                    and isinstance(run_ll, list)
+                    and isinstance(base_ids, list)
+                    and isinstance(base_ll, list)
+                ):
+                    base_map: dict[int, float] = {}
+                    for b_id, b_val in zip(base_ids, base_ll, strict=False):
+                        if isinstance(b_id, int | float) and isinstance(
+                            b_val, int | float
+                        ):
+                            base_map[int(b_id)] = float(b_val)
+                    for idx, (r_id, r_val) in enumerate(
+                        zip(run_ids, run_ll, strict=False)
+                    ):
+                        if not (
+                            isinstance(r_id, int | float)
+                            and isinstance(r_val, int | float)
+                        ):
+                            continue
+                        key = int(r_id)
+                        if key not in base_map:
+                            continue
+                        dv = float(r_val) - base_map[key]
+                        if math.isfinite(dv):
+                            deltas.append(float(dv))
+                            if isinstance(run_tc, list) and idx < len(run_tc):
+                                try:
+                                    wv = float(run_tc[idx])
+                                except Exception:
+                                    wv = 0.0
+                                weights.append(float(max(wv, 0.0)))
+
+                tail_result = evaluate_metric_tail(
+                    deltas=deltas,
+                    weights=weights
+                    if (weights and len(weights) == len(deltas))
+                    else None,
+                    policy=pm_tail_policy if isinstance(pm_tail_policy, dict) else None,
+                )
+                tail_result["source"] = "paired_baseline.final"
+                metrics["primary_metric_tail"] = tail_result
+        except Exception:  # pragma: no cover - best effort
+            pass
+
         policy_flags = self._resolve_policy_flags(config)
         eval_error = metrics.get("eval_error") if isinstance(metrics, dict) else None
         if eval_error:
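Note: the block above pairs run and baseline windows by window_id, takes per-window logloss deltas, and hands them (optionally token-weighted) to evaluate_metric_tail. A minimal sketch of that pairing step with made-up window data; _tail_stub is a stand-in, the real evaluator lives in invarlock/eval/tail_stats.py:

# Illustrative pairing of per-window loglosses against a baseline run.
import math

run_final = {"window_ids": [0, 1, 2], "logloss": [2.31, 2.40, 2.28], "token_counts": [512, 512, 480]}
base_final = {"window_ids": [0, 1, 2], "logloss": [2.30, 2.35, 2.29]}

base_map = {int(i): float(v) for i, v in zip(base_final["window_ids"], base_final["logloss"])}

deltas, weights = [], []
for idx, (w_id, loss) in enumerate(zip(run_final["window_ids"], run_final["logloss"])):
    if int(w_id) not in base_map:
        continue
    dv = float(loss) - base_map[int(w_id)]
    if math.isfinite(dv):
        deltas.append(dv)
        weights.append(max(float(run_final["token_counts"][idx]), 0.0))

def _tail_stub(deltas, weights=None, policy=None):
    # Stand-in: the real evaluate_metric_tail applies the configured pm_tail policy.
    worst = max(deltas) if deltas else 0.0
    return {"evaluated": bool(deltas), "worst_delta": worst, "passed": worst < 0.10}

print(_tail_stub(deltas, weights))  # worst_delta ≈ 0.05 here, so passed is True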
@@ -834,8 +1041,10 @@ class CoreRunner:
         pairing_reason = None
         preview_pair_stats = {"matched": 0, "expected": 0}
         final_pair_stats = {"matched": 0, "expected": 0}
+        paired_windows_attempted = 0
         preview_window_ids: list[int] = []
         final_window_ids: list[int] = []
+
         preview_tokens: list[list[int]] = []
         final_tokens: list[list[int]] = []
         preview_limit = min(preview_n, len(preview_data)) if preview_data else 0
@@ -876,6 +1085,8 @@ class CoreRunner:
         # even if an exception occurs during the main compute block.
         delta_samples: list[float] = []
         delta_weights: list[float] = []
+        pm_invalid = False
+        degraded_reason: str | None = None
 
         try:
 
@@ -891,7 +1102,7 @@ class CoreRunner:
             max_batches: int,
             start_idx: int,
         ) -> dict[str, Any]:
-            nonlocal alignment_logged
+            nonlocal alignment_logged, eval_error
 
             total_tokens_local = 0
             actual_tokens_local = 0
@@ -927,7 +1138,9 @@ class CoreRunner:
             limit = _resolve_limit(batches, max_batches)
 
             for batch in batches[:limit]:
-                if …
+                if (
+                    max_batches > 0 and count >= max_batches
+                ):  # pragma: no cover - slicing already caps iteration
                     break
 
                 labels = None
@@ -1100,7 +1313,7 @@ class CoreRunner:
                         "zero_mask_batches": zero_mask_batches,
                         "requested": limit,
                     },
-                )
+                )  # pragma: no cover - requires debug tracing with zero batches
                 if resolved_loss_mode == "mlm":
                     error_msg = (
                         "MLM evaluation produced zero usable batches; "
@@ -1121,7 +1334,10 @@ class CoreRunner:
                             "zero_mask_batches": zero_mask_batches,
                         },
                     )
-                    …
+                    eval_error = {
+                        "error": "mlm_missing_masks",
+                        "detail": error_msg,
+                    }
                     return {
                         "ppl": float("nan"),
                         "total_tokens": total_tokens_local,
@@ -1167,8 +1383,42 @@ class CoreRunner:
                 final_data, final_limit, preview_summary["num_batches"]
             )
 
-            … (2 lines)
+            preview_raw_losses = preview_summary["log_losses"]
+            final_raw_losses = final_summary["log_losses"]
+            try:
+                paired_windows_attempted = min(
+                    len(preview_raw_losses), len(final_raw_losses)
+                )
+            except Exception:
+                paired_windows_attempted = 0
+
+            preview_log_losses = [
+                float(loss) for loss in preview_raw_losses if math.isfinite(loss)
+            ]
+            final_log_losses = [
+                float(loss) for loss in final_raw_losses if math.isfinite(loss)
+            ]
+            if len(preview_log_losses) != len(preview_raw_losses):
+                self._log_event(
+                    "eval",
+                    "non_finite_preview_losses_filtered",
+                    LogLevel.WARNING,
+                    {
+                        "total": len(preview_raw_losses),
+                        "filtered": len(preview_raw_losses) - len(preview_log_losses),
+                    },
+                )
+            if len(final_log_losses) != len(final_raw_losses):
+                self._log_event(
+                    "eval",
+                    "non_finite_final_losses_filtered",
+                    LogLevel.WARNING,
+                    {
+                        "total": len(final_raw_losses),
+                        "filtered": len(final_raw_losses) - len(final_log_losses),
+                    },
+                )
+
             preview_tokens_ct = preview_summary["total_tokens"]
             final_tokens_ct = final_summary["total_tokens"]
             preview_batches_ct = preview_summary["num_batches"]
@@ -1235,14 +1485,29 @@ class CoreRunner:
             delta_mean_log = final_mean_log - preview_mean_log
             pm_ratio = math.exp(delta_mean_log)
 
-            … (7 lines)
+            pm_invalid = False
+            try:
+                if not (math.isfinite(delta_mean_log) and math.isfinite(pm_ratio)):
+                    raise RuntimeError("non_finite_primary_metric")
+
+                expected_ratio = math.exp(delta_mean_log)
+                if abs(pm_ratio - expected_ratio) > 1e-6:
+                    raise RuntimeError("primary_metric_ratio_mismatch")
+            except Exception as exc:
+                pm_invalid = True
+                self._log_event(
+                    "eval",
+                    "primary_metric_invalid",
+                    LogLevel.WARNING,
+                    {
+                        "pm_preview": float(pm_preview),
+                        "pm_final": float(pm_final),
+                        "delta_mean_log": float(delta_mean_log),
+                        "pm_ratio": float(pm_ratio),
+                        "error": str(exc),
+                    },
                 )
+                # Preserve downstream reporting; keep NaNs but mark degraded
 
             if bootstrap_enabled and preview_log_losses:
                 preview_log_ci = compute_logloss_ci(
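Note: the consistency check above only re-asserts the identity the surrounding code already relies on, namely that for a perplexity-style metric the ratio equals exp(final mean logloss − preview mean logloss). A worked example with assumed numbers:

# Assumed loglosses, just to show the identity being re-checked above.
import math

preview_mean_log = 2.30  # mean logloss over preview windows (assumed)
final_mean_log = 2.35    # mean logloss over final windows (assumed)

delta_mean_log = final_mean_log - preview_mean_log  # ≈ 0.05
pm_ratio = math.exp(delta_mean_log)                 # ≈ 1.0513

# The guard in the diff flags pm_invalid when this drifts past 1e-6.
assert abs(pm_ratio - math.exp(delta_mean_log)) <= 1e-6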
@@ -1298,7 +1563,20 @@ class CoreRunner:
                     abs(r - e) > 1e-6
                     for r, e in zip(ratio_ci, expected_ratio_ci, strict=False)
                 ):
-                    …
+                    pm_invalid = True
+                    self._log_event(
+                        "eval",
+                        "ratio_ci_inconsistent",
+                        LogLevel.WARNING,
+                        {
+                            "ratio_ci": ratio_ci,
+                            "expected_ratio_ci": expected_ratio_ci,
+                        },
+                    )
+                    ratio_ci = (
+                        float(expected_ratio_ci[0]),
+                        float(expected_ratio_ci[1]),
+                    )
             else:
                 delta_log_ci = (delta_mean_log, delta_mean_log)
                 ratio_ci = (pm_ratio, pm_ratio)
@@ -1335,19 +1613,60 @@ class CoreRunner:
                 degenerate_reason = "no_variation"
 
             if degenerate_delta:
+                pm_invalid = True
                 self._log_event(
                     "eval",
                     "degenerate_delta_samples",
-                    LogLevel.…
+                    LogLevel.WARNING,
                     {
                         "reason": degenerate_reason,
                         "sample_count": len(delta_samples),
                     },
                 )
-            … (3 lines)
+
+            needs_pm_fallback = (not math.isfinite(pm_preview)) or (
+                not math.isfinite(pm_final)
+            )
+            needs_delta_fallback = (not math.isfinite(delta_mean_log)) or (
+                not math.isfinite(pm_ratio)
+            )
+
+            degraded_reason: str | None = None
+            if needs_pm_fallback:
+                degraded_reason = "non_finite_pm"
+            elif needs_delta_fallback:
+                degraded_reason = "non_finite_delta"
+            elif degenerate_reason:
+                degraded_reason = f"degenerate_delta:{degenerate_reason}"
+            elif pm_invalid:
+                degraded_reason = "primary_metric_invalid"
+
+            if needs_pm_fallback or needs_delta_fallback:
+                pm_invalid = True
+                pm_fallback = (
+                    pm_preview
+                    if math.isfinite(pm_preview) and pm_preview > 0
+                    else pm_final
+                )
+                if not (math.isfinite(pm_fallback) and pm_fallback > 0):
+                    pm_fallback = 1.0
+
+                if needs_pm_fallback:
+                    pm_preview = (
+                        pm_preview
+                        if math.isfinite(pm_preview) and pm_preview > 0
+                        else pm_fallback
                     )
+                    pm_final = (
+                        pm_final
+                        if math.isfinite(pm_final) and pm_final > 0
+                        else pm_fallback
+                    )
+                if needs_delta_fallback:
+                    if not math.isfinite(delta_mean_log):
+                        delta_mean_log = 0.0
+                    if not math.isfinite(pm_ratio):
+                        pm_ratio = 1.0
 
         def _hash_tokens(tokens: list[int]) -> bytes:
             if not tokens:
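Note: the fallback block above assigns degraded_reason by a first-match precedence. A compact restatement with placeholder values; only the reason strings come from the diff:

# Precedence of degraded_reason as ordered in the diff (first match wins).
import math

def degraded_reason_for(pm_preview, pm_final, delta_mean_log, pm_ratio,
                        degenerate_reason=None, pm_invalid=False):
    if not (math.isfinite(pm_preview) and math.isfinite(pm_final)):
        return "non_finite_pm"
    if not (math.isfinite(delta_mean_log) and math.isfinite(pm_ratio)):
        return "non_finite_delta"
    if degenerate_reason:
        return f"degenerate_delta:{degenerate_reason}"
    if pm_invalid:
        return "primary_metric_invalid"
    return None

print(degraded_reason_for(float("nan"), 12.1, 0.02, 1.02))  # non_finite_pm
print(degraded_reason_for(11.9, 12.1, 0.02, 1.02, degenerate_reason="no_variation"))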
@@ -1371,10 +1690,14 @@ class CoreRunner:
         if not isinstance(dataset_cfg, dict):
             return None
         seq_len_val = dataset_cfg.get("seq_len")
-        …
+        if seq_len_val is None:
+            return None
+        stride_raw = dataset_cfg.get("stride", seq_len_val)
+        if stride_raw is None:
+            return None
         try:
             seq_len_f = float(seq_len_val)
-            stride_f = float(…
+            stride_f = float(stride_raw)
         except (TypeError, ValueError):
             return None
         if not math.isfinite(seq_len_f) or seq_len_f <= 0:
@@ -1687,7 +2010,9 @@ class CoreRunner:
         except Exception:
             pass
 
-        paired_windows_count = …
+        paired_windows_count = (
+            paired_windows_attempted if paired_windows_attempted else len(delta_samples)
+        )
         unweighted_delta_mean = (
             float(np.mean(delta_samples)) if delta_samples else float(delta_mean_log)
         )
@@ -1715,8 +2040,11 @@ class CoreRunner:
         metrics = {
             "primary_metric": {
                 "kind": pm_kind,
-                "preview": float(pm_preview),
-                "final": float(pm_final),
+                "preview": float(pm_preview) if math.isfinite(pm_preview) else None,
+                "final": float(pm_final) if math.isfinite(pm_final) else None,
+                "invalid": bool(pm_invalid),
+                "degraded": bool(pm_invalid or degraded_reason),
+                "degraded_reason": degraded_reason,
             },
             "logloss_preview": float(preview_mean_log),
             "logloss_final": float(final_mean_log),
@@ -2030,17 +2358,27 @@ class CoreRunner:
         except Exception:
             drift_ratio = None
 
+        spike_threshold = getattr(config, "spike_threshold", 2.0)
         if drift_ratio is None:
             is_catastrophic_spike = False
             metrics_acceptable = True
         else:
-            spike_threshold = getattr(config, "spike_threshold", 2.0)
             is_catastrophic_spike = drift_ratio > spike_threshold
             # Check if standard metrics are acceptable against configured max ratio
             metrics_acceptable = drift_ratio <= getattr(config, "max_pm_ratio", 2.0)
 
         # Determine rollback reason and status
         rollback_reason = None
+        tail_failed = False
+        try:
+            pm_tail = metrics.get("primary_metric_tail", {})
+            if isinstance(pm_tail, dict) and pm_tail:
+                mode = str(pm_tail.get("mode", "warn") or "warn").strip().lower()
+                evaluated = bool(pm_tail.get("evaluated", False))
+                passed = bool(pm_tail.get("passed", True))
+                tail_failed = bool(mode == "fail" and evaluated and (not passed))
+        except Exception:  # pragma: no cover
+            tail_failed = False
         if is_catastrophic_spike:
             rollback_reason = (
                 f"catastrophic_ppl_spike (ratio: {drift_ratio:.3f} > {spike_threshold})"
@@ -2057,6 +2395,9 @@ class CoreRunner:
                     "immediate_rollback": True,
                 },
             )
+        elif tail_failed:
+            rollback_reason = "primary_metric_tail_failed"
+            status = RunStatus.ROLLBACK.value
         elif (not all_guards_passed) or (not metrics_acceptable):
             # Match historical/test expectation string exactly
             rollback_reason = "guards_failed or metrics_unacceptable"
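Note: per the two hunks above, a primary_metric_tail entry only forces rollback when its policy mode is "fail", it was actually evaluated, and it did not pass. A hypothetical payload that would trip it (field names follow the diff; the payload itself is invented):

# Hypothetical primary_metric_tail payload.
pm_tail = {"mode": "fail", "evaluated": True, "passed": False}

mode = str(pm_tail.get("mode", "warn") or "warn").strip().lower()
tail_failed = mode == "fail" and bool(pm_tail.get("evaluated", False)) and not bool(pm_tail.get("passed", True))

rollback_reason = "primary_metric_tail_failed" if tail_failed else None
print(rollback_reason)  # primary_metric_tail_failed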
@@ -2185,20 +2526,27 @@ class CoreRunner:
     ) -> dict[str, dict[str, Any]]:
         """Resolve tier-based guard policies from configuration."""
         # Use passed auto_config if available, otherwise extract from report meta
-        … (2 lines)
+        auto_cfg: dict[str, Any] | None = auto_config
+        if auto_cfg is None:
+            config_meta = report.meta.get("config") or {}
 
             # Try to get auto config from various possible locations
-            … (5 lines)
+            auto_cfg = report.__dict__.get("auto_config")
+            if (
+                auto_cfg is None
+                and isinstance(config_meta, dict)
+                and "auto" in config_meta
+            ):
+                auto_cfg = config_meta["auto"]
+            elif auto_cfg is None:
                 # Fallback to default balanced tier
-                …
+                auto_cfg = {"tier": "balanced", "enabled": True}
+
+        if not isinstance(auto_cfg, dict):
+            auto_cfg = {"tier": "balanced", "enabled": True}
 
         # Extract tier and edit name
-        tier = …
+        tier = auto_cfg.get("tier", "balanced")
         edit_name = None
         if hasattr(report, "edit") and report.edit:
             edit_name = report.edit.get("name")
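Note: the resolution order above can be read as: explicit auto_config argument, then report.auto_config, then the "auto" entry of the config recorded in report.meta, then a balanced default. A small stand-alone sketch of that chain; the dict shapes are assumptions, only the lookup order mirrors the diff:

def resolve_auto_cfg(auto_config=None, report_auto=None, config_meta=None):
    # Mirrors the fallback chain shown in the hunk above.
    auto_cfg = auto_config
    if auto_cfg is None:
        auto_cfg = report_auto
        if auto_cfg is None and isinstance(config_meta, dict) and "auto" in config_meta:
            auto_cfg = config_meta["auto"]
        elif auto_cfg is None:
            auto_cfg = {"tier": "balanced", "enabled": True}
    if not isinstance(auto_cfg, dict):
        auto_cfg = {"tier": "balanced", "enabled": True}
    return auto_cfg

print(resolve_auto_cfg(config_meta={"auto": {"tier": "conservative"}}))  # {'tier': 'conservative'}
print(resolve_auto_cfg())                                                # balanced default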
@@ -2208,8 +2556,10 @@ class CoreRunner:
             edit_name = report.meta["edit_name"]
 
         # Get explicit guard overrides from config
-        config_meta = report.meta.get("config"…
-        explicit_overrides = …
+        config_meta = report.meta.get("config") or {}
+        explicit_overrides = (
+            config_meta.get("guards", {}) if isinstance(config_meta, dict) else {}
+        )
 
         try:
             # Resolve tier policies
@@ -2237,18 +2587,18 @@ class CoreRunner:
     def _apply_guard_policy(self, guard: Guard, policy: dict[str, Any]) -> None:
         """Apply resolved policy parameters to a guard instance."""
         try:
+            guard_config = getattr(guard, "config", None)
+            guard_policy = getattr(guard, "policy", None)
+
             # Apply policy parameters to guard
             for param_name, param_value in policy.items():
                 if hasattr(guard, param_name):
                     setattr(guard, param_name, param_value)
-                elif …
-                … (3 lines)
-                    # Try to set in guard's policy dict
-                    guard.policy[param_name] = param_value
+                elif isinstance(guard_config, dict):
+                    guard_config[param_name] = param_value
+                elif isinstance(guard_policy, dict):
+                    guard_policy[param_name] = param_value
                 else:
-                    # Last resort: add to guard as attribute
                     setattr(guard, param_name, param_value)
 
         except Exception as e:
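Note: applied policy parameters above land, in order, on an existing guard attribute, then guard.config if it is a dict, then guard.policy if it is a dict, and finally as a new attribute. A minimal illustration with a toy guard object (not an invarlock class):

# Toy guard showing the fallback order used by _apply_guard_policy above.
class ToyGuard:
    def __init__(self) -> None:
        self.name = "toy"
        self.threshold = 0.5  # existing attribute -> overwritten directly
        self.config = {}      # dict -> receives unknown params next
        self.policy = None    # not a dict here, so it is skipped

def apply_policy(guard, policy):
    guard_config = getattr(guard, "config", None)
    guard_policy = getattr(guard, "policy", None)
    for name, value in policy.items():
        if hasattr(guard, name):
            setattr(guard, name, value)
        elif isinstance(guard_config, dict):
            guard_config[name] = value
        elif isinstance(guard_policy, dict):
            guard_policy[name] = value
        else:
            setattr(guard, name, value)

g = ToyGuard()
apply_policy(g, {"threshold": 0.8, "window": 32})
print(g.threshold, g.config)  # 0.8 {'window': 32}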