invarlock-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. invarlock/__init__.py +33 -0
  2. invarlock/__main__.py +10 -0
  3. invarlock/_data/runtime/profiles/ci_cpu.yaml +15 -0
  4. invarlock/_data/runtime/profiles/release.yaml +23 -0
  5. invarlock/_data/runtime/tiers.yaml +76 -0
  6. invarlock/adapters/__init__.py +102 -0
  7. invarlock/adapters/_capabilities.py +45 -0
  8. invarlock/adapters/auto.py +99 -0
  9. invarlock/adapters/base.py +530 -0
  10. invarlock/adapters/base_types.py +85 -0
  11. invarlock/adapters/hf_bert.py +852 -0
  12. invarlock/adapters/hf_gpt2.py +403 -0
  13. invarlock/adapters/hf_llama.py +485 -0
  14. invarlock/adapters/hf_mixin.py +383 -0
  15. invarlock/adapters/hf_onnx.py +112 -0
  16. invarlock/adapters/hf_t5.py +137 -0
  17. invarlock/adapters/py.typed +1 -0
  18. invarlock/assurance/__init__.py +43 -0
  19. invarlock/cli/__init__.py +8 -0
  20. invarlock/cli/__main__.py +8 -0
  21. invarlock/cli/_evidence.py +25 -0
  22. invarlock/cli/_json.py +75 -0
  23. invarlock/cli/adapter_auto.py +162 -0
  24. invarlock/cli/app.py +287 -0
  25. invarlock/cli/commands/__init__.py +26 -0
  26. invarlock/cli/commands/certify.py +403 -0
  27. invarlock/cli/commands/doctor.py +1358 -0
  28. invarlock/cli/commands/explain_gates.py +151 -0
  29. invarlock/cli/commands/export_html.py +100 -0
  30. invarlock/cli/commands/plugins.py +1331 -0
  31. invarlock/cli/commands/report.py +354 -0
  32. invarlock/cli/commands/run.py +4146 -0
  33. invarlock/cli/commands/verify.py +1040 -0
  34. invarlock/cli/config.py +396 -0
  35. invarlock/cli/constants.py +68 -0
  36. invarlock/cli/device.py +92 -0
  37. invarlock/cli/doctor_helpers.py +74 -0
  38. invarlock/cli/errors.py +6 -0
  39. invarlock/cli/overhead_utils.py +60 -0
  40. invarlock/cli/provenance.py +66 -0
  41. invarlock/cli/utils.py +41 -0
  42. invarlock/config.py +56 -0
  43. invarlock/core/__init__.py +62 -0
  44. invarlock/core/abi.py +15 -0
  45. invarlock/core/api.py +274 -0
  46. invarlock/core/auto_tuning.py +317 -0
  47. invarlock/core/bootstrap.py +226 -0
  48. invarlock/core/checkpoint.py +221 -0
  49. invarlock/core/contracts.py +73 -0
  50. invarlock/core/error_utils.py +64 -0
  51. invarlock/core/events.py +298 -0
  52. invarlock/core/exceptions.py +95 -0
  53. invarlock/core/registry.py +481 -0
  54. invarlock/core/retry.py +146 -0
  55. invarlock/core/runner.py +2041 -0
  56. invarlock/core/types.py +154 -0
  57. invarlock/edits/__init__.py +12 -0
  58. invarlock/edits/_edit_utils.py +249 -0
  59. invarlock/edits/_external_utils.py +268 -0
  60. invarlock/edits/noop.py +47 -0
  61. invarlock/edits/py.typed +1 -0
  62. invarlock/edits/quant_rtn.py +801 -0
  63. invarlock/edits/registry.py +166 -0
  64. invarlock/eval/__init__.py +23 -0
  65. invarlock/eval/bench.py +1207 -0
  66. invarlock/eval/bootstrap.py +50 -0
  67. invarlock/eval/data.py +2052 -0
  68. invarlock/eval/metrics.py +2167 -0
  69. invarlock/eval/primary_metric.py +767 -0
  70. invarlock/eval/probes/__init__.py +24 -0
  71. invarlock/eval/probes/fft.py +139 -0
  72. invarlock/eval/probes/mi.py +213 -0
  73. invarlock/eval/probes/post_attention.py +323 -0
  74. invarlock/eval/providers/base.py +67 -0
  75. invarlock/eval/providers/seq2seq.py +111 -0
  76. invarlock/eval/providers/text_lm.py +113 -0
  77. invarlock/eval/providers/vision_text.py +93 -0
  78. invarlock/eval/py.typed +1 -0
  79. invarlock/guards/__init__.py +18 -0
  80. invarlock/guards/_contracts.py +9 -0
  81. invarlock/guards/invariants.py +640 -0
  82. invarlock/guards/policies.py +805 -0
  83. invarlock/guards/py.typed +1 -0
  84. invarlock/guards/rmt.py +2097 -0
  85. invarlock/guards/spectral.py +1419 -0
  86. invarlock/guards/tier_config.py +354 -0
  87. invarlock/guards/variance.py +3298 -0
  88. invarlock/guards_ref/__init__.py +15 -0
  89. invarlock/guards_ref/rmt_ref.py +40 -0
  90. invarlock/guards_ref/spectral_ref.py +135 -0
  91. invarlock/guards_ref/variance_ref.py +60 -0
  92. invarlock/model_profile.py +353 -0
  93. invarlock/model_utils.py +221 -0
  94. invarlock/observability/__init__.py +10 -0
  95. invarlock/observability/alerting.py +535 -0
  96. invarlock/observability/core.py +546 -0
  97. invarlock/observability/exporters.py +565 -0
  98. invarlock/observability/health.py +588 -0
  99. invarlock/observability/metrics.py +457 -0
  100. invarlock/observability/py.typed +1 -0
  101. invarlock/observability/utils.py +553 -0
  102. invarlock/plugins/__init__.py +12 -0
  103. invarlock/plugins/hello_guard.py +33 -0
  104. invarlock/plugins/hf_awq_adapter.py +82 -0
  105. invarlock/plugins/hf_bnb_adapter.py +79 -0
  106. invarlock/plugins/hf_gptq_adapter.py +78 -0
  107. invarlock/plugins/py.typed +1 -0
  108. invarlock/py.typed +1 -0
  109. invarlock/reporting/__init__.py +7 -0
  110. invarlock/reporting/certificate.py +3221 -0
  111. invarlock/reporting/certificate_schema.py +244 -0
  112. invarlock/reporting/dataset_hashing.py +215 -0
  113. invarlock/reporting/guards_analysis.py +948 -0
  114. invarlock/reporting/html.py +32 -0
  115. invarlock/reporting/normalizer.py +235 -0
  116. invarlock/reporting/policy_utils.py +517 -0
  117. invarlock/reporting/primary_metric_utils.py +265 -0
  118. invarlock/reporting/render.py +1442 -0
  119. invarlock/reporting/report.py +903 -0
  120. invarlock/reporting/report_types.py +278 -0
  121. invarlock/reporting/utils.py +175 -0
  122. invarlock/reporting/validate.py +631 -0
  123. invarlock/security.py +176 -0
  124. invarlock/sparsity_utils.py +323 -0
  125. invarlock/utils/__init__.py +150 -0
  126. invarlock/utils/digest.py +45 -0
  127. invarlock-0.2.0.dist-info/METADATA +586 -0
  128. invarlock-0.2.0.dist-info/RECORD +132 -0
  129. invarlock-0.2.0.dist-info/WHEEL +5 -0
  130. invarlock-0.2.0.dist-info/entry_points.txt +20 -0
  131. invarlock-0.2.0.dist-info/licenses/LICENSE +201 -0
  132. invarlock-0.2.0.dist-info/top_level.txt +1 -0
invarlock/reporting/validate.py (new file)
@@ -0,0 +1,631 @@
+ """
+ InvarLock Validation Framework
+ ==============================
+
+ Validation utilities for checking pruning results against baseline metrics.
+ Supports both automated CI testing and flexible user validation.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import warnings
+ from pathlib import Path
+ from typing import Any, cast
+
+ __all__ = [
+     "validate_against_baseline",
+     "validate_drift_gate",
+     "validate_guard_overhead",
+     "ValidationResult",
+     "load_baseline",
+     "save_baseline",
+     "create_baseline_from_report",
+ ]
+
+
+ class ValidationResult:
+     """Container for validation results."""
+
+     def __init__(
+         self,
+         passed: bool,
+         checks: dict[str, bool],
+         metrics: dict[str, float],
+         messages: list[str],
+         warnings: list[str] | None = None,
+         errors: list[str] | None = None,
+     ):
+         self.passed = passed
+         self.checks = checks
+         self.metrics = metrics
+         self.messages = messages
+         self.warnings = warnings or []
+         self.errors = errors or []
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "passed": self.passed,
+             "checks": self.checks,
+             "metrics": self.metrics,
+             "messages": self.messages,
+             "warnings": self.warnings,
+             "errors": self.errors,
+         }
+
+     def summary(self) -> str:
+         """Get human-readable summary."""
+         status = "✓ PASSED" if self.passed else "✗ FAILED"
+         passed_count = sum(1 for check in self.checks.values() if check)
+         total_count = len(self.checks)
+
+         lines = [
+             f"Validation {status} ({passed_count}/{total_count} checks passed)",
+             "",
+         ]
+
+         # Show individual check results
+         for check_name, passed in self.checks.items():
+             symbol = "✓" if passed else "✗"
+             lines.append(f" {symbol} {check_name}")
+
+         # Show messages
+         if self.messages:
+             lines.append("")
+             lines.extend(f" {msg}" for msg in self.messages)
+
+         # Show warnings and errors
+         if self.warnings:
+             lines.append("")
+             lines.append("Warnings:")
+             lines.extend(f" ⚠️ {warning}" for warning in self.warnings)
+
+         if self.errors:
+             lines.append("")
+             lines.append("Errors:")
+             lines.extend(f" ❌ {error}" for error in self.errors)
+
+         return "\n".join(lines)
+
+
+ def validate_against_baseline(
+     run_report: dict[str, Any],
+     baseline: dict[str, Any],
+     *,
+     tol_ratio: float = 0.02,
+     tol_param_ratio: float = 0.02,
+     ratio_bounds: tuple[float, float] = (1.25, 1.32),
+     delta_bounds_pp: tuple[float, float] | None = None,
+     structural_exact: bool = True,
+     **kwargs,
+ ) -> ValidationResult:
+     """
+     Validate pruning results against baseline metrics (PM-only API).
+
+     Args:
+         run_report: Report from pruning run (dict with metrics)
+         baseline: Baseline metrics to compare against
+         tol_ratio: Tolerance for primary metric ratio deviation
+             (±2% = 0.02) for lower-is-better families
+         tol_param_ratio: Tolerance for parameter reduction ratio deviation
+         ratio_bounds: Acceptable ratio bounds (min, max) for lower-is-better families
+         delta_bounds_pp: Acceptable delta bounds in percentage points (min, max)
+             for higher-is-better families
+         structural_exact: Whether structural counts must match exactly
+
+     Returns:
+         ValidationResult with detailed check results
+     """
+     # Backward-compatible kwargs (deprecated): enable via INVARLOCK_VALIDATE_LEGACY=1
+     legacy = str(os.environ.get("INVARLOCK_VALIDATE_LEGACY", "")).strip().lower() in {
+         "1",
+         "true",
+         "yes",
+         "on",
+     }
+     if legacy:
+         if "tol_ppl_ratio" in kwargs and isinstance(
+             kwargs["tol_ppl_ratio"], int | float
+         ):
+             tol_ratio = float(kwargs["tol_ppl_ratio"])
+         if "ppl_bounds" in kwargs and isinstance(kwargs["ppl_bounds"], tuple):
+             # Coerce after runtime guard
+             ratio_bounds = cast(tuple[float, float], kwargs["ppl_bounds"])
+
+     checks: dict[str, bool] = {}
+     metrics: dict[str, float] = {}
+     messages: list[str] = []
+     warnings_list: list[str] = []
+     errors: list[str] = []
+
+     try:
+         # Extract primary metric ratio (canonical)
+         current_ratio = None
+         pm_kind = None
+         pm = (
+             (run_report.get("metrics") or {}).get("primary_metric")
+             if isinstance(run_report.get("metrics"), dict)
+             else None
+         )
+         if isinstance(pm, dict) and pm:
+             val = pm.get("ratio_vs_baseline")
+             if isinstance(val, int | float):
+                 current_ratio = float(val)
+             try:
+                 pm_kind = str(pm.get("kind") or "").lower()
+             except Exception:
+                 pm_kind = None
+         if current_ratio is None:
+             errors.append("Cannot extract ratio_vs_baseline from run report")
+
+         if "param_reduction_ratio" in run_report:
+             current_param_ratio = run_report["param_reduction_ratio"]
+         elif "parameters_removed" in run_report and "original_params" in run_report:
+             current_param_ratio = (
+                 run_report["parameters_removed"] / run_report["original_params"]
+             )
+         else:
+             current_param_ratio = None
+             errors.append("Cannot extract parameter reduction ratio from run report")
+
+         # Extract baseline metrics
+         baseline_ratio = baseline.get("ratio_vs_baseline")
+         baseline_param_ratio = baseline.get("param_reduction_ratio")
+
+         if baseline_ratio is None:
+             errors.append("Baseline missing ratio_vs_baseline")
+         if baseline_param_ratio is None:
+             errors.append("Baseline missing param_reduction_ratio")
+
+         # Primary metric tolerance (lower-is-better families)
+         if pm_kind in {"ppl_causal", "ppl_mlm", "ppl_seq2seq", None}:
+             if current_ratio is not None and baseline_ratio is not None:
+                 rel_diff = abs(current_ratio - float(baseline_ratio)) / float(
+                     baseline_ratio
+                 )
+                 checks["ratio_tolerance"] = rel_diff <= tol_ratio
+                 metrics["ratio_diff"] = rel_diff
+                 metrics["current_ratio"] = current_ratio
+                 metrics["baseline_ratio"] = float(baseline_ratio)
+
+                 if not checks["ratio_tolerance"]:
+                     msg = f"Primary metric ratio deviation {rel_diff:.3f} exceeds tolerance {tol_ratio:.3f}"
+                     messages.append(msg)
+                 else:
+                     messages.append(
+                         f"Primary metric ratio within tolerance: {current_ratio:.3f} vs baseline {float(baseline_ratio):.3f}"
+                     )
+             else:
+                 checks["ratio_tolerance"] = False
+
+         # Parameter ratio validation
+         if current_param_ratio is not None and baseline_param_ratio is not None:
+             param_relative_diff = (
+                 abs(current_param_ratio - baseline_param_ratio) / baseline_param_ratio
+             )
+             checks["param_ratio_tolerance"] = param_relative_diff <= tol_param_ratio
+             metrics["param_ratio_diff"] = param_relative_diff
+             metrics["current_param_ratio"] = current_param_ratio
+             metrics["baseline_param_ratio"] = baseline_param_ratio
+
+             if not checks["param_ratio_tolerance"]:
+                 messages.append(
+                     f"Parameter ratio deviation {param_relative_diff:.3f} exceeds tolerance {tol_param_ratio:.3f}"
+                 )
+             else:
+                 messages.append(
+                     f"Parameter ratio within tolerance: {current_param_ratio:.3f} vs baseline {baseline_param_ratio:.3f}"
+                 )
+         else:
+             checks["param_ratio_tolerance"] = False
+
+         # Bounds check
+         if current_ratio is not None:
+             if pm_kind in {"accuracy", "vqa_accuracy"}:
+                 # Interpret current_ratio as delta proportion; compare in pp when bounds provided
+                 if isinstance(delta_bounds_pp, tuple) and len(delta_bounds_pp) == 2:
+                     delta_pp = 100.0 * float(current_ratio)
+                     lo_pp, hi_pp = float(delta_bounds_pp[0]), float(delta_bounds_pp[1])
+                     checks["delta_bounds_pp"] = lo_pp <= delta_pp <= hi_pp
+                     if not checks["delta_bounds_pp"]:
+                         messages.append(
+                             f"Δpp {delta_pp:+.2f} outside acceptable bounds {delta_bounds_pp}"
+                         )
+                     else:
+                         messages.append(
+                             f"Δpp {delta_pp:+.2f} within acceptable bounds {delta_bounds_pp}"
+                         )
+             else:
+                 checks["ratio_bounds"] = (
+                     ratio_bounds[0] <= current_ratio <= ratio_bounds[1]
+                 )
+                 if not checks["ratio_bounds"]:
+                     messages.append(
+                         f"Ratio {current_ratio:.3f} outside acceptable bounds {ratio_bounds}"
+                     )
+                 else:
+                     messages.append(
+                         f"Ratio {current_ratio:.3f} within acceptable bounds {ratio_bounds}"
+                     )
+         else:
+             if pm_kind in {"accuracy", "vqa_accuracy"}:
+                 checks["delta_bounds_pp"] = False
+             else:
+                 checks["ratio_bounds"] = False
+
+         # Structural count validation
+         if structural_exact:
+             structural_checks = _validate_structural_counts(run_report, baseline)
+             checks.update(structural_checks["checks"])
+             messages.extend(structural_checks["messages"])
+             warnings_list.extend(structural_checks["warnings"])
+         else:
+             checks["structural_counts"] = True  # Skip structural validation
+
+         # Invariants validation (if present in report)
+         invariants_passed = _validate_invariants(run_report)
+         if invariants_passed is not None:
+             checks["invariants"] = invariants_passed
+             if not invariants_passed:
+                 errors.append("Model invariants validation failed")
+
+         # Overall pass/fail
+         passed = all(checks.values()) and len(errors) == 0
+
+         return ValidationResult(
+             passed=passed,
+             checks=checks,
+             metrics=metrics,
+             messages=messages,
+             warnings=warnings_list,
+             errors=errors,
+         )
+
+     except Exception as e:
+         return ValidationResult(
+             passed=False,
+             checks={"validation_error": False},
+             metrics={},
+             messages=[],
+             warnings=[],
+             errors=[f"Validation failed with exception: {str(e)}"],
+         )
+
+
+ def validate_drift_gate(
+     run_report: dict[str, Any], drift_bounds: tuple[float, float] = (0.95, 1.05)
+ ) -> ValidationResult:
+     """
+     Validate hard drift gate: 0.95 ≤ final/preview ≤ 1.05.
+
+     Args:
+         run_report: Report from run with metrics.primary_metric preview/final
+         drift_bounds: Acceptable drift bounds (min, max) - default (0.95, 1.05)
+
+     Returns:
+         ValidationResult with drift gate check
+     """
+     checks = {}
+     metrics = {}
+     messages = []
+     warnings: list[str] = []
+     errors = []
+
+     try:
+         # Extract preview and final from primary_metric
+         pm = (
+             (run_report.get("metrics") or {}).get("primary_metric")
+             if isinstance(run_report.get("metrics"), dict)
+             else None
+         )
+         pm_preview = pm.get("preview") if isinstance(pm, dict) else None
+         pm_final = pm.get("final") if isinstance(pm, dict) else None
+
+         # Calculate drift ratio (final/preview) for lower-is-better families
+         if (
+             isinstance(pm_preview, int | float)
+             and isinstance(pm_final, int | float)
+             and pm_preview > 0
+         ):
+             drift_ratio = float(pm_final) / float(pm_preview)
+             metrics["drift_ratio"] = drift_ratio
+             metrics["preview"] = float(pm_preview)
+             metrics["final"] = float(pm_final)
+
+             # Apply hard gate
+             checks["drift_gate"] = drift_bounds[0] <= drift_ratio <= drift_bounds[1]
+
+             if checks["drift_gate"]:
+                 messages.append(
+                     f"Drift gate PASSED: {drift_ratio:.3f} within bounds {drift_bounds}"
+                 )
+             else:
+                 errors.append(
+                     f"Drift gate FAILED: {drift_ratio:.3f} outside bounds {drift_bounds} "
+                     f"(±5% drift limit exceeded)"
+                 )
+         else:
+             errors.append(
+                 "Cannot calculate drift: missing primary_metric preview/final"
+             )
+             checks["drift_gate"] = False
+
+         # Overall pass/fail
+         passed = all(checks.values()) and len(errors) == 0
+
+         return ValidationResult(
+             passed=passed,
+             checks=checks,
+             metrics=metrics,
+             messages=messages,
+             warnings=warnings,
+             errors=errors,
+         )
+
+     except Exception as e:
+         return ValidationResult(
+             passed=False,
+             checks={"drift_gate_error": False},
+             metrics={},
+             messages=[],
+             warnings=[],
+             errors=[f"Drift gate validation failed: {str(e)}"],
+         )
+
+
+ def validate_guard_overhead(
+     bare_report: dict[str, Any],
+     guarded_report: dict[str, Any],
+     overhead_threshold: float = 0.01,
+ ) -> ValidationResult:
+     """
+     Validate guard overhead using primary_metric: require
+     final(guarded) / final(bare) ≤ 1 + overhead_threshold (default 1%).
+
+     Args:
+         bare_report: Report from bare (no guards) run (expects metrics.primary_metric)
+         guarded_report: Report from guarded run (expects metrics.primary_metric)
+         overhead_threshold: Maximum allowed overhead (default 0.01 = 1%)
+
+     Returns:
+         ValidationResult with guard overhead check
+     """
+     checks = {}
+     metrics = {}
+     messages = []
+     warnings: list[str] = []
+     errors = []
+
+     try:
+         # Extract primary metric final from both reports
+         bare_pm = (
+             (bare_report.get("metrics") or {}).get("primary_metric")
+             if isinstance(bare_report.get("metrics"), dict)
+             else None
+         )
+         guarded_pm = (
+             (guarded_report.get("metrics") or {}).get("primary_metric")
+             if isinstance(guarded_report.get("metrics"), dict)
+             else None
+         )
+
+         bare_final = None
+         guarded_final = None
+         if isinstance(bare_pm, dict):
+             bare_final = bare_pm.get("final")
+         if isinstance(guarded_pm, dict):
+             guarded_final = guarded_pm.get("final")
+
+         if (
+             isinstance(bare_final, int | float)
+             and bare_final > 0
+             and isinstance(guarded_final, int | float)
+         ):
+             overhead_ratio = float(guarded_final) / float(bare_final)
+             overhead_percent = (overhead_ratio - 1.0) * 100
+
+             metrics["overhead_ratio"] = overhead_ratio
+             metrics["overhead_percent"] = overhead_percent
+             metrics["bare_final"] = float(bare_final)
+             metrics["guarded_final"] = float(guarded_final)
+
+             # Apply overhead gate
+             checks["guard_overhead"] = overhead_ratio <= (1.0 + overhead_threshold)
+
+             if checks["guard_overhead"]:
+                 messages.append(
+                     f"Guard overhead PASSED: {overhead_percent:+.2f}% ≤ {overhead_threshold * 100:.1f}%"
+                 )
+             else:
+                 errors.append(
+                     f"Guard overhead FAILED: {overhead_percent:+.2f}% > {overhead_threshold * 100:.1f}% "
+                     f"(guards add too much primary-metric overhead)"
+                 )
+         else:
+             errors.append(
+                 "Cannot calculate guard overhead: missing primary_metric data"
+             )
+             checks["guard_overhead"] = False
+
+         # Overall pass/fail
+         passed = all(checks.values()) and len(errors) == 0
+
+         return ValidationResult(
+             passed=passed,
+             checks=checks,
+             metrics=metrics,
+             messages=messages,
+             warnings=warnings,
+             errors=errors,
+         )
+
+     except Exception as e:
+         return ValidationResult(
+             passed=False,
+             checks={"guard_overhead_error": False},
+             metrics={},
+             messages=[],
+             warnings=[],
+             errors=[f"Guard overhead validation failed: {str(e)}"],
+         )
+
+
+ def _validate_structural_counts(
+     run_report: dict[str, Any], baseline: dict[str, Any]
+ ) -> dict[str, Any]:
+     """Validate that structural counts match exactly."""
+     checks = {}
+     messages = []
+     warnings = []
+
+     # Heads/neurons counts removed from simplified schema; only validate layers
+
+     # Check layers modified
+     current_layers = run_report.get(
+         "layers_modified", run_report.get("metrics", {}).get("layers_modified")
+     )
+     baseline_layers = baseline.get("layers_modified")
+
+     if current_layers is not None and baseline_layers is not None:
+         checks["layers_count_exact"] = current_layers == baseline_layers
+         if checks["layers_count_exact"]:
+             messages.append(f"Modified layers count matches: {current_layers}")
+         else:
+             messages.append(
+                 f"Modified layers mismatch: {current_layers} vs baseline {baseline_layers}"
+             )
+     else:
+         warnings.append("Cannot validate layers count - missing data")
+         checks["layers_count_exact"] = True  # Don't fail on missing data
+
+     return {"checks": checks, "messages": messages, "warnings": warnings}
+
+
+ def _validate_invariants(run_report: dict[str, Any]) -> bool | None:
+     """Check if model invariants passed."""
+     # Look for invariants check in guard reports
+     guard_reports = run_report.get("guard_reports", {})
+
+     for guard_name, guard_report in guard_reports.items():
+         if "invariants" in guard_name.lower():
+             passed = guard_report.get("passed", True)
+             return bool(passed) if passed is not None else True
+
+     # Look for validation results in metrics
+     metrics = run_report.get("metrics", {})
+     if "invariants_passed" in metrics:
+         passed = metrics["invariants_passed"]
+         return bool(passed) if passed is not None else None
+
+     # No invariants check found
+     return None
+
+
+ def load_baseline(baseline_path: Path) -> dict[str, Any]:
+     """Load baseline metrics from JSON file."""
+     try:
+         with open(baseline_path) as f:
+             data = json.load(f)
+         if not isinstance(data, dict):
+             raise ValueError(
+                 f"Baseline file must contain a JSON object, got {type(data)}"
+             )
+         return data
+     except FileNotFoundError as e:
+         raise FileNotFoundError(f"Baseline file not found: {baseline_path}") from e
+     except json.JSONDecodeError as e:
+         raise ValueError(f"Invalid JSON in baseline file: {e}") from e
+
+
+ def save_baseline(baseline: dict[str, Any], baseline_path: Path) -> None:
+     """Save baseline metrics to JSON file."""
+     baseline_path.parent.mkdir(parents=True, exist_ok=True)
+     with open(baseline_path, "w") as f:
+         json.dump(baseline, f, indent=2)
+
+
+ def create_baseline_from_report(run_report: dict[str, Any]) -> dict[str, Any]:
+     """Create a baseline structure from a run report."""
+     baseline: dict[str, Any] = {}
+
+     # Extract core metrics (PM-only)
+     try:
+         pm = (
+             run_report.get("metrics", {}).get("primary_metric")
+             if isinstance(run_report.get("metrics"), dict)
+             else None
+         )
+         if isinstance(pm, dict) and pm.get("ratio_vs_baseline") is not None:
+             baseline["ratio_vs_baseline"] = float(pm["ratio_vs_baseline"])
+     except Exception:
+         pass
+
+     if "param_reduction_ratio" in run_report:
+         baseline["param_reduction_ratio"] = run_report["param_reduction_ratio"]
+     elif "parameters_removed" in run_report and "original_params" in run_report:
+         baseline["param_reduction_ratio"] = (
+             run_report["parameters_removed"] / run_report["original_params"]
+         )
+
+     # Extract structural counts
+     metrics = run_report.get("metrics", {})
+     for key in ["heads_pruned", "neurons_pruned", "layers_modified"]:
+         if key in run_report:
+             baseline[key] = run_report[key]
+         elif key in metrics:
+             baseline[key] = metrics[key]
+
+     # Extract sparsity metrics
+     sparsity = run_report.get("actual_sparsity", {})
+     for key in ["head_sparsity", "neuron_sparsity", "weight_sparsity"]:
+         if key in sparsity:
+             baseline[key] = sparsity[key]
+
+     # Add metadata
+     baseline["baseline_created"] = True
+     baseline["source"] = "run_report"
+
+     return baseline
+
+
+ def validate_gpt2_small_wt2_baseline(
+     run_report: dict[str, Any], baseline_path: Path | None = None
+ ) -> ValidationResult:
+     """
+     Validate against the canonical GPT-2 small + WikiText-2 baseline.
+
+     This is the CI validation function that uses the pinned baseline.
+     """
+     if baseline_path is None:
+         # Use default baseline path
+         baseline_path = (
+             Path(__file__).parent.parent.parent
+             / "benchmarks"
+             / "baselines"
+             / "gpt2_small_wt2.json"
+         )
+
+     try:
+         baseline = load_baseline(baseline_path)
+     except FileNotFoundError:
+         # Create a default baseline if file doesn't exist
+         warnings.warn(
+             f"Baseline file not found: {baseline_path}. Using default values.",
+             stacklevel=2,
+         )
+         baseline = {
+             "ratio_vs_baseline": 1.285,  # Target: ~1.25-1.32
+             "param_reduction_ratio": 0.022,  # Target: ~2.2%
+             "heads_pruned": 16,  # Example values
+             "neurons_pruned": 1024,
+             "layers_modified": 8,
+             "head_sparsity": 0.1,
+             "neuron_sparsity": 0.1,
+         }
+
+     return validate_against_baseline(
+         run_report,
+         baseline,
+         tol_ratio=0.02,
+         tol_param_ratio=0.02,
+         ratio_bounds=(1.25, 1.32),
+         structural_exact=True,
+     )
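
Usage sketch (not part of the diff): a minimal example of how the validators above fit together, assuming the wheel is installed and the module is importable as invarlock.reporting.validate (the path shown in the file list). The report dicts are illustrative shapes inferred from the docstrings in this file, and baselines/example.json is a hypothetical path.

# Sketch only; report shapes inferred from the docstrings above.
from pathlib import Path

from invarlock.reporting.validate import (
    create_baseline_from_report,
    load_baseline,
    save_baseline,
    validate_against_baseline,
    validate_drift_gate,
    validate_guard_overhead,
)

# A run report in the PM-only layout the validators read.
run_report = {
    "metrics": {
        "primary_metric": {
            "kind": "ppl_causal",  # lower-is-better family
            "ratio_vs_baseline": 1.29,
            "preview": 24.8,
            "final": 25.1,
        },
        "layers_modified": 8,
    },
    "param_reduction_ratio": 0.022,
}

# Round-trip a baseline, then validate the run against it.
baseline_path = Path("baselines/example.json")  # hypothetical location
save_baseline(create_baseline_from_report(run_report), baseline_path)
result = validate_against_baseline(run_report, load_baseline(baseline_path))
print(result.summary())

# Hard drift gate: final/preview must stay within (0.95, 1.05).
print(validate_drift_gate(run_report).passed)

# Guard-overhead gate: guarded final may exceed bare final by at most 1%.
bare = {"metrics": {"primary_metric": {"final": 25.0}}}
guarded = {"metrics": {"primary_metric": {"final": 25.2}}}
print(validate_guard_overhead(bare, guarded).passed)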