invarlock 0.3.6-py3-none-any.whl → 0.3.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +4 -4
- invarlock/adapters/__init__.py +10 -14
- invarlock/adapters/auto.py +37 -50
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_loading.py +7 -7
- invarlock/adapters/hf_mixin.py +53 -9
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/assurance/__init__.py +15 -23
- invarlock/cli/adapter_auto.py +32 -26
- invarlock/cli/app.py +128 -27
- invarlock/cli/commands/__init__.py +2 -2
- invarlock/cli/commands/calibrate.py +48 -4
- invarlock/cli/commands/doctor.py +8 -10
- invarlock/cli/commands/evaluate.py +986 -0
- invarlock/cli/commands/explain_gates.py +25 -17
- invarlock/cli/commands/export_html.py +11 -9
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +326 -92
- invarlock/cli/commands/run.py +1160 -228
- invarlock/cli/commands/verify.py +157 -97
- invarlock/cli/config.py +1 -1
- invarlock/cli/determinism.py +1 -1
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +4 -4
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/registry.py +9 -11
- invarlock/core/retry.py +14 -14
- invarlock/core/runner.py +112 -26
- invarlock/edits/noop.py +2 -2
- invarlock/edits/quant_rtn.py +67 -39
- invarlock/eval/__init__.py +1 -1
- invarlock/eval/bench.py +14 -10
- invarlock/eval/data.py +68 -23
- invarlock/eval/metrics.py +59 -1
- invarlock/eval/primary_metric.py +1 -1
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/rmt.py +2 -2
- invarlock/guards/spectral.py +1 -1
- invarlock/guards/variance.py +2 -2
- invarlock/model_profile.py +64 -62
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/plugins/hf_bnb_adapter.py +32 -21
- invarlock/reporting/__init__.py +18 -4
- invarlock/reporting/guards_analysis.py +154 -4
- invarlock/reporting/html.py +61 -11
- invarlock/reporting/normalizer.py +9 -2
- invarlock/reporting/policy_utils.py +1 -1
- invarlock/reporting/primary_metric_utils.py +11 -11
- invarlock/reporting/render.py +876 -510
- invarlock/reporting/report.py +72 -30
- invarlock/reporting/{certificate.py → report_builder.py} +252 -99
- invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
- invarlock/reporting/report_types.py +6 -1
- invarlock/reporting/telemetry.py +86 -0
- invarlock-0.3.8.dist-info/METADATA +283 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/RECORD +69 -64
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/WHEEL +1 -1
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- invarlock/cli/commands/certify.py +0 -422
- invarlock-0.3.6.dist-info/METADATA +0 -588
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/top_level.txt +0 -0
invarlock/cli/doctor_helpers.py
CHANGED
```diff
@@ -8,7 +8,7 @@ from typing import Any
 def get_adapter_rows() -> list[dict[str, Any]]:
     """Build adapter rows similar to doctor output for testing.
 
-    Applies optional-extra detection for
+    Applies optional-extra detection for hf_causal_onnx (optimum/onnxruntime) even if
     registered as a core adapter, so missing extras are surfaced.
     """
     from invarlock.core.registry import get_registry
@@ -29,13 +29,12 @@ def get_adapter_rows() -> list[dict[str, Any]]:
         module = str(info.get("module") or "")
         support = (
             "auto"
-            if module.startswith("invarlock.adapters")
-            and name in {"hf_causal_auto", "hf_mlm_auto"}
+            if module.startswith("invarlock.adapters") and name in {"hf_auto"}
             else ("core" if module.startswith("invarlock.adapters") else "optional")
         )
         backend, status, enable = None, "ready", ""
 
-        if name in {"
+        if name in {"hf_causal", "hf_mlm", "hf_seq2seq", "hf_auto"}:
             backend = "transformers"
         elif name == "hf_gptq":
             backend = "auto-gptq"
@@ -49,7 +48,7 @@ def get_adapter_rows() -> list[dict[str, Any]]:
             backend = "bitsandbytes"
             if not has_cuda:
                 status, enable = "unsupported", "Requires CUDA"
-        elif name == "
+        elif name == "hf_causal_onnx":
             backend = "onnxruntime"
             present = (
                 importlib.util.find_spec("optimum.onnxruntime") is not None
```
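The readiness check for `hf_causal_onnx` hinges on probing the optional ONNX extras without importing them eagerly. A minimal standalone sketch of the same `find_spec` probe; the helper name and the `"missing"` status string are illustrative, not the package's API:

```python
import importlib.util


def onnx_extras_present() -> bool:
    """Probe the optional ONNX stack without importing it eagerly."""
    try:
        # find_spec imports parent packages, so a missing top-level
        # `optimum` raises instead of returning None; guard for both.
        return (
            importlib.util.find_spec("optimum.onnxruntime") is not None
            and importlib.util.find_spec("onnxruntime") is not None
        )
    except ModuleNotFoundError:
        return False


# Illustrative wiring, mirroring the doctor row fields above.
status, enable = (
    ("ready", "")
    if onnx_extras_present()
    else ("missing", "pip install 'invarlock[onnx]'")
)
```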
invarlock/cli/output.py
ADDED
```python
from __future__ import annotations

import os
import time
from collections.abc import Iterator
from contextlib import contextmanager
from dataclasses import dataclass
from typing import TextIO

from rich.console import Console

_STYLE_AUDIT = "audit"
_STYLE_FRIENDLY = "friendly"
_VALID_STYLES = {_STYLE_AUDIT, _STYLE_FRIENDLY}


def _safe_console_print(console: Console, *args: object, **kwargs: object) -> None:
    try:
        console.print(*args, **kwargs)
    except TypeError:
        console.print(*args)


def env_no_color() -> bool:
    """Return True when NO_COLOR is set (value-agnostic)."""
    return bool(str(os.environ.get("NO_COLOR", "")).strip())


def perf_counter() -> float:
    return time.perf_counter()


@dataclass(frozen=True, slots=True)
class OutputStyle:
    name: str
    progress: bool = False
    timing: bool = False
    color: bool = True

    @property
    def emojis(self) -> bool:
        return self.name != _STYLE_AUDIT

    @property
    def audit(self) -> bool:
        return self.name == _STYLE_AUDIT


def normalize_style(style: str | None) -> str | None:
    if style is None:
        return None
    value = str(style).strip().lower()
    if not value:
        return None
    return value if value in _VALID_STYLES else None


def resolve_style_name(style: str | None, profile: str | None) -> str:
    normalized = normalize_style(style)
    if normalized is not None:
        return normalized
    profile_norm = str(profile or "").strip().lower()
    if profile_norm in {"ci", "ci_cpu", "release"}:
        return _STYLE_AUDIT
    return _STYLE_FRIENDLY


def resolve_output_style(
    *,
    style: str | None,
    profile: str | None,
    progress: bool = False,
    timing: bool = False,
    no_color: bool = False,
) -> OutputStyle:
    name = resolve_style_name(style, profile)
    color_enabled = not (bool(no_color) or env_no_color())
    return OutputStyle(
        name=name,
        progress=bool(progress),
        timing=bool(timing),
        color=color_enabled,
    )


def make_console(
    *,
    file: TextIO | None = None,
    force_terminal: bool | None = None,
    no_color: bool | None = None,
) -> Console:
    if no_color is None:
        no_color = env_no_color()
    if no_color:
        color_system = None
    else:
        color_system = "standard" if force_terminal else "auto"
    return Console(
        file=file,
        force_terminal=force_terminal,
        no_color=bool(no_color),
        color_system=color_system,
    )


def format_event_line(
    tag: str,
    message: str,
    *,
    style: OutputStyle,
    emoji: str | None = None,
) -> str:
    tag_norm = str(tag or "").strip().upper() or "INFO"
    if style.emojis and emoji:
        prefix = emoji
    else:
        prefix = f"[{tag_norm}]"
    msg = str(message or "").rstrip()
    return f"{prefix} {msg}".rstrip()


def print_event(
    console: Console,
    tag: str,
    message: str,
    *,
    style: OutputStyle,
    emoji: str | None = None,
    console_style: str | None = None,
) -> None:
    line = format_event_line(tag, message, style=style, emoji=emoji)
    if console_style is None and style.color:
        tag_norm = str(tag or "").strip().upper()
        if tag_norm in {"PASS"}:
            console_style = "green"
        elif tag_norm in {"FAIL", "ERROR"}:
            console_style = "red"
        elif tag_norm in {"WARN", "WARNING"}:
            console_style = "yellow"
        elif tag_norm in {"METRIC"}:
            console_style = "cyan"
    _safe_console_print(console, line, style=console_style, markup=False)


@contextmanager
def timed_step(
    *,
    console: Console,
    style: OutputStyle,
    timings: dict[str, float] | None,
    key: str,
    tag: str,
    message: str,
    emoji: str | None = None,
) -> Iterator[None]:
    start = perf_counter()
    try:
        yield
    finally:
        elapsed = max(0.0, float(perf_counter() - start))
        if timings is not None:
            timings[key] = elapsed
        if style.progress:
            print_event(
                console,
                tag,
                f"{message} done ({elapsed:.2f}s)",
                style=style,
                emoji=emoji,
            )


def print_timing_summary(
    console: Console,
    timings: dict[str, float],
    *,
    style: OutputStyle,
    order: list[tuple[str, str]],
    extra_lines: list[str] | None = None,
) -> None:
    if not style.timing:
        return
    _safe_console_print(console, "", markup=False)
    _safe_console_print(console, "TIMING SUMMARY", markup=False)
    for label, key in order:
        if key not in timings:
            continue
        _safe_console_print(
            console, f" {label:<11}: {timings[key]:.2f}s", markup=False
        )
    if extra_lines:
        for line in extra_lines:
            _safe_console_print(console, line, markup=False)
```
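The new module composes into a typical CLI flow like this; a usage sketch, where `run_evaluation` is a stand-in for real work and the `order` label is illustrative:

```python
import time

from invarlock.cli.output import (
    make_console,
    print_event,
    print_timing_summary,
    resolve_output_style,
    timed_step,
)


def run_evaluation() -> None:
    """Stand-in for real work."""
    time.sleep(0.1)


# "ci"/"ci_cpu"/"release" profiles resolve to the plain "audit" style;
# anything else defaults to "friendly" unless a style is passed explicitly.
style = resolve_output_style(style=None, profile="ci", progress=True, timing=True)
console = make_console()
timings: dict[str, float] = {}

with timed_step(
    console=console, style=style, timings=timings,
    key="eval", tag="STEP", message="evaluating",
):
    run_evaluation()

print_event(console, "PASS", "all gates passed", style=style)  # green when color is on
print_timing_summary(console, timings, style=style, order=[("Evaluate", "eval")])
```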
invarlock/cli/provenance.py
CHANGED
```diff
@@ -2,7 +2,7 @@
 
 Provides a tiny, versioned schema describing the adapter family and the
 underlying library versions. This does not perform any edits; it only reads
-environment and import metadata to annotate
+environment and import metadata to annotate evaluation artifacts.
 """
 
 from __future__ import annotations
@@ -31,7 +31,7 @@ _FAMILY_MAP: dict[str, tuple[str, str, list[str]]] = {
     "hf_awq": ("awq", "autoawq", []),
     "hf_bnb": ("bnb", "bitsandbytes", []),
     # ONNX stack (requires extras: invarlock[onnx])
-    "
+    "hf_causal_onnx": ("onnx", "onnxruntime", []),
 }
 
 
@@ -46,12 +46,12 @@ def extract_adapter_provenance(adapter_name: str) -> AdapterProvenance:
         msg = (
             None
             if supported
-            else f"Use Compare &
+            else f"Use Compare & Evaluate (BYOE); {library} version unsupported (tested: {tested})"
         )
     except Exception: # Package not installed or version unknown
         ver = None
         supported = False
-        msg = f"{library} not available; prefer Compare &
+        msg = f"{library} not available; prefer Compare & Evaluate (BYOE) or install extras."
 
     return AdapterProvenance(
         family=family,
```
invarlock/core/bootstrap.py
CHANGED
```diff
@@ -6,7 +6,7 @@ Numerically stable bootstrap helpers for evaluation metrics.
 
 This module provides bias-corrected and accelerated (BCa) confidence
 intervals tailored for paired log-loss statistics used by the runner
-and
+and evaluation reports.
 """
 
 from __future__ import annotations
```
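For context on the method the docstring names, here is a generic BCa confidence-interval sketch in NumPy/SciPy. It illustrates the standard algorithm only, not InvarLock's tailored paired-log-loss implementation:

```python
import numpy as np
from scipy.stats import norm


def bca_interval(
    x: np.ndarray,
    stat=np.mean,
    n_boot: int = 2000,
    alpha: float = 0.05,
    rng: np.random.Generator | None = None,
) -> tuple[float, float]:
    """Generic bias-corrected and accelerated (BCa) bootstrap CI."""
    rng = rng or np.random.default_rng(0)
    n = x.size
    theta = stat(x)
    boots = np.array([stat(rng.choice(x, n, replace=True)) for _ in range(n_boot)])
    # Bias correction from the fraction of bootstrap stats below the estimate
    # (smoothed to avoid infinite z0 at the extremes).
    z0 = norm.ppf((np.sum(boots < theta) + 0.5) / (n_boot + 1))
    # Acceleration from jackknife skewness.
    jack = np.array([stat(np.delete(x, i)) for i in range(n)])
    d = jack.mean() - jack
    a = (d**3).sum() / (6.0 * ((d**2).sum() ** 1.5 + 1e-12))
    # Adjust the nominal percentiles and read them off the bootstrap samples.
    z = norm.ppf([alpha / 2, 1 - alpha / 2])
    adj = norm.cdf(z0 + (z0 + z) / (1 - a * (z0 + z)))
    lo, hi = np.quantile(boots, adj)
    return float(lo), float(hi)
```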
invarlock/core/registry.py
CHANGED
```diff
@@ -140,23 +140,21 @@ class CoreRegistry:
         )
 
         # Register built-in adapters
-        _fallback(self._adapters, "hf_gpt2", "invarlock.adapters", "HF_GPT2_Adapter")
-        _fallback(self._adapters, "hf_bert", "invarlock.adapters", "HF_BERT_Adapter")
-        _fallback(self._adapters, "hf_llama", "invarlock.adapters", "HF_LLaMA_Adapter")
-        _fallback(self._adapters, "hf_t5", "invarlock.adapters", "HF_T5_Adapter")
         _fallback(
-            self._adapters, "
+            self._adapters, "hf_causal", "invarlock.adapters", "HF_Causal_Adapter"
         )
-
+        _fallback(self._adapters, "hf_mlm", "invarlock.adapters", "HF_MLM_Adapter")
         _fallback(
-            self._adapters,
-            "hf_causal_auto",
-            "invarlock.adapters",
-            "HF_Causal_Auto_Adapter",
+            self._adapters, "hf_seq2seq", "invarlock.adapters", "HF_Seq2Seq_Adapter"
         )
         _fallback(
-            self._adapters,
+            self._adapters,
+            "hf_causal_onnx",
+            "invarlock.adapters",
+            "HF_Causal_ONNX_Adapter",
+            required_deps=["optimum"],
         )
+        _fallback(self._adapters, "hf_auto", "invarlock.adapters", "HF_Auto_Adapter")
         # Optional plugin adapters (verify runtime dependencies)
         _fallback(
             self._adapters,
```
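For configs written against 0.3.6, these registrations imply a rename from architecture-specific adapters to family-level ones. A hedged migration helper, inferred from the file renames and registrations in this diff rather than from an official alias table:

```python
# Old (0.3.6) adapter name -> new (0.3.8) family-level adapter name.
# Inferred mapping; verify against your installed version before relying on it.
ADAPTER_RENAMES: dict[str, str] = {
    "hf_gpt2": "hf_causal",       # causal LMs  (HF_Causal_Adapter)
    "hf_llama": "hf_causal",
    "hf_bert": "hf_mlm",          # masked LMs  (HF_MLM_Adapter)
    "hf_t5": "hf_seq2seq",        # enc-dec LMs (HF_Seq2Seq_Adapter)
    "hf_causal_auto": "hf_auto",  # auto-detect (HF_Auto_Adapter)
}


def migrate_adapter_name(name: str) -> str:
    """Map a 0.3.6 adapter name to its 0.3.8 equivalent, if renamed."""
    return ADAPTER_RENAMES.get(name, name)
```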
invarlock/core/retry.py
CHANGED
```diff
@@ -2,11 +2,11 @@
 InvarLock Retry Controller
 =====================
 
-Manages retry logic for automated
+Manages retry logic for automated evaluation workflows with:
 - Attempt budgets (max 3 attempts default)
 - Time budgets (optional timeout)
 - Parameter adjustment strategies per edit type
--
+- Gate-driven retry decisions
 """
 
 from __future__ import annotations
@@ -19,7 +19,7 @@ __all__ = ["RetryController", "adjust_edit_params"]
 
 class RetryController:
     """
-    Controls retry logic for
+    Controls retry logic for evaluation-report-driven automation.
 
     Features:
     - Attempt budget enforcement (default 3 max)
@@ -45,18 +45,18 @@ class RetryController:
         self.start_time = time.time()
         self.attempt_history: list[dict[str, Any]] = []
 
-    def should_retry(self,
+    def should_retry(self, report_passed: bool) -> bool:
         """
         Determine if retry should be attempted.
 
         Args:
-
+            report_passed: Whether evaluation report gates passed
 
         Returns:
             True if retry should be attempted, False otherwise
         """
-        # If
-        if
+        # If report passed, no retry needed
+        if report_passed:
             return False
 
         # Check attempt budget (attempt count equals history length)
@@ -81,21 +81,21 @@ class RetryController:
     def record_attempt(
         self,
         attempt_num: int,
-
+        report_result: dict[str, Any],
         edit_params: dict[str, Any],
     ) -> None:
         """Record details of an attempt for tracking."""
-
+        report_result = report_result or {}
        edit_params = edit_params or {}
 
         self.attempt_history.append(
             {
                 "attempt": attempt_num,
                 "timestamp": time.time(),
-                "
+                "report_passed": report_result.get("passed", False),
                 "edit_params": edit_params.copy(),
-                "failures":
-                "validation":
+                "failures": report_result.get("failures", []),
+                "validation": report_result.get("validation", {}),
             }
         )
 
@@ -114,7 +114,7 @@ def adjust_edit_params(
     edit_name: str,
     edit_params: dict[str, Any],
     attempt: int,
-
+    report_result: dict[str, Any] | None = None,
 ) -> dict[str, Any]:
     """
     Adjust edit parameters for retry attempt based on edit type and failure mode.
@@ -126,7 +126,7 @@ def adjust_edit_params(
         edit_name: Name of the edit operation
         edit_params: Current edit parameters
         attempt: Attempt number (1-indexed)
-
+        report_result: Optional evaluation report result for failure analysis
 
     Returns:
         Adjusted parameters for next attempt
```
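The renamed `report_*` API composes into a driver loop roughly like this. A sketch only: `RetryController`'s constructor arguments are not shown in this diff, and the edit/eval call is a stand-in:

```python
from typing import Any

from invarlock.core.retry import RetryController, adjust_edit_params


def run_edit_and_evaluate(edit: str, params: dict[str, Any]) -> dict[str, Any]:
    """Stand-in: apply the edit and return an evaluation-report dict
    with the keys record_attempt expects (passed/failures/validation)."""
    return {"passed": True, "failures": [], "validation": {}}


controller = RetryController()  # assumed default construction; args not shown here
params: dict[str, Any] = {"bits": 4}  # illustrative quantization knob
attempt = 1
while True:
    result = run_edit_and_evaluate("quant_rtn", params)
    controller.record_attempt(attempt, result, params)
    if not controller.should_retry(result.get("passed", False)):
        break
    params = adjust_edit_params("quant_rtn", params, attempt, report_result=result)
    attempt += 1
```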
invarlock/core/runner.py
CHANGED
```diff
@@ -19,6 +19,11 @@ from typing import Any
 import numpy as np
 
 from invarlock.eval.tail_stats import evaluate_metric_tail
+from invarlock.observability.metrics import (
+    capture_memory_snapshot,
+    reset_peak_memory_stats,
+    summarize_memory_snapshots,
+)
 
 from .api import (
     EditLike,
@@ -190,6 +195,18 @@ class CoreRunner:
             pass
 
         report.status = RunStatus.RUNNING.value
+        timings: dict[str, float] = {}
+        guard_timings: dict[str, float] = {}
+        memory_snapshots: list[dict[str, Any]] = []
+        total_start = time.perf_counter()
+
+        def _record_timing(key: str, start: float) -> None:
+            timings[key] = max(0.0, float(time.perf_counter() - start))
+
+        def _capture_memory(phase: str) -> None:
+            snapshot = capture_memory_snapshot(phase)
+            if snapshot:
+                memory_snapshots.append(snapshot)
 
         try:
             # Log start
@@ -205,40 +222,78 @@ class CoreRunner:
             )
 
             # Phase 1: Prepare (describe model, create checkpoint)
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                model_desc = self._prepare_phase(model, adapter, report)
+            finally:
+                _record_timing("prepare", phase_start)
+                _capture_memory("prepare")
 
             # Phase 2: Prepare guards (must happen before edit)
-
-
-
-
-
-
-
-
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                self._prepare_guards_phase(
+                    model,
+                    adapter,
+                    guards,
+                    calibration_data,
+                    report,
+                    auto_config,
+                    config,
+                )
+            finally:
+                _record_timing("prepare_guards", phase_start)
+                _capture_memory("prepare_guards")
 
             # Phase 3: Apply edit
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                self._edit_phase(model, adapter, edit, model_desc, report, edit_config)
+            finally:
+                _record_timing("edit", phase_start)
+                _capture_memory("edit")
 
             # Phase 4: Run guards
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                guard_results = self._guard_phase(
+                    model, adapter, guards, report, guard_timings=guard_timings
+                )
+            finally:
+                _record_timing("guards", phase_start)
+                _capture_memory("guards")
 
             # Phase 5: Evaluate final metrics
-
-
-
-
-
-
-
-
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                metrics = self._eval_phase(
+                    model,
+                    adapter,
+                    calibration_data,
+                    report,
+                    preview_n,
+                    final_n,
+                    config,
+                )
+            finally:
+                _record_timing("eval", phase_start)
+                _capture_memory("eval")
 
             # Phase 6: Finalize or rollback
-
-
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                final_status = self._finalize_phase(
+                    model, adapter, guard_results, metrics, config, report
+                )
+            finally:
+                _record_timing("finalize", phase_start)
+                _capture_memory("finalize")
 
             report.status = final_status
             report.meta["end_time"] = time.time()
@@ -260,6 +315,25 @@ class CoreRunner:
                 return report
 
         finally:
+            _record_timing("total", total_start)
+            if not isinstance(report.metrics, dict):
+                report.metrics = {}
+            if timings:
+                report.metrics.setdefault("timings", {}).update(timings)
+            if guard_timings:
+                report.metrics["guard_timings"] = guard_timings
+            if memory_snapshots:
+                report.metrics["memory_snapshots"] = memory_snapshots
+                summary = summarize_memory_snapshots(memory_snapshots)
+                if summary:
+                    mem_peak = summary.get("memory_mb_peak")
+                    if isinstance(mem_peak, (int | float)):
+                        existing = report.metrics.get("memory_mb_peak")
+                        if isinstance(existing, (int | float)):
+                            summary["memory_mb_peak"] = max(
+                                float(existing), float(mem_peak)
+                            )
+                    report.metrics.update(summary)
             self._active_model = None
             self._active_adapter = None
             self._cleanup_services()
@@ -447,7 +521,7 @@ class CoreRunner:
                     f"Guard '{guard.name}' prepare failed: {e}"
                 ) from e
 
-        # Store resolved policies in report for
+        # Store resolved policies in report for evaluation report generation
         report.meta["tier_policies"] = tier_policies
 
         self._log_event(
@@ -455,7 +529,13 @@ class CoreRunner:
         )
 
     def _guard_phase(
-        self,
+        self,
+        model: Any,
+        adapter: ModelAdapter,
+        guards: list[Guard],
+        report: RunReport,
+        *,
+        guard_timings: dict[str, float] | None = None,
    ) -> dict[str, dict[str, Any]]:
         """Phase 4: Run safety guards."""
         self._log_event("guards", "start", LogLevel.INFO, {"count": len(guards)})
@@ -464,6 +544,7 @@ class CoreRunner:
 
         for guard in guards:
             self._log_event("guard", "start", LogLevel.INFO, {"guard": guard.name})
+            guard_start = time.perf_counter()
 
             if isinstance(guard, GuardWithContext):
                 try:
@@ -497,6 +578,11 @@ class CoreRunner:
                     LogLevel.ERROR,
                     {"guard": guard.name, "error": str(e)},
                 )
+            finally:
+                if guard_timings is not None:
+                    guard_timings[guard.name] = max(
+                        0.0, float(time.perf_counter() - guard_start)
+                    )
 
         report.guards = guard_results
 
```
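The same reset/measure/capture sequence repeats for all six phases; factored out, it is essentially this context manager. A sketch of the pattern only, not the runner's API:

```python
import time
from collections.abc import Callable, Iterator
from contextlib import contextmanager
from typing import Any


@contextmanager
def phase(
    name: str,
    timings: dict[str, float],
    snapshots: list[dict[str, Any]],
    capture: Callable[[str], dict[str, Any] | None],
    reset: Callable[[], None],
) -> Iterator[None]:
    """Reset peak-memory stats, run one phase, then record its wall time
    and a post-phase memory snapshot, even when the body raises."""
    reset()  # e.g. reset_peak_memory_stats
    start = time.perf_counter()
    try:
        yield
    finally:
        timings[name] = max(0.0, time.perf_counter() - start)
        snap = capture(name)  # e.g. capture_memory_snapshot
        if snap:
            snapshots.append(snap)


# Usage: each phase body collapses to a single `with` block.
timings: dict[str, float] = {}
snapshots: list[dict[str, Any]] = []
with phase("edit", timings, snapshots,
           capture=lambda p: {"phase": p},  # stand-in for the real probe
           reset=lambda: None):             # stand-in
    pass  # edit work goes here
```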
invarlock/edits/noop.py
CHANGED
```diff
@@ -1,7 +1,7 @@
-"""Built-in no-op edit used for baseline and Compare &
+"""Built-in no-op edit used for baseline and Compare & Evaluate (BYOE).
 
 This edit does not modify the model and reports zero deltas. It exists to
-support baseline runs and Compare &
+support baseline runs and Compare & Evaluate workflows where the subject
 checkpoint is produced outside of InvarLock.
 """
 
```