invarlock 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. invarlock/__init__.py +2 -2
  2. invarlock/adapters/__init__.py +10 -14
  3. invarlock/adapters/auto.py +35 -40
  4. invarlock/adapters/capabilities.py +2 -2
  5. invarlock/adapters/hf_causal.py +418 -0
  6. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  7. invarlock/adapters/hf_mixin.py +25 -4
  8. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  9. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  10. invarlock/cli/adapter_auto.py +31 -21
  11. invarlock/cli/app.py +73 -2
  12. invarlock/cli/commands/certify.py +600 -59
  13. invarlock/cli/commands/doctor.py +8 -10
  14. invarlock/cli/commands/plugins.py +13 -9
  15. invarlock/cli/commands/report.py +233 -69
  16. invarlock/cli/commands/run.py +907 -183
  17. invarlock/cli/commands/verify.py +76 -11
  18. invarlock/cli/config.py +1 -1
  19. invarlock/cli/doctor_helpers.py +4 -5
  20. invarlock/cli/output.py +193 -0
  21. invarlock/cli/provenance.py +1 -1
  22. invarlock/core/bootstrap.py +1 -1
  23. invarlock/core/registry.py +9 -11
  24. invarlock/core/runner.py +111 -25
  25. invarlock/edits/quant_rtn.py +65 -37
  26. invarlock/eval/bench.py +3 -3
  27. invarlock/eval/data.py +68 -23
  28. invarlock/eval/metrics.py +59 -1
  29. invarlock/eval/tasks/__init__.py +12 -0
  30. invarlock/eval/tasks/classification.py +48 -0
  31. invarlock/eval/tasks/qa.py +36 -0
  32. invarlock/eval/tasks/text_generation.py +102 -0
  33. invarlock/guards/invariants.py +19 -10
  34. invarlock/guards/rmt.py +2 -2
  35. invarlock/guards/variance.py +2 -2
  36. invarlock/model_profile.py +48 -27
  37. invarlock/observability/health.py +6 -6
  38. invarlock/observability/metrics.py +108 -0
  39. invarlock/reporting/certificate.py +159 -9
  40. invarlock/reporting/certificate_schema.py +1 -1
  41. invarlock/reporting/guards_analysis.py +154 -4
  42. invarlock/reporting/html.py +55 -5
  43. invarlock/reporting/normalizer.py +7 -0
  44. invarlock/reporting/render.py +791 -431
  45. invarlock/reporting/report.py +39 -3
  46. invarlock/reporting/report_types.py +6 -1
  47. invarlock/reporting/telemetry.py +86 -0
  48. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/METADATA +23 -9
  49. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/RECORD +53 -48
  50. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
  51. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
  52. invarlock/adapters/hf_gpt2.py +0 -404
  53. invarlock/adapters/hf_llama.py +0 -487
  54. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
  55. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
invarlock/edits/quant_rtn.py CHANGED
@@ -86,6 +86,31 @@ class RTNQuantEdit(ModelEdit):
 
         # group_size is currently reserved for potential future variants; it is
         # ignored for the built-in INT8 demo edit.
+        self._emit_enabled = True
+        self._emit_console = None
+        self._output_style = None
+
+    def _configure_output(self, **kwargs: Any) -> None:
+        emit = kwargs.get("emit", True)
+        self._emit_enabled = bool(emit)
+        console = kwargs.get("console")
+        if console is not None and hasattr(console, "print"):
+            self._emit_console = console
+        else:
+            self._emit_console = None
+        self._output_style = kwargs.get("output_style")
+
+    def _emit(self, message: str) -> None:
+        if not self._emit_enabled:
+            return
+        line = f"[EDIT] {message}".rstrip()
+        if self._emit_console is not None:
+            try:
+                self._emit_console.print(line, markup=False)
+            except TypeError:
+                self._emit_console.print(line)
+        else:
+            print(line)
 
     def can_edit(self, model_desc: dict[str, Any]) -> bool:
         """Check if RTN quantization can be applied to this model."""
@@ -233,15 +258,18 @@ class RTNQuantEdit(ModelEdit):
         scope = kwargs.get("scope", self.scope)
         seed = kwargs.get("seed", self.seed)
 
+        self._configure_output(**kwargs)
+
         # Diagnostic reporting
-        print("🔧 RTN Quantization Configuration:")
-        print(
-            f" Bitwidth: {bitwidth} (from config: {kwargs.get('bitwidth', kwargs.get('bits', 'default'))})"
+        self._emit("RTN Quantization Configuration:")
+        self._emit(
+            "Bitwidth: "
+            f"{bitwidth} (from config: {kwargs.get('bitwidth', kwargs.get('bits', 'default'))})"
         )
-        print(f" Scope: {scope}")
-        print(f" Group size: {group_size}")
-        print(f" Clamp ratio: {clamp_ratio}")
-        print(f" Seed: {seed}")
+        self._emit(f"Scope: {scope}")
+        self._emit(f"Group size: {group_size}")
+        self._emit(f"Clamp ratio: {clamp_ratio}")
+        self._emit(f"Seed: {seed}")
 
         # Persist configuration overrides for downstream helpers
         self.bitwidth = bitwidth
@@ -256,22 +284,22 @@ class RTNQuantEdit(ModelEdit):
         np.random.seed(seed)
 
         # Identify target modules and get weight tying map
-        print(f"🎯 Identifying target modules for scope '{scope}'...")
+        self._emit(f"Identifying target modules for scope '{scope}'...")
         target_modules = self._identify_target_modules(model)
         total_identified = len(target_modules)
 
         max_modules = kwargs.get("max_modules")
         if isinstance(max_modules, int) and max_modules > 0:
             if max_modules < total_identified:
-                print(
-                    f" Limiting quantization to first {max_modules} modules "
+                self._emit(
+                    f"Limiting quantization to first {max_modules} modules "
                     f"(of {total_identified}) based on plan.max_modules"
                 )
                 target_modules = target_modules[:max_modules]
                 self.max_modules = max_modules
             else:
-                print(
-                    f" max_modules={max_modules} >= available modules "
+                self._emit(
+                    f"max_modules={max_modules} >= available modules "
                     f"({total_identified}); using all targets"
                 )
                 self.max_modules = None
@@ -280,33 +308,35 @@ class RTNQuantEdit(ModelEdit):
 
         tying_map = self._get_weight_tying_map(model)
 
-        print(f" Found {len(target_modules)} target modules:")
+        self._emit(f"Found {len(target_modules)} target modules:")
         for i, (name, module) in enumerate(target_modules):
             weight_shape = module.weight.shape
             param_count = module.weight.numel()
-            print(f" [{i + 1}] {name}: {weight_shape} ({param_count:,} params)")
+            self._emit(f"[{i + 1}] {name}: {weight_shape} ({param_count:,} params)")
 
         if len(target_modules) == 0:
-            print("❌ WARNING: No target modules found! Check scope configuration.")
-            print(" Available linear modules:")
+            self._emit(
+                "WARNING: No target modules found! Check scope configuration."
+            )
+            self._emit("Available linear modules:")
             linear_modules = []
             for name, module in model.named_modules():
                 if isinstance(module, nn.Linear | nn.Conv1d):
                     linear_modules.append((name, module.weight.shape))
             for name, shape in linear_modules[:10]:  # Show first 10
-                print(f" {name}: {shape}")
+                self._emit(f"{name}: {shape}")
             if len(linear_modules) > 10:
-                print(f" ... and {len(linear_modules) - 10} more")
+                self._emit(f"... and {len(linear_modules) - 10} more")
 
         # Execute GuardChain before edit (if provided)
         guard_results = {}
         if self.guard_chain is not None:
-            print(" Executing guard chain preparation...")
+            self._emit("Executing guard chain preparation...")
            guard_results["prepare"] = self.guard_chain.prepare_all(
                 model, adapter, None, {}
             )
 
-            print(" Executing before-edit guards...")
+            self._emit("Executing before-edit guards...")
             self.guard_chain.before_edit_all(model)
 
         # Apply quantization to each target module
@@ -314,12 +344,12 @@ class RTNQuantEdit(ModelEdit):
         total_params_quantized = 0
 
         for i, (module_name, module) in enumerate(target_modules):
-            print(f" [{i + 1}/{len(target_modules)}] Quantizing: {module_name}")
-            print(
-                f" Shape: {module.weight.shape}, Params: {module.weight.numel():,}"
+            self._emit(f"[{i + 1}/{len(target_modules)}] Quantizing: {module_name}")
+            self._emit(
+                f"Shape: {module.weight.shape}, Params: {module.weight.numel():,}"
             )
-            print(
-                f" Weight range: [{module.weight.min():.4f}, {module.weight.max():.4f}]"
+            self._emit(
+                f"Weight range: [{module.weight.min():.4f}, {module.weight.max():.4f}]"
             )
 
             # Apply RTN quantization
@@ -335,24 +365,22 @@ class RTNQuantEdit(ModelEdit):
             quantization_results.append(quant_result)
             total_params_quantized += quant_result["params_quantized"]
 
-            print(
-                f" ✓ Quantized {quant_result['params_quantized']:,} parameters"
-            )
+            self._emit(f"Quantized {quant_result['params_quantized']:,} parameters")
 
         # Execute GuardChain after edit (if provided)
         if self.guard_chain is not None:
-            print(" Executing after-edit guards...")
+            self._emit("Executing after-edit guards...")
             self.guard_chain.after_edit_all(model)
 
-            print(" Finalizing guard chain...")
+            self._emit("Finalizing guard chain...")
             guard_results["finalize"] = self.guard_chain.finalize_all(model)
 
             # Check if all guards passed
             if not self.guard_chain.all_passed(guard_results["finalize"]):
-                print(" ⚠️ Guard chain validation failed!")
+                self._emit("Guard chain validation failed!")
                 guard_results["all_passed"] = False
             else:
-                print("All guards passed")
+                self._emit("All guards passed")
                 guard_results["all_passed"] = True
 
         # Create bitwidth map
@@ -490,11 +518,11 @@ class RTNQuantEdit(ModelEdit):
 
         # Log diagnostic information
         if skipped_modules:
-            print(f" Skipped {len(skipped_modules)} modules:")
+            self._emit(f"Skipped {len(skipped_modules)} modules:")
             for name, reason in skipped_modules[:5]:  # Show first 5
-                print(f" {name}: {reason}")
+                self._emit(f"{name}: {reason}")
             if len(skipped_modules) > 5:
-                print(f" ... and {len(skipped_modules) - 5} more")
+                self._emit(f"... and {len(skipped_modules) - 5} more")
 
         return target_modules
 
@@ -625,7 +653,7 @@ class RTNQuantEdit(ModelEdit):
         # Ensure actual quantization occurred by applying quantization loss
         # This guarantees the weights are actually modified
         quantization_error = (quantized_weight - original_weight).abs().mean()
-        print(f" Quantization error: {quantization_error:.6f}")
+        self._emit(f"Quantization error: {quantization_error:.6f}")
 
         # Write back to module (preserving tying if needed)
         module.weight.data.copy_(quantized_weight)
@@ -634,7 +662,7 @@ class RTNQuantEdit(ModelEdit):
         final_weight = module.weight.data
         actual_change = not torch.allclose(original_weight, final_weight, atol=1e-6)
         if not actual_change:
-            print(f" WARNING: No actual weight change detected for {module}")
+            self._emit(f"WARNING: No actual weight change detected for {module}")
 
         # Handle tied weights
         if tied_modules:
invarlock/eval/bench.py CHANGED
@@ -47,7 +47,7 @@ class ScenarioConfig:
     probes: int
     profile: str = "ci"  # "ci" or "release"
     model_id: str = "gpt2"
-    adapter: str = "hf_gpt2"
+    adapter: str = "hf_causal"
     device: str = "auto"
     seq_len: int = 512
     stride: int = 128
@@ -81,7 +81,7 @@ class BenchmarkConfig:
     profile: str = "ci"  # "ci" or "release"
     dataset: str = "wikitext2"
     model_id: str = "gpt2"
-    adapter: str = "hf_gpt2"
+    adapter: str = "hf_causal"
     device: str = "auto"
     seq_len: int = 512
     stride: int = 128
@@ -1423,7 +1423,7 @@ def main():
         "--dataset", default="wikitext2", help="Dataset to use for benchmarking"
     )
     parser.add_argument("--model-id", default="gpt2", help="Model identifier")
-    parser.add_argument("--adapter", default="hf_gpt2", help="Model adapter to use")
+    parser.add_argument("--adapter", default="hf_causal", help="Model adapter to use")
     parser.add_argument(
         "--device", default="auto", help="Device to use (auto|cuda|mps|cpu)"
     )
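These defaults complete the move from the per-architecture `hf_gpt2` adapter to the consolidated `hf_causal` one (the `hf_gpt2.py` and `hf_llama.py` adapters are deleted in this release, per the file list). A small sketch of the new default, assuming `ScenarioConfig` is a dataclass as its annotated-field syntax suggests and that `probes` is its only required field above this hunk, which the context shown here cannot confirm:

```python
from invarlock.eval.bench import ScenarioConfig

# The adapter default now resolves to the consolidated causal-LM adapter.
cfg = ScenarioConfig(probes=8)
assert cfg.adapter == "hf_causal"

# Other causal models go through the same adapter rather than a
# per-architecture one; "hf_gpt2" itself no longer ships in 0.3.7.
cfg_llama = ScenarioConfig(probes=8, model_id="meta-llama/Llama-2-7b-hf")
assert cfg_llama.adapter == "hf_causal"
```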
invarlock/eval/data.py CHANGED
@@ -15,7 +15,7 @@ import time
 import warnings
 from abc import abstractmethod
 from collections import Counter
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from pathlib import Path
 from typing import Any, NamedTuple, Protocol
 
@@ -56,6 +56,9 @@ except ImportError:
     HAS_TORCH = False
 
 
+EventEmitter = Callable[[str, str, str | None], None]
+
+
 class EvaluationWindow(NamedTuple):
     """A window of tokenized samples for evaluation."""
 
@@ -166,6 +169,7 @@ class WikiText2Provider:
         self,
         cache_dir: Path | None = None,
         device_hint: str | None = None,
+        emit: EventEmitter | None = None,
         **_: Any,
     ):
         """
@@ -175,6 +179,7 @@ class WikiText2Provider:
             cache_dir: Optional cache directory for dataset storage
         """
         self.cache_dir = cache_dir
+        self._emit_event = emit
         self._validate_dependencies()
         self._last_stratification_stats: dict[str, Any] | None = None
         self._last_batch_size_used: int = 0
@@ -186,6 +191,20 @@ class WikiText2Provider:
         normalized_hint = (device_hint or "").strip().lower()
         self._device_hint: str | None = normalized_hint or None
 
+    def _event(self, tag: str, message: str, *, emoji: str | None = None) -> None:
+        """Emit a dataset event via an optional CLI-provided sink."""
+        if self._emit_event is None:
+            if emoji:
+                print(f"{emoji} {message}")
+            else:
+                print(message)
+            return
+        try:
+            self._emit_event(tag, message, emoji)
+        except TypeError:
+            # Back-compat: tolerate sinks that only accept (tag, message).
+            self._emit_event(tag, message)  # type: ignore[misc]
+
     def _validate_dependencies(self) -> None:
         """Check that required dependencies are available."""
         if not HAS_DATASETS:
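The sink contract is positional `(tag, message, emoji)`, with a `TypeError` fallback for older two-argument sinks and a plain `print` fallback when no sink is given. A minimal conforming sink, assuming only the constructor signature shown above; the logging setup itself is illustrative:

```python
import logging

from invarlock.eval.data import WikiText2Provider

logger = logging.getLogger("invarlock.data")


def log_sink(tag: str, message: str, emoji: str | None = None) -> None:
    # The emoji is presentation-only; a logging sink can simply drop it.
    logger.info("[%s] %s", tag, message)


provider = WikiText2Provider(emit=log_sink)  # dataset events now bypass print()
```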
@@ -319,7 +338,11 @@ class WikiText2Provider:
         Returns:
             List of filtered text strings
         """
-        print(f"📚 Loading WikiText-2 {split} split...")
+        self._event(
+            "DATA",
+            f"WikiText-2 {split}: loading split...",
+            emoji="📚",
+        )
 
         # Serve from cache when possible (load the largest slice once)
         cached = self._texts_cache.get(split)
@@ -366,7 +389,10 @@ class WikiText2Provider:
         if prev is None or len(valid_texts) > len(prev):
             self._texts_cache[split] = list(valid_texts)
 
-        print(f" ✓ Loaded {len(valid_texts)} valid samples from {len(dataset)} total")
+        self._event(
+            "DATA",
+            f"Loaded {len(valid_texts)}/{len(dataset)} valid samples",
+        )
         return valid_texts
 
     def windows(
@@ -435,9 +461,13 @@ class WikiText2Provider:
         cursor = 0
         chunk_size = max(64, min(256, target_pool))
 
-        print(" 📊 Creating evaluation windows:")
-        print(f" Requested preview/final: {preview_n}/{final_n}")
-        print(f" Sampling pool target: {target_pool} (reserve {reserve})")
+        self._event(
+            "DATA",
+            "Creating evaluation windows:",
+            emoji="📊",
+        )
+        self._event("DATA", f"Requested preview/final: {preview_n}/{final_n}")
+        self._event("DATA", f"Sampling pool target: {target_pool} (reserve {reserve})")
 
         while len(candidates) < total_required + reserve and cursor < len(
             shuffled_indices
@@ -708,9 +738,9 @@ class WikiText2Provider:
             ),
         }
 
-        print(f" Seed: {seed}, Seq length: {seq_len}")
-        print(f" Preview: {len(preview_window)} samples")
-        print(f" Final: {len(final_window)} samples")
+        self._event("DATA", f"Seed: {seed}, Seq length: {seq_len}")
+        self._event("DATA", f"Preview: {len(preview_window)} samples")
+        self._event("DATA", f"Final: {len(final_window)} samples")
 
         return preview_window, final_window
 
@@ -840,8 +870,9 @@ class WikiText2Provider:
         attention_masks_list = [entry[2] for entry in collected]
         valid_indices = [entry[0] for entry in collected]
 
-        print(
-            f" ✓ {window_name}: {len(valid_indices)}/{len(indices)} samples tokenized successfully"
+        self._event(
+            "DATA",
+            f"{window_name}: {len(valid_indices)}/{len(indices)} samples tokenized",
         )
 
         return EvaluationWindow(
@@ -934,7 +965,8 @@ class SyntheticProvider:
         self, split: str = "validation", max_samples: int = 500, **kwargs
     ) -> list[str]:
         """Generate synthetic text samples."""
-        # Expand base samples to meet requirement
+        # Expand base samples to meet requirement, preferring unique variations
+        # to avoid duplicate-token windows (important for stratified pairing).
         expanded_samples: list[str] = []
         variations = [
             lambda s: s,
@@ -944,18 +976,25 @@ class SyntheticProvider:
             lambda s: f"Furthermore, {s.lower()}",
             lambda s: f"In addition, {s.lower()}",
         ]
-
-        # Use a deterministic approach based on max_samples
-        rng = np.random.RandomState(42)  # Fixed seed for reproducibility
-
-        while len(expanded_samples) < max_samples:
+        # Deterministic coverage of (variation × base sample) combinations first.
+        for variation in variations:
             for base_text in self.base_samples:
-                if len(expanded_samples) >= max_samples:
-                    break
-                variation = rng.choice(variations)
                 expanded_samples.append(variation(base_text))
+                if len(expanded_samples) >= max_samples:
+                    return expanded_samples
+
+        # If callers request more than the unique combination space, keep
+        # extending deterministically while ensuring uniqueness via a suffix.
+        idx = 0
+        while len(expanded_samples) < max_samples:
+            base_text = self.base_samples[idx % len(self.base_samples)]
+            variation = variations[(idx // len(self.base_samples)) % len(variations)]
+            expanded_samples.append(
+                f"{variation(base_text)} [synthetic #{len(expanded_samples)}]"
+            )
+            idx += 1
 
-        return expanded_samples[:max_samples]
+        return expanded_samples
 
     def windows(
         self,
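The rewritten expansion walks the full variation-by-base-sample grid in a fixed order and only then falls back to index-suffixed repeats, so the output is deterministic and duplicate-free without an RNG. A standalone sketch of the same ordering, using toy data rather than the provider's actual base samples:

```python
# Standalone re-implementation of the expansion order above (illustrative only).
base = ["alpha", "beta"]
variations = [lambda s: s, lambda s: f"Moreover, {s}"]

samples: list[str] = []
for variation in variations:  # cover the full grid first: 4 unique samples
    for text in base:
        samples.append(variation(text))

idx = 0
while len(samples) < 6:  # overflow stays deterministic and unique via the suffix
    text = base[idx % len(base)]
    variation = variations[(idx // len(base)) % len(variations)]
    samples.append(f"{variation(text)} [synthetic #{len(samples)}]")
    idx += 1

assert samples[:4] == ["alpha", "beta", "Moreover, alpha", "Moreover, beta"]
assert samples[4] == "alpha [synthetic #4]"
```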
@@ -1801,12 +1840,15 @@ _PROVIDERS: dict[str, type] = {
 }
 
 
-def get_provider(name: str, **kwargs) -> DatasetProvider:
+def get_provider(
+    name: str, *, emit: EventEmitter | None = None, **kwargs: Any
+) -> DatasetProvider:
     """
     Get a dataset provider by name.
 
     Args:
         name: Provider name ("wikitext2", "synthetic")
+        emit: Optional event sink for dataset/provider logs.
         **kwargs: Provider-specific initialization parameters
 
     Returns:
@@ -1825,7 +1867,10 @@ def get_provider(name: str, **kwargs) -> DatasetProvider:
         )
 
     provider_class = _PROVIDERS[name]
-    return provider_class(**kwargs)
+    init_kwargs = dict(kwargs)
+    if emit is not None and name == "wikitext2":
+        init_kwargs["emit"] = emit
+    return provider_class(**init_kwargs)
 
 
 def list_providers() -> list[str]:
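Note that `emit` is keyword-only and is forwarded only to the `wikitext2` provider; all other providers are constructed exactly as before. Hypothetical usage, assuming the public import path matches this module:

```python
from invarlock.eval.data import get_provider


def sink(tag: str, message: str, emoji: str | None = None) -> None:
    print(f"[{tag}] {message}")


wikitext = get_provider("wikitext2", emit=sink)   # sink forwarded to the provider
synthetic = get_provider("synthetic", emit=sink)  # sink accepted but not forwarded
```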
invarlock/eval/metrics.py CHANGED
@@ -18,9 +18,10 @@ import gc
 import logging
 import math
 import time
+from collections.abc import Iterable
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any
+from typing import Any, Protocol
 
 import numpy as np
 import psutil
@@ -2269,6 +2270,57 @@ def analyze_rmt_changes(
         return {"error": str(e)}
 
 
+class Metric(Protocol):
+    name: str
+    kind: str  # "ppl", "accuracy", "exact_match", "bleu", "rouge"
+
+    def compute(self, model: Any, dataset: Iterable[dict[str, Any]]) -> float: ...
+
+
+class PerplexityMetric:
+    """Lightweight perplexity metric from per-record logloss + token counts."""
+
+    name = "perplexity"
+    kind = "ppl"
+
+    def compute(self, model: Any, dataset: Iterable[dict[str, Any]]) -> float:  # noqa: ARG002
+        total_loss = 0.0
+        total_tokens = 0.0
+        for record in dataset:
+            if not isinstance(record, dict):
+                continue
+            loss = record.get("logloss", record.get("loss"))
+            tokens = record.get("token_count", record.get("tokens", 1))
+            try:
+                loss_val = float(loss)
+                tok_val = float(tokens)
+            except Exception:
+                continue
+            if (
+                not math.isfinite(loss_val)
+                or not math.isfinite(tok_val)
+                or tok_val <= 0
+            ):
+                continue
+            total_loss += loss_val * tok_val
+            total_tokens += tok_val
+        if total_tokens <= 0:
+            return float("nan")
+        return float(math.exp(total_loss / total_tokens))
+
+
+class AccuracyMetric:
+    """Classification accuracy metric from label/prediction records."""
+
+    name = "accuracy"
+    kind = "accuracy"
+
+    def compute(self, model: Any, dataset: Iterable[dict[str, Any]]) -> float:  # noqa: ARG002
+        from invarlock.eval.tasks.classification import accuracy_from_records
+
+        return accuracy_from_records(dataset)
+
+
 # ── Integration with existing system ───────────────────────────────────────
 
 # Update exports to include new functions (add to existing __all__ if it exists)
@@ -2282,6 +2334,9 @@ try:
             "compute_parameter_deltas",
             "analyze_spectral_changes",
             "analyze_rmt_changes",
+            "Metric",
+            "PerplexityMetric",
+            "AccuracyMetric",
         ]
     )
 except NameError:
@@ -2294,4 +2349,7 @@ except NameError:
         "compute_parameter_deltas",
         "analyze_spectral_changes",
         "analyze_rmt_changes",
+        "Metric",
+        "PerplexityMetric",
+        "AccuracyMetric",
     ]
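The `Metric` protocol is duck-typed over plain record dicts, so metrics can run without touching the model (hence the ignored `model` argument). `PerplexityMetric` computes the exponential of the token-weighted mean logloss, exp(sum(loss_i * tok_i) / sum(tok_i)), skipping malformed or non-finite records. A small worked example under that reading:

```python
import math

from invarlock.eval.metrics import PerplexityMetric

records = [
    {"logloss": 2.0, "token_count": 100},
    {"logloss": 3.0, "token_count": 300},
    {"logloss": float("nan"), "token_count": 50},  # skipped: non-finite loss
]

ppl = PerplexityMetric().compute(model=None, dataset=records)
# Token-weighted mean logloss = (2.0*100 + 3.0*300) / 400 = 2.75
assert math.isclose(ppl, math.exp(2.75))
```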
invarlock/eval/tasks/__init__.py ADDED
@@ -0,0 +1,12 @@
+from __future__ import annotations
+
+from .classification import accuracy_from_records
+from .qa import exact_match_from_records
+from .text_generation import bleu1_from_records, rouge_l_from_records
+
+__all__ = [
+    "accuracy_from_records",
+    "exact_match_from_records",
+    "bleu1_from_records",
+    "rouge_l_from_records",
+]
invarlock/eval/tasks/classification.py ADDED
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+from collections.abc import Iterable
+from typing import Any
+
+
+def _iter_pairs(record: dict[str, Any]) -> list[tuple[Any, Any]]:
+    if "correct" in record:
+        return [(bool(record.get("correct")), True)]
+
+    label = record.get("label")
+    pred = record.get("prediction")
+    if label is None:
+        label = record.get("labels")
+    if pred is None:
+        pred = record.get("pred")
+    if pred is None:
+        pred = record.get("predictions")
+
+    if isinstance(label, list) and isinstance(pred, list):
+        return list(zip(label, pred, strict=False))
+    if label is None or pred is None:
+        return []
+    return [(label, pred)]
+
+
+def accuracy_from_records(records: Iterable[dict[str, Any]]) -> float:
+    """Compute accuracy from records with labels/predictions.
+
+    Accepted record shapes:
+    - {"label": <label>, "prediction": <label>}
+    - {"labels": [...], "predictions": [...]}
+    - {"correct": <bool>}
+    """
+    total = 0
+    correct = 0
+    for record in records:
+        if not isinstance(record, dict):
+            continue
+        for label, pred in _iter_pairs(record):
+            total += 1
+            if isinstance(label, bool):
+                correct += int(label is pred)
+            else:
+                correct += int(label == pred)
+    if total == 0:
+        return float("nan")
+    return float(correct / total)
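A quick sketch of the three accepted record shapes; batched pairs are scored individually, and a `correct` record counts as one pre-judged pair:

```python
from invarlock.eval.tasks.classification import accuracy_from_records

records = [
    {"label": "pos", "prediction": "pos"},       # scalar pair: correct
    {"labels": [0, 1], "predictions": [0, 0]},   # batched pair: 1 of 2 correct
    {"correct": False},                          # pre-judged record: incorrect
]

# 2 correct out of 4 scored pairs -> 0.5
assert accuracy_from_records(records) == 0.5
```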
invarlock/eval/tasks/qa.py ADDED
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+from collections.abc import Iterable
+from typing import Any
+
+
+def _normalize(text: str) -> str:
+    return " ".join(str(text).strip().lower().split())
+
+
+def exact_match_from_records(records: Iterable[dict[str, Any]]) -> float:
+    """Compute exact-match accuracy for QA-style records.
+
+    Accepted record shapes:
+    - {"prediction": "...", "answer": "..."}
+    - {"prediction": "...", "answers": ["...", ...]}
+    """
+    total = 0
+    correct = 0
+    for record in records:
+        if not isinstance(record, dict):
+            continue
+        pred = record.get("prediction")
+        answers = record.get("answers")
+        if answers is None and "answer" in record:
+            answers = [record.get("answer")]
+        if pred is None or answers is None:
+            continue
+        pred_norm = _normalize(pred)
+        answer_list = answers if isinstance(answers, list) else [answers]
+        total += 1
+        if any(_normalize(a) == pred_norm for a in answer_list if a is not None):
+            correct += 1
+    if total == 0:
+        return float("nan")
+    return float(correct / total)
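Normalization lowercases, trims, and collapses whitespace before comparison, and a prediction matching any answer alias counts. A small usage sketch:

```python
from invarlock.eval.tasks.qa import exact_match_from_records

records = [
    {"prediction": "  Paris ", "answer": "paris"},         # matches after normalization
    {"prediction": "42", "answers": ["forty-two", "42"]},  # any alias counts
    {"prediction": "London", "answer": "Berlin"},          # miss
]

# 2 exact matches out of 3 scored records
assert abs(exact_match_from_records(records) - 2 / 3) < 1e-9
```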