invarlock 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. invarlock/__init__.py +2 -2
  2. invarlock/_data/runtime/tiers.yaml +57 -30
  3. invarlock/adapters/__init__.py +11 -15
  4. invarlock/adapters/auto.py +35 -40
  5. invarlock/adapters/capabilities.py +2 -2
  6. invarlock/adapters/hf_causal.py +418 -0
  7. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  8. invarlock/adapters/hf_mixin.py +25 -4
  9. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  10. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  11. invarlock/calibration/spectral_null.py +15 -10
  12. invarlock/calibration/variance_ve.py +0 -2
  13. invarlock/cli/adapter_auto.py +31 -21
  14. invarlock/cli/app.py +73 -2
  15. invarlock/cli/commands/calibrate.py +6 -2
  16. invarlock/cli/commands/certify.py +651 -91
  17. invarlock/cli/commands/doctor.py +11 -11
  18. invarlock/cli/commands/explain_gates.py +57 -8
  19. invarlock/cli/commands/plugins.py +13 -9
  20. invarlock/cli/commands/report.py +233 -69
  21. invarlock/cli/commands/run.py +1066 -244
  22. invarlock/cli/commands/verify.py +154 -15
  23. invarlock/cli/config.py +22 -6
  24. invarlock/cli/doctor_helpers.py +4 -5
  25. invarlock/cli/output.py +193 -0
  26. invarlock/cli/provenance.py +1 -1
  27. invarlock/core/api.py +45 -5
  28. invarlock/core/auto_tuning.py +65 -20
  29. invarlock/core/bootstrap.py +1 -1
  30. invarlock/core/contracts.py +7 -1
  31. invarlock/core/registry.py +11 -13
  32. invarlock/core/runner.py +425 -75
  33. invarlock/edits/quant_rtn.py +65 -37
  34. invarlock/eval/bench.py +3 -16
  35. invarlock/eval/data.py +82 -51
  36. invarlock/eval/metrics.py +63 -2
  37. invarlock/eval/primary_metric.py +23 -0
  38. invarlock/eval/tail_stats.py +230 -0
  39. invarlock/eval/tasks/__init__.py +12 -0
  40. invarlock/eval/tasks/classification.py +48 -0
  41. invarlock/eval/tasks/qa.py +36 -0
  42. invarlock/eval/tasks/text_generation.py +102 -0
  43. invarlock/guards/_estimators.py +154 -0
  44. invarlock/guards/invariants.py +19 -10
  45. invarlock/guards/policies.py +16 -6
  46. invarlock/guards/rmt.py +627 -546
  47. invarlock/guards/spectral.py +348 -110
  48. invarlock/guards/tier_config.py +32 -30
  49. invarlock/guards/variance.py +7 -31
  50. invarlock/guards_ref/rmt_ref.py +23 -23
  51. invarlock/model_profile.py +90 -42
  52. invarlock/observability/health.py +6 -6
  53. invarlock/observability/metrics.py +108 -0
  54. invarlock/reporting/certificate.py +384 -55
  55. invarlock/reporting/certificate_schema.py +3 -2
  56. invarlock/reporting/dataset_hashing.py +15 -2
  57. invarlock/reporting/guards_analysis.py +350 -277
  58. invarlock/reporting/html.py +55 -5
  59. invarlock/reporting/normalizer.py +13 -0
  60. invarlock/reporting/policy_utils.py +38 -36
  61. invarlock/reporting/primary_metric_utils.py +71 -17
  62. invarlock/reporting/render.py +852 -431
  63. invarlock/reporting/report.py +40 -4
  64. invarlock/reporting/report_types.py +11 -3
  65. invarlock/reporting/telemetry.py +86 -0
  66. invarlock/reporting/validate.py +1 -18
  67. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/METADATA +27 -13
  68. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/RECORD +72 -65
  69. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
  70. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
  71. invarlock/adapters/hf_gpt2.py +0 -404
  72. invarlock/adapters/hf_llama.py +0 -487
  73. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
  74. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
@@ -86,6 +86,31 @@ class RTNQuantEdit(ModelEdit):
 
         # group_size is currently reserved for potential future variants; it is
         # ignored for the built-in INT8 demo edit.
+        self._emit_enabled = True
+        self._emit_console = None
+        self._output_style = None
+
+    def _configure_output(self, **kwargs: Any) -> None:
+        emit = kwargs.get("emit", True)
+        self._emit_enabled = bool(emit)
+        console = kwargs.get("console")
+        if console is not None and hasattr(console, "print"):
+            self._emit_console = console
+        else:
+            self._emit_console = None
+        self._output_style = kwargs.get("output_style")
+
+    def _emit(self, message: str) -> None:
+        if not self._emit_enabled:
+            return
+        line = f"[EDIT] {message}".rstrip()
+        if self._emit_console is not None:
+            try:
+                self._emit_console.print(line, markup=False)
+            except TypeError:
+                self._emit_console.print(line)
+        else:
+            print(line)
 
     def can_edit(self, model_desc: dict[str, Any]) -> bool:
         """Check if RTN quantization can be applied to this model."""
@@ -233,15 +258,18 @@ class RTNQuantEdit(ModelEdit):
         scope = kwargs.get("scope", self.scope)
         seed = kwargs.get("seed", self.seed)
 
+        self._configure_output(**kwargs)
+
         # Diagnostic reporting
-        print("🔧 RTN Quantization Configuration:")
-        print(
-            f" Bitwidth: {bitwidth} (from config: {kwargs.get('bitwidth', kwargs.get('bits', 'default'))})"
+        self._emit("RTN Quantization Configuration:")
+        self._emit(
+            "Bitwidth: "
+            f"{bitwidth} (from config: {kwargs.get('bitwidth', kwargs.get('bits', 'default'))})"
         )
-        print(f" Scope: {scope}")
-        print(f" Group size: {group_size}")
-        print(f" Clamp ratio: {clamp_ratio}")
-        print(f" Seed: {seed}")
+        self._emit(f"Scope: {scope}")
+        self._emit(f"Group size: {group_size}")
+        self._emit(f"Clamp ratio: {clamp_ratio}")
+        self._emit(f"Seed: {seed}")
 
         # Persist configuration overrides for downstream helpers
         self.bitwidth = bitwidth
@@ -256,22 +284,22 @@ class RTNQuantEdit(ModelEdit):
         np.random.seed(seed)
 
         # Identify target modules and get weight tying map
-        print(f"🎯 Identifying target modules for scope '{scope}'...")
+        self._emit(f"Identifying target modules for scope '{scope}'...")
         target_modules = self._identify_target_modules(model)
         total_identified = len(target_modules)
 
         max_modules = kwargs.get("max_modules")
         if isinstance(max_modules, int) and max_modules > 0:
             if max_modules < total_identified:
-                print(
-                    f" Limiting quantization to first {max_modules} modules "
+                self._emit(
+                    f"Limiting quantization to first {max_modules} modules "
                     f"(of {total_identified}) based on plan.max_modules"
                 )
                 target_modules = target_modules[:max_modules]
                 self.max_modules = max_modules
             else:
-                print(
-                    f" max_modules={max_modules} >= available modules "
+                self._emit(
+                    f"max_modules={max_modules} >= available modules "
                     f"({total_identified}); using all targets"
                 )
                 self.max_modules = None
@@ -280,33 +308,35 @@ class RTNQuantEdit(ModelEdit):
 
         tying_map = self._get_weight_tying_map(model)
 
-        print(f" Found {len(target_modules)} target modules:")
+        self._emit(f"Found {len(target_modules)} target modules:")
         for i, (name, module) in enumerate(target_modules):
             weight_shape = module.weight.shape
             param_count = module.weight.numel()
-            print(f" [{i + 1}] {name}: {weight_shape} ({param_count:,} params)")
+            self._emit(f"[{i + 1}] {name}: {weight_shape} ({param_count:,} params)")
 
         if len(target_modules) == 0:
-            print("❌ WARNING: No target modules found! Check scope configuration.")
-            print(" Available linear modules:")
+            self._emit(
+                "WARNING: No target modules found! Check scope configuration."
+            )
+            self._emit("Available linear modules:")
             linear_modules = []
             for name, module in model.named_modules():
                 if isinstance(module, nn.Linear | nn.Conv1d):
                     linear_modules.append((name, module.weight.shape))
             for name, shape in linear_modules[:10]:  # Show first 10
-                print(f" {name}: {shape}")
+                self._emit(f"{name}: {shape}")
             if len(linear_modules) > 10:
-                print(f" ... and {len(linear_modules) - 10} more")
+                self._emit(f"... and {len(linear_modules) - 10} more")
 
         # Execute GuardChain before edit (if provided)
         guard_results = {}
         if self.guard_chain is not None:
-            print(" Executing guard chain preparation...")
+            self._emit("Executing guard chain preparation...")
            guard_results["prepare"] = self.guard_chain.prepare_all(
                 model, adapter, None, {}
             )
 
-            print(" Executing before-edit guards...")
+            self._emit("Executing before-edit guards...")
             self.guard_chain.before_edit_all(model)
 
         # Apply quantization to each target module
@@ -314,12 +344,12 @@ class RTNQuantEdit(ModelEdit):
         total_params_quantized = 0
 
         for i, (module_name, module) in enumerate(target_modules):
-            print(f" [{i + 1}/{len(target_modules)}] Quantizing: {module_name}")
-            print(
-                f" Shape: {module.weight.shape}, Params: {module.weight.numel():,}"
+            self._emit(f"[{i + 1}/{len(target_modules)}] Quantizing: {module_name}")
+            self._emit(
+                f"Shape: {module.weight.shape}, Params: {module.weight.numel():,}"
             )
-            print(
-                f" Weight range: [{module.weight.min():.4f}, {module.weight.max():.4f}]"
+            self._emit(
+                f"Weight range: [{module.weight.min():.4f}, {module.weight.max():.4f}]"
             )
 
             # Apply RTN quantization
@@ -335,24 +365,22 @@ class RTNQuantEdit(ModelEdit):
             quantization_results.append(quant_result)
             total_params_quantized += quant_result["params_quantized"]
 
-            print(
-                f" ✓ Quantized {quant_result['params_quantized']:,} parameters"
-            )
+            self._emit(f"Quantized {quant_result['params_quantized']:,} parameters")
 
         # Execute GuardChain after edit (if provided)
         if self.guard_chain is not None:
-            print(" Executing after-edit guards...")
+            self._emit("Executing after-edit guards...")
             self.guard_chain.after_edit_all(model)
 
-            print(" Finalizing guard chain...")
+            self._emit("Finalizing guard chain...")
             guard_results["finalize"] = self.guard_chain.finalize_all(model)
 
             # Check if all guards passed
             if not self.guard_chain.all_passed(guard_results["finalize"]):
-                print(" ⚠️ Guard chain validation failed!")
+                self._emit("Guard chain validation failed!")
                 guard_results["all_passed"] = False
             else:
-                print("All guards passed")
+                self._emit("All guards passed")
                 guard_results["all_passed"] = True
 
         # Create bitwidth map
@@ -490,11 +518,11 @@ class RTNQuantEdit(ModelEdit):
 
         # Log diagnostic information
         if skipped_modules:
-            print(f" Skipped {len(skipped_modules)} modules:")
+            self._emit(f"Skipped {len(skipped_modules)} modules:")
             for name, reason in skipped_modules[:5]:  # Show first 5
-                print(f" {name}: {reason}")
+                self._emit(f"{name}: {reason}")
             if len(skipped_modules) > 5:
-                print(f" ... and {len(skipped_modules) - 5} more")
+                self._emit(f"... and {len(skipped_modules) - 5} more")
 
         return target_modules
 
@@ -625,7 +653,7 @@ class RTNQuantEdit(ModelEdit):
         # Ensure actual quantization occurred by applying quantization loss
         # This guarantees the weights are actually modified
        quantization_error = (quantized_weight - original_weight).abs().mean()
-        print(f" Quantization error: {quantization_error:.6f}")
+        self._emit(f"Quantization error: {quantization_error:.6f}")
 
         # Write back to module (preserving tying if needed)
         module.weight.data.copy_(quantized_weight)
@@ -634,7 +662,7 @@ class RTNQuantEdit(ModelEdit):
         final_weight = module.weight.data
         actual_change = not torch.allclose(original_weight, final_weight, atol=1e-6)
         if not actual_change:
-            print(f" WARNING: No actual weight change detected for {module}")
+            self._emit(f"WARNING: No actual weight change detected for {module}")
 
         # Handle tied weights
         if tied_modules:
invarlock/eval/bench.py CHANGED
@@ -47,7 +47,7 @@ class ScenarioConfig:
     probes: int
     profile: str = "ci"  # "ci" or "release"
     model_id: str = "gpt2"
-    adapter: str = "hf_gpt2"
+    adapter: str = "hf_causal"
     device: str = "auto"
     seq_len: int = 512
     stride: int = 128
@@ -81,7 +81,7 @@ class BenchmarkConfig:
     profile: str = "ci"  # "ci" or "release"
     dataset: str = "wikitext2"
     model_id: str = "gpt2"
-    adapter: str = "hf_gpt2"
+    adapter: str = "hf_causal"
    device: str = "auto"
     seq_len: int = 512
     stride: int = 128
@@ -92,7 +92,6 @@
     epsilon: float | None = (
         None  # RMT deadband tolerance (None = use resolved deadband)
     )
-    strict: bool = False  # If True, sets epsilon = 0
     ppl_overhead_threshold: float = 0.01  # 1%
     guard_overhead_time_threshold: float = 0.15  # 15%
     guard_overhead_mem_threshold: float = 0.10  # 10%
@@ -104,10 +103,6 @@
         """Apply post-initialization logic."""
         self.output_dir = Path(self.output_dir)
 
-        # Handle strict mode
-        if self.strict:
-            self.epsilon = 0.0
-
 
 @dataclass
 class ScenarioResult:
@@ -1043,7 +1038,6 @@
     profile: str = "ci",
    output_dir: str | Path = "benchmarks",
     epsilon: float | None = None,
-    strict: bool = False,
     **kwargs,
 ) -> dict[str, Any]:
     """
@@ -1056,7 +1050,6 @@
         profile: "ci" (50/50 windows) or "release" (100/100 windows)
         output_dir: Directory to save results
         epsilon: Optional epsilon override
-        strict: If True, sets epsilon = 0
         **kwargs: Additional configuration options
 
     Returns:
@@ -1075,7 +1068,6 @@
         profile=profile,
         output_dir=Path(output_dir),
         epsilon=epsilon,
-        strict=strict,
         **kwargs,
     )
 
@@ -1384,7 +1376,6 @@ def _config_to_dict(config: BenchmarkConfig) -> dict[str, Any]:
         "stride": config.stride,
         "seed": config.seed,
         "epsilon": config.epsilon,
-        "strict": config.strict,
         "ppl_overhead_threshold": config.ppl_overhead_threshold,
         "guard_overhead_time_threshold": config.guard_overhead_time_threshold,
         "guard_overhead_mem_threshold": config.guard_overhead_mem_threshold,
@@ -1426,16 +1417,13 @@ def main():
         type=float,
         help="RMT outliers epsilon threshold (default: use resolved RMT deadband)",
     )
-    parser.add_argument(
-        "--strict", action="store_true", help="Set epsilon=0 (overrides --epsilon)"
-    )
 
     # Model and dataset configuration
     parser.add_argument(
         "--dataset", default="wikitext2", help="Dataset to use for benchmarking"
     )
     parser.add_argument("--model-id", default="gpt2", help="Model identifier")
-    parser.add_argument("--adapter", default="hf_gpt2", help="Model adapter to use")
+    parser.add_argument("--adapter", default="hf_causal", help="Model adapter to use")
     parser.add_argument(
         "--device", default="auto", help="Device to use (auto|cuda|mps|cpu)"
     )
@@ -1505,7 +1493,6 @@
         profile=args.profile,
         output_dir=args.out,
         epsilon=args.epsilon,
-        strict=args.strict,
         **kwargs,
     )
 
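With `strict` removed from `BenchmarkConfig`, `run_guard_effect_benchmark`, and the `--strict` flag, callers that relied on it can pass the value it implied, `epsilon=0.0`, directly; the default adapter name also changes from `hf_gpt2` to `hf_causal`. A hedged sketch of the equivalent call (passing `adapter` through `**kwargs` is an assumption; only `profile`, `output_dir`, and `epsilon` are named parameters in this diff):

from invarlock.eval.bench import run_guard_effect_benchmark

# Previously: run_guard_effect_benchmark(profile="ci", strict=True)
results = run_guard_effect_benchmark(
    profile="ci",
    output_dir="benchmarks",
    epsilon=0.0,          # what strict=True used to force
    adapter="hf_causal",  # assumed to reach BenchmarkConfig via **kwargs
)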
invarlock/eval/data.py CHANGED
@@ -15,7 +15,7 @@ import time
 import warnings
 from abc import abstractmethod
 from collections import Counter
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from pathlib import Path
 from typing import Any, NamedTuple, Protocol
 
@@ -56,6 +56,9 @@ except ImportError:
     HAS_TORCH = False
 
 
+EventEmitter = Callable[[str, str, str | None], None]
+
+
 class EvaluationWindow(NamedTuple):
     """A window of tokenized samples for evaluation."""
 
@@ -166,6 +169,7 @@ class WikiText2Provider:
         self,
         cache_dir: Path | None = None,
         device_hint: str | None = None,
+        emit: EventEmitter | None = None,
         **_: Any,
     ):
         """
@@ -175,6 +179,7 @@
             cache_dir: Optional cache directory for dataset storage
         """
         self.cache_dir = cache_dir
+        self._emit_event = emit
         self._validate_dependencies()
         self._last_stratification_stats: dict[str, Any] | None = None
         self._last_batch_size_used: int = 0
@@ -186,11 +191,23 @@
         normalized_hint = (device_hint or "").strip().lower()
         self._device_hint: str | None = normalized_hint or None
 
+    def _event(self, tag: str, message: str, *, emoji: str | None = None) -> None:
+        """Emit a dataset event via an optional CLI-provided sink."""
+        if self._emit_event is None:
+            if emoji:
+                print(f"{emoji} {message}")
+            else:
+                print(message)
+            return
+        try:
+            self._emit_event(tag, message, emoji)
+        except TypeError:
+            # Back-compat: tolerate sinks that only accept (tag, message).
+            self._emit_event(tag, message)  # type: ignore[misc]
+
     def _validate_dependencies(self) -> None:
         """Check that required dependencies are available."""
         if not HAS_DATASETS:
-            if _LIGHT_IMPORT:
-                return
             raise _DepErr(
                 code="E301",
                 message=(
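`WikiText2Provider` now logs through an optional `EventEmitter` sink (`Callable[[str, str, str | None], None]`) instead of hard-coded `print` calls, falling back to the old emoji-prefixed console output when no sink is given; two-argument `(tag, message)` sinks are tolerated via the `TypeError` fallback. A minimal conforming sink, wired through the `get_provider(..., emit=...)` change shown further down in this file:

import logging

from invarlock.eval.data import get_provider

log = logging.getLogger("invarlock.data")

def log_sink(tag: str, message: str, emoji: str | None = None) -> None:
    # Drop the emoji decoration when routing to structured logging.
    log.info("[%s] %s", tag, message)

provider = get_provider("wikitext2", emit=log_sink)  # only the wikitext2 provider accepts emit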
@@ -321,20 +338,17 @@
         Returns:
             List of filtered text strings
         """
-        print(f"📚 Loading WikiText-2 {split} split...")
+        self._event(
+            "DATA",
+            f"WikiText-2 {split}: loading split...",
+            emoji="📚",
+        )
 
         # Serve from cache when possible (load the largest slice once)
         cached = self._texts_cache.get(split)
         if cached is not None and len(cached) >= max_samples:
             return cached[:max_samples]
 
-        if not HAS_DATASETS and _LIGHT_IMPORT:
-            texts = ["hello world", "invarlock synthetic text"] * max(
-                1, max_samples // 2
-            )
-            self._texts_cache[split] = texts
-            return texts[:max_samples]
-
         # Load dataset with size limit for efficiency
         dataset_slice = f"{split}[:{max_samples}]" if max_samples > 0 else split
         dataset = load_dataset(
@@ -375,7 +389,10 @@
         if prev is None or len(valid_texts) > len(prev):
             self._texts_cache[split] = list(valid_texts)
 
-        print(f" ✓ Loaded {len(valid_texts)} valid samples from {len(dataset)} total")
+        self._event(
+            "DATA",
+            f"Loaded {len(valid_texts)}/{len(dataset)} valid samples",
+        )
         return valid_texts
 
     def windows(
@@ -444,9 +461,13 @@
         cursor = 0
         chunk_size = max(64, min(256, target_pool))
 
-        print(" 📊 Creating evaluation windows:")
-        print(f" Requested preview/final: {preview_n}/{final_n}")
-        print(f" Sampling pool target: {target_pool} (reserve {reserve})")
+        self._event(
+            "DATA",
+            "Creating evaluation windows:",
+            emoji="📊",
+        )
+        self._event("DATA", f"Requested preview/final: {preview_n}/{final_n}")
+        self._event("DATA", f"Sampling pool target: {target_pool} (reserve {reserve})")
 
         while len(candidates) < total_required + reserve and cursor < len(
             shuffled_indices
@@ -717,9 +738,9 @@
             ),
         }
 
-        print(f" Seed: {seed}, Seq length: {seq_len}")
-        print(f" Preview: {len(preview_window)} samples")
-        print(f" Final: {len(final_window)} samples")
+        self._event("DATA", f"Seed: {seed}, Seq length: {seq_len}")
+        self._event("DATA", f"Preview: {len(preview_window)} samples")
+        self._event("DATA", f"Final: {len(final_window)} samples")
 
         return preview_window, final_window
 
@@ -849,8 +870,9 @@
         attention_masks_list = [entry[2] for entry in collected]
         valid_indices = [entry[0] for entry in collected]
 
-        print(
-            f" ✓ {window_name}: {len(valid_indices)}/{len(indices)} samples tokenized successfully"
+        self._event(
+            "DATA",
+            f"{window_name}: {len(valid_indices)}/{len(indices)} samples tokenized",
         )
 
         return EvaluationWindow(
@@ -943,7 +965,8 @@
         self, split: str = "validation", max_samples: int = 500, **kwargs
     ) -> list[str]:
         """Generate synthetic text samples."""
-        # Expand base samples to meet requirement
+        # Expand base samples to meet requirement, preferring unique variations
+        # to avoid duplicate-token windows (important for stratified pairing).
         expanded_samples: list[str] = []
         variations = [
             lambda s: s,
@@ -953,18 +976,25 @@
             lambda s: f"Furthermore, {s.lower()}",
             lambda s: f"In addition, {s.lower()}",
         ]
-
-        # Use a deterministic approach based on max_samples
-        rng = np.random.RandomState(42)  # Fixed seed for reproducibility
-
-        while len(expanded_samples) < max_samples:
+        # Deterministic coverage of (variation × base sample) combinations first.
+        for variation in variations:
             for base_text in self.base_samples:
-                if len(expanded_samples) >= max_samples:
-                    break
-                variation = rng.choice(variations)
                 expanded_samples.append(variation(base_text))
+                if len(expanded_samples) >= max_samples:
+                    return expanded_samples
+
+        # If callers request more than the unique combination space, keep
+        # extending deterministically while ensuring uniqueness via a suffix.
+        idx = 0
+        while len(expanded_samples) < max_samples:
+            base_text = self.base_samples[idx % len(self.base_samples)]
+            variation = variations[(idx // len(self.base_samples)) % len(variations)]
+            expanded_samples.append(
+                f"{variation(base_text)} [synthetic #{len(expanded_samples)}]"
+            )
+            idx += 1
 
-        return expanded_samples[:max_samples]
+        return expanded_samples
 
     def windows(
         self,
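`SyntheticProvider.load` no longer draws variations from a fixed-seed RNG; it walks the variation × base-sample grid in order and, past that space, appends an index suffix so texts stay unique. A small sketch of the resulting behaviour, assuming the synthetic provider needs no constructor arguments:

from invarlock.eval.data import get_provider

provider = get_provider("synthetic")
a = provider.load(max_samples=50)
b = provider.load(max_samples=50)

assert a == b  # fully deterministic: no RNG is involved any more
# Texts are kept distinct; once the variation × base-sample space is exhausted,
# each extra sample carries a "[synthetic #N]" suffix.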
@@ -1062,14 +1092,13 @@ class HFTextProvider:
         max_samples: int = 2000,
     ):
         if not HAS_DATASETS:
-            if not _LIGHT_IMPORT:
-                raise _DepErr(
-                    code="E301",
-                    message=(
-                        "DEPENDENCY-MISSING: datasets library required for hf_text provider"
-                    ),
-                    details={"dependency": "datasets"},
-                )
+            raise _DepErr(
+                code="E301",
+                message=(
+                    "DEPENDENCY-MISSING: datasets library required for hf_text provider"
+                ),
+                details={"dependency": "datasets"},
+            )
         self.dataset_name = dataset_name or "wikitext"
         self.config_name = config_name or None
         self.text_field = text_field
@@ -1077,9 +1106,6 @@
         self.max_samples = int(max_samples)
 
     def load(self, split: str = "validation", **kwargs) -> list[str]:
-        if not HAS_DATASETS and _LIGHT_IMPORT:
-            return ["synthetic dataset text"] * int(self.max_samples or 1)
-
         ds = load_dataset(
             path=self.dataset_name,
             name=self.config_name,
@@ -1204,14 +1230,13 @@
         max_samples: int = 2000,
     ) -> None:
         if not HAS_DATASETS:
-            if not _LIGHT_IMPORT:
-                raise _DepErr(
-                    code="E301",
-                    message=(
-                        "DEPENDENCY-MISSING: datasets library required for hf_seq2seq provider"
-                    ),
-                    details={"dependency": "datasets"},
-                )
+            raise _DepErr(
+                code="E301",
+                message=(
+                    "DEPENDENCY-MISSING: datasets library required for hf_seq2seq provider"
+                ),
+                details={"dependency": "datasets"},
+            )
         self.dataset_name = dataset_name
         self.config_name = config_name
         self.src_field = src_field
@@ -1815,12 +1840,15 @@ _PROVIDERS: dict[str, type] = {
 }
 
 
-def get_provider(name: str, **kwargs) -> DatasetProvider:
+def get_provider(
+    name: str, *, emit: EventEmitter | None = None, **kwargs: Any
+) -> DatasetProvider:
     """
     Get a dataset provider by name.
 
     Args:
         name: Provider name ("wikitext2", "synthetic")
+        emit: Optional event sink for dataset/provider logs.
         **kwargs: Provider-specific initialization parameters
 
     Returns:
@@ -1839,7 +1867,10 @@ def get_provider(name: str, **kwargs) -> DatasetProvider:
         )
 
     provider_class = _PROVIDERS[name]
-    return provider_class(**kwargs)
+    init_kwargs = dict(kwargs)
+    if emit is not None and name == "wikitext2":
+        init_kwargs["emit"] = emit
+    return provider_class(**init_kwargs)
 
 
 def list_providers() -> list[str]:
invarlock/eval/metrics.py CHANGED
@@ -18,9 +18,10 @@ import gc
 import logging
 import math
 import time
+from collections.abc import Iterable
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any
+from typing import Any, Protocol
 
 import numpy as np
 import psutil
@@ -723,7 +724,10 @@ def calculate_lens_metrics_for_model(
     except Exception as e:
         logger.error(f"Metrics calculation failed: {e}")
         if config.strict_validation:
-            raise MetricsError(f"Metrics calculation failed: {e}") from e
+            raise MetricsError(
+                code="E401",
+                message=f"METRICS-COMPUTE-FAILED: {e}",
+            ) from e
 
     finally:
         resource_manager.cleanup()
@@ -2266,6 +2270,57 @@
         return {"error": str(e)}
 
 
+class Metric(Protocol):
+    name: str
+    kind: str  # "ppl", "accuracy", "exact_match", "bleu", "rouge"
+
+    def compute(self, model: Any, dataset: Iterable[dict[str, Any]]) -> float: ...
+
+
+class PerplexityMetric:
+    """Lightweight perplexity metric from per-record logloss + token counts."""
+
+    name = "perplexity"
+    kind = "ppl"
+
+    def compute(self, model: Any, dataset: Iterable[dict[str, Any]]) -> float:  # noqa: ARG002
+        total_loss = 0.0
+        total_tokens = 0.0
+        for record in dataset:
+            if not isinstance(record, dict):
+                continue
+            loss = record.get("logloss", record.get("loss"))
+            tokens = record.get("token_count", record.get("tokens", 1))
+            try:
+                loss_val = float(loss)
+                tok_val = float(tokens)
+            except Exception:
+                continue
+            if (
+                not math.isfinite(loss_val)
+                or not math.isfinite(tok_val)
+                or tok_val <= 0
+            ):
+                continue
+            total_loss += loss_val * tok_val
+            total_tokens += tok_val
+        if total_tokens <= 0:
+            return float("nan")
+        return float(math.exp(total_loss / total_tokens))
+
+
+class AccuracyMetric:
+    """Classification accuracy metric from label/prediction records."""
+
+    name = "accuracy"
+    kind = "accuracy"
+
+    def compute(self, model: Any, dataset: Iterable[dict[str, Any]]) -> float:  # noqa: ARG002
+        from invarlock.eval.tasks.classification import accuracy_from_records
+
+        return accuracy_from_records(dataset)
+
+
 # ── Integration with existing system ───────────────────────────────────────
 
 # Update exports to include new functions (add to existing __all__ if it exists)
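The new `Metric` protocol and its two reference implementations work on iterables of per-record dicts rather than re-running the model; `PerplexityMetric` computes a token-weighted mean logloss and exponentiates it, skipping malformed or non-finite records. A worked sketch with illustrative numbers:

from invarlock.eval.metrics import PerplexityMetric

records = [
    {"logloss": 2.0, "token_count": 100},
    {"logloss": 2.5, "token_count": 50},
    {"loss": 1.8, "tokens": 25},                    # alternate key names accepted
    {"logloss": float("nan"), "token_count": 10},   # non-finite entries skipped
]
ppl = PerplexityMetric().compute(model=None, dataset=records)
# (2.0*100 + 2.5*50 + 1.8*25) / 175 ≈ 2.114 mean logloss → ppl = exp(2.114) ≈ 8.28

# AccuracyMetric().compute(...) simply delegates to
# invarlock.eval.tasks.classification.accuracy_from_records; the expected record
# keys are defined by that helper (not shown in this diff).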
@@ -2279,6 +2334,9 @@ try:
             "compute_parameter_deltas",
             "analyze_spectral_changes",
             "analyze_rmt_changes",
+            "Metric",
+            "PerplexityMetric",
+            "AccuracyMetric",
         ]
     )
 except NameError:
@@ -2291,4 +2349,7 @@ except NameError:
         "compute_parameter_deltas",
         "analyze_spectral_changes",
         "analyze_rmt_changes",
+        "Metric",
+        "PerplexityMetric",
+        "AccuracyMetric",
     ]