invarlock 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -500,7 +500,7 @@ class SpectralGuard(Guard):
             if self.ignore_preview_inflation and phase == "after_edit":
                 continue

-            if z_score > kappa_cap:
+            if abs(z_score) > kappa_cap:
                 violations.append(
                     {
                         "type": "family_z_cap",
@@ -403,29 +403,36 @@ def _predictive_gate_outcome(
     ):
         return False, "ci_unavailable"

+    lower = float(delta_ci[0])
     upper = float(delta_ci[1])
     min_effect = float(min_effect or 0.0)

+    # CI must clear zero (and the min-effect band when provided).
     if one_sided:
-        # One-sided improvement (ΔlogNLL < 0): certify a minimum effect by
-        # requiring the *upper* bound of the (two-sided) CI to clear -min_effect.
         if upper >= 0.0:
             return False, "ci_contains_zero"
         if mean_delta >= 0.0:
             return False, "mean_not_negative"
-        gain_lower_bound = -upper  # worst-case gain under CI
-        if gain_lower_bound < min_effect:
+        if upper > -min_effect:
+            return False, "gain_below_threshold"
+        if mean_delta > -min_effect:
             return False, "gain_below_threshold"
         return True, "ci_gain_met"

-    # Two-sided improvement: CI must be strictly below zero.
-    if upper >= 0.0:
+    # Two-sided: detect regressions outside the +min_effect band, but only
+    # enable VE for negative improvements.
+    if lower <= 0.0 <= upper:
         return False, "ci_contains_zero"
-
-    gain_lower_bound = -upper  # Convert ΔlogNLL CI to gain CI lower bound.
-    if gain_lower_bound < min_effect:
+    if lower > 0.0:
+        if lower >= min_effect and mean_delta >= min_effect:
+            return False, "regression_detected"
+        return False, "mean_not_negative"
+    if upper > -min_effect:
+        return False, "gain_below_threshold"
+    if mean_delta >= 0.0:
+        return False, "mean_not_negative"
+    if mean_delta > -min_effect:
         return False, "gain_below_threshold"
-
     return True, "ci_gain_met"

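The rewritten gate works on both CI bounds instead of deriving a `gain_lower_bound` from the upper bound alone, which lets it distinguish a regression (whole CI above zero) from an inconclusive result. A condensed, self-contained sketch of the new two-sided branch, assuming the diff's convention that deltas are ΔlogNLL (improvements negative) and the `(passed, reason)` return shape shown above; this stand-alone function is an illustration, not the shipped code:

```python
# Condensed sketch of the two-sided branch; variable names follow the hunk.
def gate(lower: float, upper: float, mean_delta: float, min_effect: float):
    if lower <= 0.0 <= upper:
        return False, "ci_contains_zero"      # CI straddles zero: inconclusive
    if lower > 0.0:                           # entire CI above zero: a regression
        if lower >= min_effect and mean_delta >= min_effect:
            return False, "regression_detected"
        return False, "mean_not_negative"
    if upper > -min_effect:                   # CI does not clear the min-effect band
        return False, "gain_below_threshold"
    if mean_delta >= 0.0:
        return False, "mean_not_negative"
    if mean_delta > -min_effect:
        return False, "gain_below_threshold"
    return True, "ci_gain_met"

assert gate(-0.08, -0.03, -0.05, 0.02) == (True, "ci_gain_met")
assert gate(-0.01, 0.01, 0.00, 0.02) == (False, "ci_contains_zero")
assert gate(0.03, 0.09, 0.06, 0.02) == (False, "regression_detected")
```
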
@@ -1441,12 +1448,17 @@ class VarianceGuard(Guard):

         device = next(model.parameters()).device
         torch.manual_seed(calib_seed)
-        ppl_no_ve_samples, loss_no_ve_samples = self._compute_ppl_for_batches(
-            model, calibration_batches, device
+        (
+            ppl_no_ve_samples,
+            loss_no_ve_samples,
+            token_counts,
+        ) = self._compute_ppl_for_batches(
+            model, calibration_batches, device, return_counts=True
         )
         coverage = min(len(calibration_batches), len(ppl_no_ve_samples))
         ppl_with_ve_samples: list[float] = []
         loss_with_ve_samples: list[float] = []
+        token_counts_with: list[int] = []
         ratio_ci: tuple[float, float] | None = None

         enable_success = False
@@ -1462,10 +1474,12 @@ class VarianceGuard(Guard):
         try:
             torch.manual_seed(calib_seed)
             if enable_success:
-                ppl_with_ve_samples, loss_with_ve_samples = (
-                    self._compute_ppl_for_batches(
-                        model, calibration_batches, device
-                    )
+                (
+                    ppl_with_ve_samples,
+                    loss_with_ve_samples,
+                    token_counts_with,
+                ) = self._compute_ppl_for_batches(
+                    model, calibration_batches, device, return_counts=True
                 )
         finally:
             if enable_success:
@@ -1478,6 +1492,8 @@ class VarianceGuard(Guard):
                 coverage,
                 len(ppl_with_ve_samples) if ppl_with_ve_samples else coverage,
                 len(loss_with_ve_samples) if loss_with_ve_samples else coverage,
+                len(token_counts) if token_counts else coverage,
+                len(token_counts_with) if token_counts_with else coverage,
             )
             self._calibration_stats.update(
                 {
@@ -1546,6 +1562,7 @@ class VarianceGuard(Guard):
         loss_no_ve_samples = loss_no_ve_samples[:coverage]
         ppl_with_ve_samples = ppl_with_ve_samples[:coverage]
         loss_with_ve_samples = loss_with_ve_samples[:coverage]
+        token_counts = token_counts[:coverage]

         ratios = [
             with_val / no_val
@@ -1602,6 +1619,7 @@ class VarianceGuard(Guard):
         delta_ci = compute_paired_delta_log_ci(
             loss_with_ve_samples,
             loss_no_ve_samples,
+            weights=token_counts,
             method="bca",
             replicates=500,
             alpha=self._policy.get("alpha", 0.05),
@@ -1617,18 +1635,31 @@ class VarianceGuard(Guard):
         )

         predictive_state["evaluated"] = True
-        mean_delta = float(
-            np.mean(
-                [
-                    with_loss - no_loss
-                    for with_loss, no_loss in zip(
-                        loss_with_ve_samples,
-                        loss_no_ve_samples,
-                        strict=False,
-                    )
-                ]
+        if token_counts:
+            sw = 0.0
+            swx = 0.0
+            for with_loss, no_loss, weight in zip(
+                loss_with_ve_samples,
+                loss_no_ve_samples,
+                token_counts,
+                strict=False,
+            ):
+                sw += float(weight)
+                swx += float(weight) * (with_loss - no_loss)
+            mean_delta = float(swx / sw) if sw > 0 else float("nan")
+        else:
+            mean_delta = float(
+                np.mean(
+                    [
+                        with_loss - no_loss
+                        for with_loss, no_loss in zip(
+                            loss_with_ve_samples,
+                            loss_no_ve_samples,
+                            strict=False,
+                        )
+                    ]
+                )
             )
-        )
         predictive_state["mean_delta"] = mean_delta

         if delta_ci is not None and all(
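
The new `mean_delta` is token-weighted rather than a flat average over batches, so long windows contribute in proportion to their token count. A pure-Python sketch of the same accumulation loop with toy numbers:

```python
# Token-weighted mean of per-batch loss deltas, mirroring the loop above.
loss_with = [2.10, 2.05, 2.20]      # per-batch log-loss with VE enabled (toy values)
loss_without = [2.12, 2.04, 2.30]   # per-batch log-loss without VE
token_counts = [512, 128, 1024]     # tokens per batch

sw = swx = 0.0
for with_loss, no_loss, weight in zip(loss_with, loss_without, token_counts):
    sw += float(weight)
    swx += float(weight) * (with_loss - no_loss)
mean_delta = swx / sw if sw > 0 else float("nan")
print(mean_delta)  # the 1024-token batch dominates; a flat mean over batches would not weight it
```
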
@@ -1875,12 +1906,19 @@ class VarianceGuard(Guard):
         model: nn.Module,
         batches: list[Any],
         device: torch.device,
-    ) -> tuple[list[float], list[float]]:
+        *,
+        return_counts: bool = False,
+    ) -> tuple[list[float], list[float]] | tuple[list[float], list[float], list[int]]:
         """Compute per-batch perplexity and log-loss values for deterministic calibration."""
         ppl_values: list[float] = []
         loss_values: list[float] = []
+        token_counts: list[int] = []
         if not batches:
-            return ppl_values, loss_values
+            return (
+                (ppl_values, loss_values, token_counts)
+                if return_counts
+                else (ppl_values, loss_values)
+            )

         model_was_training = model.training
         model.eval()
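
The keyword-only `return_counts` flag keeps every existing two-tuple call site valid while new callers opt into the third element. A minimal, self-contained sketch of that return-arity pattern (the body here is a placeholder, not the real per-batch computation):

```python
# Backward-compatible widening of a return tuple behind a keyword-only flag.
def compute(values: list[float], *, return_counts: bool = False):
    ppl = [2.0 ** v for v in values]   # placeholder transform
    counts = [1 for _ in values]       # placeholder per-item counts
    if return_counts:
        return values, ppl, counts
    return values, ppl

loss, ppl = compute([0.5, 1.0])                               # old call sites unchanged
loss, ppl, counts = compute([0.5, 1.0], return_counts=True)   # new three-tuple callers
```
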
@@ -1919,12 +1957,29 @@ class VarianceGuard(Guard):
                 if math.isfinite(ppl):
                     ppl_values.append(ppl)
                     loss_values.append(loss)
+                    if return_counts:
+                        count = None
+                        try:
+                            if labels is not None and isinstance(
+                                labels, torch.Tensor
+                            ):
+                                count = int((labels != -100).sum().item())
+                        except Exception:
+                            count = None
+                        if count is None:
+                            try:
+                                count = int(inputs.numel())
+                            except Exception:
+                                count = 0
+                        token_counts.append(int(max(count, 0)))
             except Exception:
                 continue

         if model_was_training:
             model.train()

+        if return_counts:
+            return ppl_values, loss_values, token_counts
         return ppl_values, loss_values

     def _bootstrap_mean_ci(
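
The count falls back from supervised-label positions to raw input size: Hugging Face-style causal-LM batches mark ignored positions with label `-100`, so `(labels != -100).sum()` counts only the tokens that actually contribute to the loss. A toy-tensor sketch (assumes `torch` is installed):

```python
import torch

# Two sequences, four positions each; -100 marks positions excluded from the loss.
labels = torch.tensor([[-100, -100, 17, 42], [5, 9, -100, -100]])
count = int((labels != -100).sum().item())
print(count)  # 4 supervised tokens; inputs.numel() would have said 8
```
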
@@ -2111,12 +2166,17 @@ class VarianceGuard(Guard):
         if calibration_batches:
             device = next(model.parameters()).device
             torch.manual_seed(calib_seed)
-            ppl_no_ve_samples, loss_no_ve_samples = self._compute_ppl_for_batches(
-                model, calibration_batches, device
+            (
+                ppl_no_ve_samples,
+                loss_no_ve_samples,
+                token_counts,
+            ) = self._compute_ppl_for_batches(
+                model, calibration_batches, device, return_counts=True
             )
             coverage = min(len(calibration_batches), len(ppl_no_ve_samples))
             ppl_with_ve_samples: list[float] = []
             loss_with_ve_samples: list[float] = []
+            token_counts_with: list[int] = []
             ratio_ci: tuple[float, float] | None = None

             enable_success = False
@@ -2135,8 +2195,9 @@ class VarianceGuard(Guard):
                     (
                         ppl_with_ve_samples,
                         loss_with_ve_samples,
+                        token_counts_with,
                     ) = self._compute_ppl_for_batches(
-                        model, calibration_batches, device
+                        model, calibration_batches, device, return_counts=True
                     )
             finally:
                 if enable_success:
@@ -2149,6 +2210,8 @@ class VarianceGuard(Guard):
                 coverage,
                 len(ppl_with_ve_samples) if ppl_with_ve_samples else coverage,
                 len(loss_with_ve_samples) if loss_with_ve_samples else coverage,
+                len(token_counts) if token_counts else coverage,
+                len(token_counts_with) if token_counts_with else coverage,
             )
             self._calibration_stats.update(
                 {"coverage": coverage, "status": "insufficient"}
@@ -2181,6 +2244,8 @@ class VarianceGuard(Guard):
             loss_no_ve_samples = loss_no_ve_samples[:coverage]
             ppl_with_ve_samples = ppl_with_ve_samples[:coverage]
             loss_with_ve_samples = loss_with_ve_samples[:coverage]
+            token_counts = token_counts[:coverage]
+            token_counts_with = token_counts_with[:coverage]

             ratios = [
                 with_val / no_val
@@ -2219,6 +2284,7 @@ class VarianceGuard(Guard):
             delta_ci = compute_paired_delta_log_ci(
                 loss_with_ve_samples,
                 loss_no_ve_samples,
+                weights=token_counts,
                 method="bca",
                 replicates=500,
                 alpha=self._policy.get("alpha", 0.05),
@@ -2234,18 +2300,31 @@ class VarianceGuard(Guard):
             )

             predictive_state["evaluated"] = True
-            mean_delta = float(
-                np.mean(
-                    [
-                        with_loss - no_loss
-                        for with_loss, no_loss in zip(
-                            loss_with_ve_samples,
-                            loss_no_ve_samples,
-                            strict=False,
-                        )
-                    ]
+            if token_counts:
+                sw = 0.0
+                swx = 0.0
+                for with_loss, no_loss, weight in zip(
+                    loss_with_ve_samples,
+                    loss_no_ve_samples,
+                    token_counts,
+                    strict=False,
+                ):
+                    sw += float(weight)
+                    swx += float(weight) * (with_loss - no_loss)
+                mean_delta = float(swx / sw) if sw > 0 else float("nan")
+            else:
+                mean_delta = float(
+                    np.mean(
+                        [
+                            with_loss - no_loss
+                            for with_loss, no_loss in zip(
+                                loss_with_ve_samples,
+                                loss_no_ve_samples,
+                                strict=False,
+                            )
+                        ]
+                    )
                 )
-            )
             predictive_state["mean_delta"] = mean_delta

             if delta_ci is not None and all(
@@ -538,6 +538,175 @@ def _enforce_ratio_ci_alignment(
     )


+def _enforce_display_ci_alignment(
+    ratio_ci_source: str,
+    primary_metric: Any,
+    logloss_delta_ci: Any,
+    window_plan_profile: str | None,
+) -> None:
+    """Ensure display_ci matches exp(ci) for ppl-like metrics when paired."""
+    if ratio_ci_source != "paired_baseline":
+        return
+    if not isinstance(primary_metric, dict) or not primary_metric:
+        return
+    try:
+        kind = str(primary_metric.get("kind", "")).lower()
+    except Exception:
+        return
+    if not kind.startswith("ppl"):
+        return
+
+    def _finite_bounds(bounds: Any) -> bool:
+        return (
+            isinstance(bounds, tuple | list)
+            and len(bounds) == 2
+            and all(isinstance(v, int | float) and math.isfinite(v) for v in bounds)
+        )
+
+    ci = primary_metric.get("ci")
+    if not _finite_bounds(ci):
+        if _finite_bounds(logloss_delta_ci):
+            primary_metric["ci"] = (
+                float(logloss_delta_ci[0]),
+                float(logloss_delta_ci[1]),
+            )
+            ci = primary_metric["ci"]
+        else:
+            profile = (window_plan_profile or "dev").lower()
+            if profile in {"ci", "release"}:
+                raise ValueError(
+                    "primary_metric.ci missing for ppl-like metric under paired baseline."
+                )
+            return
+
+    expected = tuple(math.exp(float(bound)) for bound in ci)
+    display_ci = primary_metric.get("display_ci")
+    if not _finite_bounds(display_ci):
+        profile = (window_plan_profile or "dev").lower()
+        if profile in {"ci", "release"}:
+            raise ValueError(
+                "primary_metric.display_ci missing for ppl-like metric under paired baseline."
+            )
+        primary_metric["display_ci"] = [expected[0], expected[1]]
+        return
+
+    for observed, exp_val in zip(display_ci, expected, strict=False):
+        tolerance = 5e-4 * max(1.0, abs(exp_val))
+        if abs(float(observed) - float(exp_val)) > tolerance:
+            profile = (window_plan_profile or "dev").lower()
+            if profile in {"ci", "release"}:
+                raise ValueError(
+                    "primary_metric.display_ci mismatch: bounds do not match exp(ci)."
+                )
+            primary_metric["display_ci"] = [expected[0], expected[1]]
+            break
+
+
+def _enforce_pairing_and_coverage(
+    stats: dict[str, Any] | None,
+    window_plan_profile: str | None,
+    tier: str | None,
+) -> None:
+    """Enforce pairing and coverage contracts for CI/Release profiles."""
+    profile = (window_plan_profile or "dev").lower()
+    if profile not in {"ci", "release"}:
+        return
+    if not isinstance(stats, dict):
+        raise ValueError("Missing dataset window stats for CI/Release enforcement.")
+
+    match_fraction = stats.get("window_match_fraction")
+    overlap_fraction = stats.get("window_overlap_fraction")
+    if not (
+        isinstance(match_fraction, (int | float))
+        and math.isfinite(float(match_fraction))
+    ):
+        raise ValueError("CI/Release requires window_match_fraction.")
+    if float(match_fraction) < 0.999999:
+        raise ValueError(
+            f"CI/Release requires perfect pairing (window_match_fraction={float(match_fraction):.6f})."
+        )
+
+    if not (
+        isinstance(overlap_fraction, (int | float))
+        and math.isfinite(float(overlap_fraction))
+    ):
+        raise ValueError("CI/Release requires window_overlap_fraction.")
+    if float(overlap_fraction) > 1e-9:
+        raise ValueError(
+            f"CI/Release requires non-overlapping windows (window_overlap_fraction={float(overlap_fraction):.6f})."
+        )
+
+    def _coerce_count(value: Any) -> int | None:
+        if value is None or isinstance(value, bool):
+            return None
+        try:
+            val = float(value)
+        except (TypeError, ValueError):
+            return None
+        if not math.isfinite(val) or val < 0:
+            return None
+        if abs(val - round(val)) > 1e-9:
+            return None
+        return int(round(val))
+
+    actual_preview = _coerce_count(stats.get("actual_preview"))
+    actual_final = _coerce_count(stats.get("actual_final"))
+    if actual_preview is None or actual_final is None:
+        coverage = stats.get("coverage")
+        if isinstance(coverage, dict):
+            if actual_preview is None:
+                actual_preview = _coerce_count(coverage.get("preview", {}).get("used"))
+            if actual_final is None:
+                actual_final = _coerce_count(coverage.get("final", {}).get("used"))
+
+    if actual_preview is None or actual_final is None:
+        raise ValueError("CI/Release requires preview/final window counts.")
+    if actual_preview != actual_final:
+        raise ValueError(
+            f"CI/Release requires matching preview/final counts "
+            f"(preview={actual_preview}, final={actual_final})."
+        )
+
+    from invarlock.core.runner import BOOTSTRAP_COVERAGE_REQUIREMENTS
+
+    tier_key = str(tier or "balanced").lower()
+    floors = BOOTSTRAP_COVERAGE_REQUIREMENTS.get(
+        tier_key, BOOTSTRAP_COVERAGE_REQUIREMENTS["balanced"]
+    )
+    preview_floor = int(floors.get("preview", 0))
+    final_floor = int(floors.get("final", 0))
+    replicates_floor = int(floors.get("replicates", 0))
+
+    coverage = stats.get("coverage")
+    if not isinstance(coverage, dict):
+        raise ValueError("CI/Release requires bootstrap coverage stats.")
+
+    preview_used = _coerce_count(coverage.get("preview", {}).get("used"))
+    final_used = _coerce_count(coverage.get("final", {}).get("used"))
+    replicates_used = _coerce_count(coverage.get("replicates", {}).get("used"))
+
+    if replicates_used is None:
+        bootstrap = stats.get("bootstrap")
+        if isinstance(bootstrap, dict):
+            replicates_used = _coerce_count(
+                bootstrap.get("replicates", bootstrap.get("n"))
+            )
+
+    if preview_used is None or final_used is None or replicates_used is None:
+        raise ValueError("CI/Release requires preview/final/replicates coverage stats.")
+
+    if preview_used < preview_floor or final_used < final_floor:
+        raise ValueError(
+            "CI/Release requires preview/final coverage at or above tier floors "
+            f"(preview={preview_used}/{preview_floor}, final={final_used}/{final_floor})."
+        )
+    if replicates_used < replicates_floor:
+        raise ValueError(
+            "CI/Release requires bootstrap replicates at or above tier floors "
+            f"(replicates={replicates_used}/{replicates_floor})."
+        )
+
+
 def _fallback_paired_windows(
     paired_windows: int, coverage_summary: dict[str, Any]
 ) -> int:
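
`_enforce_pairing_and_coverage` leans on its nested `_coerce_count` helper to reject anything that is not a clean non-negative integer before comparing counts against the tier floors. The helper is reproduced below verbatim from the hunk, with asserts illustrating its edge cases:

```python
import math
from typing import Any

def _coerce_count(value: Any) -> int | None:
    if value is None or isinstance(value, bool):
        return None
    try:
        val = float(value)
    except (TypeError, ValueError):
        return None
    if not math.isfinite(val) or val < 0:
        return None
    if abs(val - round(val)) > 1e-9:
        return None
    return int(round(val))

assert _coerce_count("12") == 12           # numeric strings are accepted
assert _coerce_count(12.0000000001) == 12  # float noise within 1e-9 rounds cleanly
assert _coerce_count(True) is None         # bools are not counts
assert _coerce_count(11.5) is None         # non-integral values are rejected
assert _coerce_count(float("nan")) is None
```
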
@@ -807,6 +976,47 @@ def make_certificate(
     if paired:
         paired_run, paired_base = paired
         paired_windows = len(paired_run)
+        paired_weights: list[float] | None = None
+        try:
+            run_ids = (
+                run_windows.get("window_ids") if isinstance(run_windows, dict) else None
+            )
+            run_w = (
+                run_windows.get("token_counts")
+                if isinstance(run_windows, dict)
+                else None
+            )
+            base_ids = (
+                baseline_windows.get("window_ids")
+                if isinstance(baseline_windows, dict)
+                else None
+            )
+            if (
+                isinstance(run_ids, list)
+                and isinstance(run_w, list)
+                and isinstance(base_ids, list)
+            ):
+                base_set = {
+                    int(b_id) for b_id in base_ids if isinstance(b_id, int | float)
+                }
+                weights: list[float] = []
+                for r_id, w in zip(run_ids, run_w, strict=False):
+                    if not isinstance(r_id, int | float):
+                        continue
+                    key = int(r_id)
+                    if key not in base_set:
+                        continue
+                    try:
+                        wv = float(w)
+                    except Exception:
+                        continue
+                    if not math.isfinite(wv):
+                        continue
+                    weights.append(float(max(wv, 0.0)))
+                if weights:
+                    paired_weights = weights
+        except Exception:  # pragma: no cover
+            paired_weights = None
         method = str(metrics_bootstrap.get("method", "percentile")).lower()
         replicates = int(
             metrics_bootstrap.get(
@@ -834,6 +1044,7 @@ def make_certificate(
         delta_ci = compute_paired_delta_log_ci(
             paired_run,
             paired_base,
+            weights=paired_weights,
             method=ci_method,
             replicates=replicates,
             alpha=alpha,
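
`compute_paired_delta_log_ci` itself is not part of this diff, so as a stand-in, here is a tiny weighted percentile bootstrap over paired deltas that illustrates what passing `weights=` buys: each resampled replicate weights a window by its token count instead of treating all windows equally. This is a sketch of the idea only, not invarlock's BCa implementation:

```python
import random

def weighted_delta_ci(run, base, weights, replicates=500, alpha=0.05, seed=0):
    rng = random.Random(seed)
    deltas = [r - b for r, b in zip(run, base)]
    indices = range(len(deltas))
    reps = []
    for _ in range(replicates):
        sample = rng.choices(indices, k=len(deltas))   # resample paired windows
        sw = sum(weights[i] for i in sample)
        swx = sum(weights[i] * deltas[i] for i in sample)
        reps.append(swx / sw)                          # token-weighted replicate mean
    reps.sort()
    lo = reps[int((alpha / 2) * replicates)]
    hi = reps[int((1 - alpha / 2) * replicates) - 1]
    return lo, hi

print(weighted_delta_ci([2.08, 2.11], [2.10, 2.15], [512, 1024]))
```
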
@@ -1156,13 +1367,13 @@ def make_certificate(
                 act_fin = req_fin

             if req_prev is not None:
-                stats_obj.setdefault("requested_preview", req_prev)
+                stats_obj["requested_preview"] = req_prev
             if req_fin is not None:
-                stats_obj.setdefault("requested_final", req_fin)
+                stats_obj["requested_final"] = req_fin
             if act_prev is not None:
-                stats_obj.setdefault("actual_preview", act_prev)
+                stats_obj["actual_preview"] = act_prev
             if act_fin is not None:
-                stats_obj.setdefault("actual_final", act_fin)
+                stats_obj["actual_final"] = act_fin

             if "coverage_ok" not in stats_obj:
                 if (
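
Switching from `setdefault` to plain assignment makes the recomputed window counts authoritative: `setdefault` silently kept any stale value already present in `stats_obj`, which the new CI/Release enforcement would then validate against. A two-line illustration:

```python
stats_obj = {"actual_preview": 8}           # stale value from an earlier pass
stats_obj.setdefault("actual_preview", 16)
assert stats_obj["actual_preview"] == 8     # old behavior: the stale value wins
stats_obj["actual_preview"] = 16
assert stats_obj["actual_preview"] == 16    # new behavior: the recomputed value wins
```
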
@@ -1177,6 +1388,12 @@ def make_certificate(
     except Exception:  # pragma: no cover
         pass

+    _enforce_pairing_and_coverage(
+        ppl_analysis.get("stats", {}),
+        window_plan_profile,
+        auto.get("tier", "balanced"),
+    )
+
     if isinstance(window_plan_ctx, dict):
         ppl_analysis["window_plan"] = window_plan_ctx

@@ -1895,6 +2112,12 @@ def make_certificate(
     from .primary_metric_utils import attach_primary_metric as _attach_pm

     _attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
+    _enforce_display_ci_alignment(
+        ratio_ci_source,
+        certificate.get("primary_metric"),
+        logloss_delta_ci,
+        window_plan_profile,
+    )

     # Ensure primary_metric has display_ci populated for schema invariants
     try:
@@ -2492,8 +2715,8 @@ def _prepare_guard_overhead_section(
             {
                 "overhead_ratio": metrics.get("overhead_ratio"),
                 "overhead_percent": metrics.get("overhead_percent"),
-                "bare_final": metrics.get("bare_final"),
-                "guarded_final": metrics.get("guarded_final"),
+                "bare_ppl": metrics.get("bare_ppl"),
+                "guarded_ppl": metrics.get("guarded_ppl"),
                 "messages": list(result.messages),
                 "warnings": list(result.warnings),
                 "errors": list(result.errors),
@@ -2505,12 +2728,8 @@ def _prepare_guard_overhead_section(
         return sanitized, bool(result.passed)

     # Fall back to direct ratio computation when reports are not provided
-    bare_ppl = _coerce_float(payload.get("bare_final")) or _coerce_float(
-        payload.get("bare_ppl")
-    )
-    guarded_ppl = _coerce_float(payload.get("guarded_final")) or _coerce_float(
-        payload.get("guarded_ppl")
-    )
+    bare_ppl = _coerce_float(payload.get("bare_ppl"))
+    guarded_ppl = _coerce_float(payload.get("guarded_ppl"))
     ratio = _coerce_float(payload.get("overhead_ratio"))

     if ratio is None and bare_ppl is not None and guarded_ppl is not None:
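
The fallback path now reads only the canonical `bare_ppl`/`guarded_ppl` keys instead of preferring `bare_final`/`guarded_final`; the ratio arithmetic is unchanged. A sketch with a hypothetical payload:

```python
# Hypothetical payload illustrating the direct-ratio fallback.
payload = {"bare_ppl": 12.40, "guarded_ppl": 12.53}
ratio = payload["guarded_ppl"] / payload["bare_ppl"]
print(f"overhead_ratio={ratio:.4f}")  # ~1.0105, i.e. roughly 1.05% guard overhead
```
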
@@ -194,6 +194,9 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
         "window_pairing_reason",
         "window_pairing_preview",
         "window_pairing_final",
+        "window_plan",
+        "window_capacity",
+        "stats",
         "total_tokens",
         "preview_total_tokens",
         "final_total_tokens",
@@ -579,9 +579,7 @@ def _extract_policy_overrides(report: RunReport) -> list[str]:


 def _compute_policy_digest(policy: dict[str, Any]) -> str:
-    canonical = json.dumps(
-        policy, sort_keys=True, default=str, separators=(",", ":"), ensure_ascii=True
-    )
+    canonical = json.dumps(policy, sort_keys=True, default=str)
     return hashlib.sha256(canonical.encode()).hexdigest()[:16]

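Note that dropping `separators=(",", ":")` changes the canonical string, since `json.dumps` defaults to separators with spaces (`ensure_ascii=True` was already the default, so removing it is a no-op); the same policy therefore hashes to a different digest in 0.3.3 than in 0.3.2. A sketch of the new digest path:

```python
import hashlib
import json

policy = {"alpha": 0.05, "tier": "balanced"}
canonical = json.dumps(policy, sort_keys=True, default=str)
print(canonical)  # '{"alpha": 0.05, "tier": "balanced"}' -- note the spaces
print(hashlib.sha256(canonical.encode()).hexdigest()[:16])
```
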
@@ -102,6 +102,23 @@ def attach_primary_metric(
         and float(base_final) > 0
     ):
         pm_copy["ratio_vs_baseline"] = float(fin) / float(base_final)
+    # Ensure display_ci aligns with log-space CI for ppl-like metrics
+    try:
+        kind = str(pm_copy.get("kind", "")).lower()
+    except Exception:
+        kind = ""
+    ci = pm_copy.get("ci")
+    if (
+        kind.startswith("ppl")
+        and isinstance(ci, list | tuple)
+        and len(ci) == 2
+    ):
+        try:
+            lo, hi = float(ci[0]), float(ci[1])
+            if math.isfinite(lo) and math.isfinite(hi):
+                pm_copy["display_ci"] = [math.exp(lo), math.exp(hi)]
+        except Exception:
+            pass
     # Provide a degenerate display CI if missing
     if not isinstance(
         pm_copy.get("display_ci"), list | tuple