invarlock 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. invarlock/__init__.py +1 -1
  2. invarlock/_data/runtime/tiers.yaml +57 -30
  3. invarlock/adapters/__init__.py +1 -1
  4. invarlock/calibration/spectral_null.py +15 -10
  5. invarlock/calibration/variance_ve.py +0 -2
  6. invarlock/cli/commands/calibrate.py +6 -2
  7. invarlock/cli/commands/certify.py +58 -39
  8. invarlock/cli/commands/doctor.py +3 -1
  9. invarlock/cli/commands/explain_gates.py +57 -8
  10. invarlock/cli/commands/report.py +1 -1
  11. invarlock/cli/commands/run.py +159 -61
  12. invarlock/cli/commands/verify.py +78 -4
  13. invarlock/cli/config.py +21 -5
  14. invarlock/core/api.py +45 -5
  15. invarlock/core/auto_tuning.py +65 -20
  16. invarlock/core/contracts.py +7 -1
  17. invarlock/core/registry.py +2 -2
  18. invarlock/core/runner.py +314 -50
  19. invarlock/eval/bench.py +0 -13
  20. invarlock/eval/data.py +73 -283
  21. invarlock/eval/metrics.py +134 -4
  22. invarlock/eval/primary_metric.py +23 -0
  23. invarlock/eval/tail_stats.py +230 -0
  24. invarlock/guards/_estimators.py +154 -0
  25. invarlock/guards/policies.py +16 -6
  26. invarlock/guards/rmt.py +625 -544
  27. invarlock/guards/spectral.py +348 -110
  28. invarlock/guards/tier_config.py +32 -30
  29. invarlock/guards/variance.py +5 -29
  30. invarlock/guards_ref/rmt_ref.py +23 -23
  31. invarlock/model_profile.py +42 -15
  32. invarlock/reporting/certificate.py +225 -46
  33. invarlock/reporting/certificate_schema.py +2 -1
  34. invarlock/reporting/dataset_hashing.py +15 -2
  35. invarlock/reporting/guards_analysis.py +197 -274
  36. invarlock/reporting/normalizer.py +6 -0
  37. invarlock/reporting/policy_utils.py +38 -36
  38. invarlock/reporting/primary_metric_utils.py +71 -17
  39. invarlock/reporting/render.py +61 -0
  40. invarlock/reporting/report.py +1 -1
  41. invarlock/reporting/report_types.py +5 -2
  42. invarlock/reporting/validate.py +1 -18
  43. {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/METADATA +6 -6
  44. {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/RECORD +48 -46
  45. {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/WHEEL +0 -0
  46. {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/entry_points.txt +0 -0
  47. {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/licenses/LICENSE +0 -0
  48. {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/top_level.txt +0 -0
@@ -35,6 +35,7 @@ from invarlock.core.bootstrap import (
35
35
  logspace_to_ratio_ci,
36
36
  )
37
37
  from invarlock.eval.primary_metric import compute_primary_metric_from_report, get_metric
38
+ from invarlock.eval.tail_stats import evaluate_metric_tail
38
39
  from invarlock.utils.digest import hash_json
39
40
 
40
41
  from . import certificate_schema as _cert_schema
@@ -81,7 +82,7 @@ TIER_RATIO_LIMITS: dict[str, float] = {
81
82
  def _is_ppl_kind(name: Any) -> bool:
82
83
  """Return True if a primary_metric kind denotes a ppl-like metric.
83
84
 
84
- Supports legacy and alternate names to stay resilient across schema variants.
85
+ Supports alternate names to stay resilient across schema variants.
85
86
  """
86
87
  try:
87
88
  n = str(name or "").lower()
@@ -100,7 +101,7 @@ def _is_ppl_kind(name: Any) -> bool:
100
101
  }
101
102
 
102
103
 
103
- ## NOTE: Deprecated legacy helper `_get_ppl_final` was removed; callers should
104
+ ## NOTE: Deprecated helper `_get_ppl_final` was removed; callers should
104
105
  ## use the normalized primary_metric block directly via make_certificate or
105
106
  ## report processing utilities.
106
107
 
@@ -391,6 +392,7 @@ def _compute_thresholds_hash(payload: dict[str, Any]) -> str:
391
392
  # Allow-list loader with safe defaults for validation keys
392
393
  _VALIDATION_ALLOWLIST_DEFAULT = {
393
394
  "primary_metric_acceptable",
395
+ "primary_metric_tail_acceptable",
394
396
  "preview_final_drift_acceptable",
395
397
  "guard_overhead_acceptable",
396
398
  "invariants_pass",
@@ -792,6 +794,19 @@ def make_certificate(
792
794
  except Exception: # pragma: no cover
793
795
  pass
794
796
 
797
+ # Execution profile provenance when available via run context.
798
+ try:
799
+ ctx = report.get("context") if isinstance(report, dict) else None
800
+ ctx_profile = (
801
+ str(ctx.get("profile") or "").strip().lower()
802
+ if isinstance(ctx, dict)
803
+ else ""
804
+ )
805
+ if ctx_profile:
806
+ meta["profile"] = ctx_profile
807
+ except Exception: # pragma: no cover
808
+ pass
809
+
795
810
  tokenizer_hash_meta = report["meta"].get("tokenizer_hash")
796
811
  if not tokenizer_hash_meta:
797
812
  dataset_section = report.get("data", {})
@@ -1518,7 +1533,10 @@ def make_certificate(
1518
1533
  )
1519
1534
  overrides_list = _extract_policy_overrides(report)
1520
1535
  resolved_digest = _compute_policy_digest(
1521
- {"resolved_policy": resolved_policy, "overrides": overrides_list}
1536
+ {
1537
+ "resolved_policy": resolved_policy,
1538
+ "overrides": overrides_list,
1539
+ }
1522
1540
  )
1523
1541
  policy_provenance = {
1524
1542
  "tier": auto.get("tier", "balanced"),
@@ -1738,6 +1756,104 @@ def make_certificate(
1738
1756
 
1739
1757
  pm_acceptance_range = _resolve_pm_acceptance_range_from_report(report)
1740
1758
 
1759
+ # Primary metric tail evidence and gate evaluation (ΔlogNLL vs baseline, per-window).
1760
+ pm_tail_result: dict[str, Any] = {}
1761
+ try:
1762
+ pm_kind = None
1763
+ try:
1764
+ pm_block = (
1765
+ report.get("metrics", {}).get("primary_metric")
1766
+ if isinstance(report.get("metrics"), dict)
1767
+ else None
1768
+ )
1769
+ if isinstance(pm_block, dict):
1770
+ pm_kind = pm_block.get("kind")
1771
+ except Exception: # pragma: no cover
1772
+ pm_kind = None
1773
+
1774
+ pm_tail_policy: dict[str, Any] = {}
1775
+ try:
1776
+ metrics_pol = (
1777
+ resolved_policy.get("metrics", {})
1778
+ if isinstance(resolved_policy, dict)
1779
+ else {}
1780
+ )
1781
+ if isinstance(metrics_pol, dict) and isinstance(
1782
+ metrics_pol.get("pm_tail"), dict
1783
+ ):
1784
+ pm_tail_policy = dict(metrics_pol.get("pm_tail") or {})
1785
+ except Exception: # pragma: no cover
1786
+ pm_tail_policy = {}
1787
+
1788
+ deltas: list[float] = []
1789
+ weights: list[float] = []
1790
+ if _is_ppl_kind(pm_kind):
1791
+ run_windows = (
1792
+ report.get("evaluation_windows", {}).get("final", {})
1793
+ if isinstance(report.get("evaluation_windows"), dict)
1794
+ else {}
1795
+ )
1796
+ base_windows = (
1797
+ baseline_normalized.get("evaluation_windows", {}).get("final", {})
1798
+ if isinstance(baseline_normalized.get("evaluation_windows"), dict)
1799
+ else {}
1800
+ )
1801
+ run_ids = (
1802
+ run_windows.get("window_ids") if isinstance(run_windows, dict) else None
1803
+ )
1804
+ run_ll = (
1805
+ run_windows.get("logloss") if isinstance(run_windows, dict) else None
1806
+ )
1807
+ run_tc = (
1808
+ run_windows.get("token_counts")
1809
+ if isinstance(run_windows, dict)
1810
+ else None
1811
+ )
1812
+ base_ids = (
1813
+ base_windows.get("window_ids")
1814
+ if isinstance(base_windows, dict)
1815
+ else None
1816
+ )
1817
+ base_ll = (
1818
+ base_windows.get("logloss") if isinstance(base_windows, dict) else None
1819
+ )
1820
+ if (
1821
+ isinstance(run_ids, list)
1822
+ and isinstance(run_ll, list)
1823
+ and isinstance(base_ids, list)
1824
+ and isinstance(base_ll, list)
1825
+ ):
1826
+ base_map: dict[int, float] = {}
1827
+ for b_id, b_val in zip(base_ids, base_ll, strict=False):
1828
+ if isinstance(b_id, int | float) and isinstance(b_val, int | float):
1829
+ base_map[int(b_id)] = float(b_val)
1830
+ for idx, (r_id, r_val) in enumerate(zip(run_ids, run_ll, strict=False)):
1831
+ if not (
1832
+ isinstance(r_id, int | float) and isinstance(r_val, int | float)
1833
+ ):
1834
+ continue
1835
+ key = int(r_id)
1836
+ if key not in base_map:
1837
+ continue
1838
+ dv = float(r_val) - base_map[key]
1839
+ if math.isfinite(dv):
1840
+ deltas.append(float(dv))
1841
+ if isinstance(run_tc, list) and idx < len(run_tc):
1842
+ try:
1843
+ wv = float(run_tc[idx])
1844
+ except Exception:
1845
+ wv = 0.0
1846
+ weights.append(float(max(wv, 0.0)))
1847
+
1848
+ pm_tail_result = evaluate_metric_tail(
1849
+ deltas=deltas,
1850
+ weights=weights if (weights and len(weights) == len(deltas)) else None,
1851
+ policy=pm_tail_policy,
1852
+ )
1853
+ pm_tail_result["source"] = "paired_baseline.final"
1854
+ except Exception: # pragma: no cover
1855
+ pm_tail_result = {"mode": "warn", "evaluated": False, "passed": True}
1856
+
1741
1857
  validation_kwargs = {
1742
1858
  "ppl": ppl_analysis,
1743
1859
  "spectral": spectral,
@@ -1765,7 +1881,14 @@ def make_certificate(
1765
1881
  except Exception: # pragma: no cover - defensive against patched functions
1766
1882
  validation_kwargs["pm_acceptance_range"] = pm_acceptance_range
1767
1883
 
1884
+ try:
1885
+ if "pm_tail" in inspect.signature(_compute_validation_flags).parameters:
1886
+ validation_kwargs["pm_tail"] = pm_tail_result
1887
+ except Exception: # pragma: no cover - defensive against patched functions
1888
+ validation_kwargs["pm_tail"] = pm_tail_result
1889
+
1768
1890
  validation_flags = _compute_validation_flags(**validation_kwargs)
1891
+
1769
1892
  # Enforce validation key allow-list to prevent surface drift
1770
1893
  _allowed_validation = _load_validation_allowlist()
1771
1894
  validation_filtered = {
@@ -1797,6 +1920,7 @@ def make_certificate(
1797
1920
  "artifacts": artifacts_payload,
1798
1921
  "validation": validation_filtered,
1799
1922
  "guard_overhead": guard_overhead_section,
1923
+ "primary_metric_tail": pm_tail_result,
1800
1924
  }
1801
1925
 
1802
1926
  # Record tiny-relax provenance explicitly when active (dev-only demos)
@@ -2048,7 +2172,49 @@ def make_certificate(
2048
2172
  except Exception: # pragma: no cover
2049
2173
  pass
2050
2174
 
2051
- # Emit optional one-line telemetry summary (opt-in via INVARLOCK_TELEMETRY=1)
2175
+ # Attach/normalize primary metric block (moved to helper)
2176
+ from .primary_metric_utils import attach_primary_metric as _attach_pm
2177
+
2178
+ _attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
2179
+ _enforce_display_ci_alignment(
2180
+ ratio_ci_source,
2181
+ certificate.get("primary_metric"),
2182
+ logloss_delta_ci,
2183
+ window_plan_profile,
2184
+ )
2185
+
2186
+ # Ensure primary_metric has display_ci populated for schema invariants
2187
+ try:
2188
+ pm = (
2189
+ certificate.get("primary_metric", {})
2190
+ if isinstance(certificate.get("primary_metric"), dict)
2191
+ else None
2192
+ )
2193
+ if isinstance(pm, dict) and pm:
2194
+ # Prefer existing bounds; otherwise collapse to point estimate
2195
+ disp = pm.get("display_ci")
2196
+ if not (
2197
+ isinstance(disp, list | tuple)
2198
+ and len(disp) == 2
2199
+ and all(isinstance(x, int | float) for x in disp)
2200
+ ):
2201
+ point = None
2202
+ for key in ("ratio_vs_baseline", "final", "preview"):
2203
+ val = pm.get(key)
2204
+ if isinstance(val, int | float) and math.isfinite(float(val)):
2205
+ point = float(val)
2206
+ break
2207
+ if isinstance(point, float):
2208
+ pm["display_ci"] = [point, point]
2209
+ else:
2210
+ # As last resort, emit a degenerate [1.0, 1.0] to satisfy schema invariants
2211
+ pm["display_ci"] = [1.0, 1.0]
2212
+ pm.setdefault("estimated", True)
2213
+ except Exception: # pragma: no cover
2214
+ pass
2215
+
2216
+ # Emit optional one-line telemetry summary (opt-in via INVARLOCK_TELEMETRY=1).
2217
+ # This runs after primary_metric attachment so the summary can include display_ci/width.
2052
2218
  try:
2053
2219
  kind = None
2054
2220
  pm_try = (
@@ -2135,46 +2301,6 @@ def make_certificate(
2135
2301
  except Exception: # pragma: no cover
2136
2302
  pass
2137
2303
 
2138
- # Attach/normalize primary metric block (moved to helper)
2139
- from .primary_metric_utils import attach_primary_metric as _attach_pm
2140
-
2141
- _attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
2142
- _enforce_display_ci_alignment(
2143
- ratio_ci_source,
2144
- certificate.get("primary_metric"),
2145
- logloss_delta_ci,
2146
- window_plan_profile,
2147
- )
2148
-
2149
- # Ensure primary_metric has display_ci populated for schema invariants
2150
- try:
2151
- pm = (
2152
- certificate.get("primary_metric", {})
2153
- if isinstance(certificate.get("primary_metric"), dict)
2154
- else None
2155
- )
2156
- if isinstance(pm, dict) and pm:
2157
- # Prefer existing bounds; otherwise collapse to point estimate
2158
- disp = pm.get("display_ci")
2159
- if not (
2160
- isinstance(disp, list | tuple)
2161
- and len(disp) == 2
2162
- and all(isinstance(x, int | float) for x in disp)
2163
- ):
2164
- point = None
2165
- for key in ("ratio_vs_baseline", "final", "preview"):
2166
- val = pm.get(key)
2167
- if isinstance(val, int | float) and math.isfinite(float(val)):
2168
- point = float(val)
2169
- break
2170
- if isinstance(point, float):
2171
- pm["display_ci"] = [point, point]
2172
- else:
2173
- # As last resort, emit a degenerate [1.0, 1.0] to satisfy schema invariants
2174
- pm["display_ci"] = [1.0, 1.0]
2175
- except Exception: # pragma: no cover
2176
- pass
2177
-
2178
2304
  # Attach confidence label (non-gating)
2179
2305
  try:
2180
2306
  certificate["confidence"] = _compute_confidence_label(certificate)
@@ -2208,7 +2334,7 @@ def _normalize_baseline(baseline: RunReport | dict[str, Any]) -> dict[str, Any]:
2208
2334
  }
2209
2335
  # Check if it's a RunReport structure
2210
2336
  elif "meta" in baseline and "metrics" in baseline and "edit" in baseline:
2211
- # Accept both legacy ppl_* metrics and PM-first reports
2337
+ # Accept both ppl_* metrics and PM-first reports
2212
2338
  metrics_blk = baseline.get("metrics", {}) or {}
2213
2339
  ppl_final = metrics_blk.get("ppl_final")
2214
2340
  ppl_preview = metrics_blk.get("ppl_preview")
@@ -2483,7 +2609,7 @@ def _extract_edit_metadata(
2483
2609
  algorithm = edit_section.get("algorithm")
2484
2610
  if not algorithm:
2485
2611
  algorithm = edit_name or ""
2486
- # Sanitize algorithm identifiers to purge legacy/unsupported edit labels
2612
+ # Sanitize algorithm identifiers to purge unsupported edit labels
2487
2613
  try:
2488
2614
  alg_lower = str(algorithm).strip().lower()
2489
2615
  except Exception: # pragma: no cover
@@ -3112,6 +3238,7 @@ def _compute_validation_flags(
3112
3238
  moe: dict[str, Any] | None = None,
3113
3239
  dataset_capacity: dict[str, Any] | None = None,
3114
3240
  pm_acceptance_range: dict[str, float] | None = None,
3241
+ pm_tail: dict[str, Any] | None = None,
3115
3242
  ) -> dict[str, bool]:
3116
3243
  """Compute validation flags for the certificate including canonical gates."""
3117
3244
  tier = (tier or "balanced").lower()
@@ -3223,6 +3350,45 @@ def _compute_validation_flags(
3223
3350
  except Exception: # pragma: no cover
3224
3351
  pass
3225
3352
  tokens_ok = total_tokens >= eff_min_tokens
3353
+ if not tokens_ok:
3354
+ coverage_ok = False
3355
+ try:
3356
+ coverage = _ppl_metrics.get("bootstrap", {}).get("coverage")
3357
+ if isinstance(coverage, dict):
3358
+ prev_cov = coverage.get("preview")
3359
+ fin_cov = coverage.get("final")
3360
+ if isinstance(prev_cov, dict) and isinstance(fin_cov, dict):
3361
+ prev_used = prev_cov.get("used")
3362
+ prev_req = prev_cov.get("required")
3363
+ fin_used = fin_cov.get("used")
3364
+ fin_req = fin_cov.get("required")
3365
+ prev_ok = bool(prev_cov.get("ok")) or (
3366
+ isinstance(prev_used, int | float)
3367
+ and isinstance(prev_req, int | float)
3368
+ and float(prev_used) >= float(prev_req)
3369
+ )
3370
+ fin_ok = bool(fin_cov.get("ok")) or (
3371
+ isinstance(fin_used, int | float)
3372
+ and isinstance(fin_req, int | float)
3373
+ and float(fin_used) >= float(fin_req)
3374
+ )
3375
+ coverage_ok = prev_ok and fin_ok
3376
+ except Exception: # pragma: no cover
3377
+ coverage_ok = False
3378
+
3379
+ if coverage_ok:
3380
+ try:
3381
+ tolerance_ratio = float(
3382
+ pm_policy.get("min_tokens_tolerance", 0.02) or 0.0
3383
+ )
3384
+ except Exception:
3385
+ tolerance_ratio = 0.0
3386
+ if tolerance_ratio < 0.0:
3387
+ tolerance_ratio = 0.0
3388
+ relaxed_floor = int(
3389
+ math.floor(float(eff_min_tokens) * (1.0 - tolerance_ratio))
3390
+ )
3391
+ tokens_ok = total_tokens >= max(relaxed_floor, 0)
3226
3392
  except Exception: # pragma: no cover
3227
3393
  tokens_ok = True
3228
3394
  # Under tiny_relax, treat token floors as informational only
@@ -3416,6 +3582,19 @@ def _compute_validation_flags(
3416
3582
  except Exception: # pragma: no cover
3417
3583
  pass
3418
3584
 
3585
+ # Primary metric tail gate (warn/fail; default non-blocking)
3586
+ try:
3587
+ tail_ok = True
3588
+ if isinstance(pm_tail, dict) and pm_tail:
3589
+ mode = str(pm_tail.get("mode", "warn") or "warn").strip().lower()
3590
+ evaluated = bool(pm_tail.get("evaluated", False))
3591
+ passed = bool(pm_tail.get("passed", True))
3592
+ if mode == "fail" and evaluated and (not passed):
3593
+ tail_ok = False
3594
+ flags["primary_metric_tail_acceptable"] = bool(tail_ok)
3595
+ except Exception: # pragma: no cover
3596
+ flags["primary_metric_tail_acceptable"] = True
3597
+
3419
3598
  return flags
3420
3599
 
3421
3600
 
@@ -148,6 +148,7 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
148
148
 
149
149
  _VALIDATION_ALLOWLIST_DEFAULT = {
150
150
  "primary_metric_acceptable",
151
+ "primary_metric_tail_acceptable",
151
152
  "preview_final_drift_acceptable",
152
153
  "guard_overhead_acceptable",
153
154
  "invariants_pass",
@@ -181,7 +182,7 @@ def _load_validation_allowlist() -> set[str]:
181
182
  def _validate_with_jsonschema(certificate: dict[str, Any]) -> bool:
182
183
  """Validate certificate with JSON Schema when available."""
183
184
  if jsonschema is None:
184
- return True # Schema library unavailable; fall back to legacy checks
185
+ return True # Schema library unavailable; fall back to minimal checks
185
186
  try:
186
187
  jsonschema.validate(instance=certificate, schema=CERTIFICATE_JSON_SCHEMA)
187
188
  return True
@@ -1,8 +1,21 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any
3
+ from typing import TYPE_CHECKING, Any
4
4
 
5
- from ..eval.data import EvaluationWindow, compute_window_hash
5
+ if TYPE_CHECKING:
6
+ from ..eval.data import EvaluationWindow
7
+
8
+
9
+ def compute_window_hash(window: EvaluationWindow, *, include_data: bool) -> str:
10
+ """Lazy wrapper around `invarlock.eval.data.compute_window_hash`.
11
+
12
+ Importing `invarlock.eval.data` pulls in optional heavy deps (HF datasets /
13
+ pyarrow). Keep that import off the module import path so that lightweight
14
+ reporting/helpers can be used without eagerly importing those deps.
15
+ """
16
+ from ..eval.data import compute_window_hash as _compute_window_hash
17
+
18
+ return _compute_window_hash(window, include_data=include_data)
6
19
 
7
20
 
8
21
  def compute_window_hashes(