invarlock 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. invarlock/__init__.py +2 -2
  2. invarlock/_data/runtime/tiers.yaml +57 -30
  3. invarlock/adapters/__init__.py +11 -15
  4. invarlock/adapters/auto.py +35 -40
  5. invarlock/adapters/capabilities.py +2 -2
  6. invarlock/adapters/hf_causal.py +418 -0
  7. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  8. invarlock/adapters/hf_mixin.py +25 -4
  9. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  10. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  11. invarlock/calibration/spectral_null.py +15 -10
  12. invarlock/calibration/variance_ve.py +0 -2
  13. invarlock/cli/adapter_auto.py +31 -21
  14. invarlock/cli/app.py +73 -2
  15. invarlock/cli/commands/calibrate.py +6 -2
  16. invarlock/cli/commands/certify.py +651 -91
  17. invarlock/cli/commands/doctor.py +11 -11
  18. invarlock/cli/commands/explain_gates.py +57 -8
  19. invarlock/cli/commands/plugins.py +13 -9
  20. invarlock/cli/commands/report.py +233 -69
  21. invarlock/cli/commands/run.py +1066 -244
  22. invarlock/cli/commands/verify.py +154 -15
  23. invarlock/cli/config.py +22 -6
  24. invarlock/cli/doctor_helpers.py +4 -5
  25. invarlock/cli/output.py +193 -0
  26. invarlock/cli/provenance.py +1 -1
  27. invarlock/core/api.py +45 -5
  28. invarlock/core/auto_tuning.py +65 -20
  29. invarlock/core/bootstrap.py +1 -1
  30. invarlock/core/contracts.py +7 -1
  31. invarlock/core/registry.py +11 -13
  32. invarlock/core/runner.py +425 -75
  33. invarlock/edits/quant_rtn.py +65 -37
  34. invarlock/eval/bench.py +3 -16
  35. invarlock/eval/data.py +82 -51
  36. invarlock/eval/metrics.py +63 -2
  37. invarlock/eval/primary_metric.py +23 -0
  38. invarlock/eval/tail_stats.py +230 -0
  39. invarlock/eval/tasks/__init__.py +12 -0
  40. invarlock/eval/tasks/classification.py +48 -0
  41. invarlock/eval/tasks/qa.py +36 -0
  42. invarlock/eval/tasks/text_generation.py +102 -0
  43. invarlock/guards/_estimators.py +154 -0
  44. invarlock/guards/invariants.py +19 -10
  45. invarlock/guards/policies.py +16 -6
  46. invarlock/guards/rmt.py +627 -546
  47. invarlock/guards/spectral.py +348 -110
  48. invarlock/guards/tier_config.py +32 -30
  49. invarlock/guards/variance.py +7 -31
  50. invarlock/guards_ref/rmt_ref.py +23 -23
  51. invarlock/model_profile.py +90 -42
  52. invarlock/observability/health.py +6 -6
  53. invarlock/observability/metrics.py +108 -0
  54. invarlock/reporting/certificate.py +384 -55
  55. invarlock/reporting/certificate_schema.py +3 -2
  56. invarlock/reporting/dataset_hashing.py +15 -2
  57. invarlock/reporting/guards_analysis.py +350 -277
  58. invarlock/reporting/html.py +55 -5
  59. invarlock/reporting/normalizer.py +13 -0
  60. invarlock/reporting/policy_utils.py +38 -36
  61. invarlock/reporting/primary_metric_utils.py +71 -17
  62. invarlock/reporting/render.py +852 -431
  63. invarlock/reporting/report.py +40 -4
  64. invarlock/reporting/report_types.py +11 -3
  65. invarlock/reporting/telemetry.py +86 -0
  66. invarlock/reporting/validate.py +1 -18
  67. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/METADATA +27 -13
  68. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/RECORD +72 -65
  69. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
  70. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
  71. invarlock/adapters/hf_gpt2.py +0 -404
  72. invarlock/adapters/hf_llama.py +0 -487
  73. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
  74. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,9 @@
1
1
  """
2
- InvarLock Safety Certificate Generation
3
- ==================================
2
+ InvarLock Evaluation Certificate Generation
3
+ ==========================================
4
4
 
5
- Generate standardized safety certificates from RunReport and baseline comparison.
5
+ Generate standardized evaluation certificates from RunReport and baseline
6
+ comparison.
6
7
  Certificates are standalone, portable verification artifacts that can be used
7
8
  for CI/CD gates and regulatory compliance.
8
9
  """
@@ -35,6 +36,7 @@ from invarlock.core.bootstrap import (
35
36
  logspace_to_ratio_ci,
36
37
  )
37
38
  from invarlock.eval.primary_metric import compute_primary_metric_from_report, get_metric
39
+ from invarlock.eval.tail_stats import evaluate_metric_tail
38
40
  from invarlock.utils.digest import hash_json
39
41
 
40
42
  from . import certificate_schema as _cert_schema
@@ -81,7 +83,7 @@ TIER_RATIO_LIMITS: dict[str, float] = {
81
83
  def _is_ppl_kind(name: Any) -> bool:
82
84
  """Return True if a primary_metric kind denotes a ppl-like metric.
83
85
 
84
- Supports legacy and alternate names to stay resilient across schema variants.
86
+ Supports alternate names to stay resilient across schema variants.
85
87
  """
86
88
  try:
87
89
  n = str(name or "").lower()
@@ -100,7 +102,7 @@ def _is_ppl_kind(name: Any) -> bool:
100
102
  }
101
103
 
102
104
 
103
- ## NOTE: Deprecated legacy helper `_get_ppl_final` was removed; callers should
105
+ ## NOTE: Deprecated helper `_get_ppl_final` was removed; callers should
104
106
  ## use the normalized primary_metric block directly via make_certificate or
105
107
  ## report processing utilities.
106
108
 
@@ -391,6 +393,7 @@ def _compute_thresholds_hash(payload: dict[str, Any]) -> str:
391
393
  # Allow-list loader with safe defaults for validation keys
392
394
  _VALIDATION_ALLOWLIST_DEFAULT = {
393
395
  "primary_metric_acceptable",
396
+ "primary_metric_tail_acceptable",
394
397
  "preview_final_drift_acceptable",
395
398
  "guard_overhead_acceptable",
396
399
  "invariants_pass",
@@ -741,7 +744,7 @@ def make_certificate(
741
744
  baseline: RunReport | dict[str, Any],
742
745
  ) -> dict[str, Any]:
743
746
  """
744
- Generate a safety certificate from a RunReport and baseline comparison.
747
+ Generate an evaluation certificate from a RunReport and baseline comparison.
745
748
 
746
749
  The certificate is a standalone, portable artifact that contains all
747
750
  essential metrics and comparisons needed for safety verification.
@@ -762,6 +765,17 @@ def make_certificate(
762
765
  # Normalize baseline input
763
766
  baseline_raw = baseline
764
767
  baseline_normalized = _normalize_baseline(baseline_raw)
768
+ baseline_report: RunReport | None = None
769
+ try:
770
+ if (
771
+ isinstance(baseline_raw, dict)
772
+ and "meta" in baseline_raw
773
+ and "metrics" in baseline_raw
774
+ and "edit" in baseline_raw
775
+ ):
776
+ baseline_report = _normalize_and_validate_report(baseline_raw)
777
+ except Exception: # pragma: no cover - baseline compare is best-effort
778
+ baseline_report = None
765
779
 
766
780
  # Extract core metadata with full seed bundle
767
781
  meta = _extract_certificate_meta(report)
@@ -792,6 +806,19 @@ def make_certificate(
792
806
  except Exception: # pragma: no cover
793
807
  pass
794
808
 
809
+ # Execution profile provenance when available via run context.
810
+ try:
811
+ ctx = report.get("context") if isinstance(report, dict) else None
812
+ ctx_profile = (
813
+ str(ctx.get("profile") or "").strip().lower()
814
+ if isinstance(ctx, dict)
815
+ else ""
816
+ )
817
+ if ctx_profile:
818
+ meta["profile"] = ctx_profile
819
+ except Exception: # pragma: no cover
820
+ pass
821
+
795
822
  tokenizer_hash_meta = report["meta"].get("tokenizer_hash")
796
823
  if not tokenizer_hash_meta:
797
824
  dataset_section = report.get("data", {})
@@ -1425,7 +1452,7 @@ def make_certificate(
1425
1452
  ppl_analysis["window_plan"] = window_plan_ctx
1426
1453
 
1427
1454
  # Extract invariant status
1428
- invariants = _extract_invariants(report)
1455
+ invariants = _extract_invariants(report, baseline=baseline_report)
1429
1456
 
1430
1457
  # Extract spectral analysis
1431
1458
  spectral = _extract_spectral_analysis(report, baseline_normalized)
@@ -1518,7 +1545,10 @@ def make_certificate(
1518
1545
  )
1519
1546
  overrides_list = _extract_policy_overrides(report)
1520
1547
  resolved_digest = _compute_policy_digest(
1521
- {"resolved_policy": resolved_policy, "overrides": overrides_list}
1548
+ {
1549
+ "resolved_policy": resolved_policy,
1550
+ "overrides": overrides_list,
1551
+ }
1522
1552
  )
1523
1553
  policy_provenance = {
1524
1554
  "tier": auto.get("tier", "balanced"),
@@ -1540,7 +1570,13 @@ def make_certificate(
1540
1570
  telemetry: dict[str, Any] = {}
1541
1571
  metrics_section = report.get("metrics", {})
1542
1572
  if isinstance(metrics_section, dict):
1543
- for key in ("latency_ms_per_tok", "memory_mb_peak", "throughput_tok_per_s"):
1573
+ for key in (
1574
+ "latency_ms_per_tok",
1575
+ "memory_mb_peak",
1576
+ "gpu_memory_mb_peak",
1577
+ "gpu_memory_reserved_mb_peak",
1578
+ "throughput_tok_per_s",
1579
+ ):
1544
1580
  value = metrics_section.get(key)
1545
1581
  if isinstance(value, int | float) and math.isfinite(value):
1546
1582
  telemetry[key] = float(value)
@@ -1737,6 +1773,105 @@ def make_certificate(
1737
1773
  capacity_examples = None
1738
1774
 
1739
1775
  pm_acceptance_range = _resolve_pm_acceptance_range_from_report(report)
1776
+ pm_drift_band = _resolve_pm_drift_band_from_report(report)
1777
+
1778
+ # Primary metric tail evidence and gate evaluation (ΔlogNLL vs baseline, per-window).
1779
+ pm_tail_result: dict[str, Any] = {}
1780
+ try:
1781
+ pm_kind = None
1782
+ try:
1783
+ pm_block = (
1784
+ report.get("metrics", {}).get("primary_metric")
1785
+ if isinstance(report.get("metrics"), dict)
1786
+ else None
1787
+ )
1788
+ if isinstance(pm_block, dict):
1789
+ pm_kind = pm_block.get("kind")
1790
+ except Exception: # pragma: no cover
1791
+ pm_kind = None
1792
+
1793
+ pm_tail_policy: dict[str, Any] = {}
1794
+ try:
1795
+ metrics_pol = (
1796
+ resolved_policy.get("metrics", {})
1797
+ if isinstance(resolved_policy, dict)
1798
+ else {}
1799
+ )
1800
+ if isinstance(metrics_pol, dict) and isinstance(
1801
+ metrics_pol.get("pm_tail"), dict
1802
+ ):
1803
+ pm_tail_policy = dict(metrics_pol.get("pm_tail") or {})
1804
+ except Exception: # pragma: no cover
1805
+ pm_tail_policy = {}
1806
+
1807
+ deltas: list[float] = []
1808
+ weights: list[float] = []
1809
+ if _is_ppl_kind(pm_kind):
1810
+ run_windows = (
1811
+ report.get("evaluation_windows", {}).get("final", {})
1812
+ if isinstance(report.get("evaluation_windows"), dict)
1813
+ else {}
1814
+ )
1815
+ base_windows = (
1816
+ baseline_normalized.get("evaluation_windows", {}).get("final", {})
1817
+ if isinstance(baseline_normalized.get("evaluation_windows"), dict)
1818
+ else {}
1819
+ )
1820
+ run_ids = (
1821
+ run_windows.get("window_ids") if isinstance(run_windows, dict) else None
1822
+ )
1823
+ run_ll = (
1824
+ run_windows.get("logloss") if isinstance(run_windows, dict) else None
1825
+ )
1826
+ run_tc = (
1827
+ run_windows.get("token_counts")
1828
+ if isinstance(run_windows, dict)
1829
+ else None
1830
+ )
1831
+ base_ids = (
1832
+ base_windows.get("window_ids")
1833
+ if isinstance(base_windows, dict)
1834
+ else None
1835
+ )
1836
+ base_ll = (
1837
+ base_windows.get("logloss") if isinstance(base_windows, dict) else None
1838
+ )
1839
+ if (
1840
+ isinstance(run_ids, list)
1841
+ and isinstance(run_ll, list)
1842
+ and isinstance(base_ids, list)
1843
+ and isinstance(base_ll, list)
1844
+ ):
1845
+ base_map: dict[int, float] = {}
1846
+ for b_id, b_val in zip(base_ids, base_ll, strict=False):
1847
+ if isinstance(b_id, int | float) and isinstance(b_val, int | float):
1848
+ base_map[int(b_id)] = float(b_val)
1849
+ for idx, (r_id, r_val) in enumerate(zip(run_ids, run_ll, strict=False)):
1850
+ if not (
1851
+ isinstance(r_id, int | float) and isinstance(r_val, int | float)
1852
+ ):
1853
+ continue
1854
+ key = int(r_id)
1855
+ if key not in base_map:
1856
+ continue
1857
+ dv = float(r_val) - base_map[key]
1858
+ if math.isfinite(dv):
1859
+ deltas.append(float(dv))
1860
+ if isinstance(run_tc, list) and idx < len(run_tc):
1861
+ try:
1862
+ wv = float(run_tc[idx])
1863
+ except Exception:
1864
+ wv = 0.0
1865
+ weights.append(float(max(wv, 0.0)))
1866
+
1867
+ pm_tail_result = evaluate_metric_tail(
1868
+ deltas=deltas,
1869
+ weights=weights if (weights and len(weights) == len(deltas)) else None,
1870
+ policy=pm_tail_policy,
1871
+ )
1872
+ pm_tail_result["source"] = "paired_baseline.final"
1873
+ except Exception: # pragma: no cover
1874
+ pm_tail_result = {"mode": "warn", "evaluated": False, "passed": True}
1740
1875
 
1741
1876
  validation_kwargs = {
1742
1877
  "ppl": ppl_analysis,
@@ -1765,7 +1900,20 @@ def make_certificate(
1765
1900
  except Exception: # pragma: no cover - defensive against patched functions
1766
1901
  validation_kwargs["pm_acceptance_range"] = pm_acceptance_range
1767
1902
 
1903
+ try:
1904
+ if "pm_drift_band" in inspect.signature(_compute_validation_flags).parameters:
1905
+ validation_kwargs["pm_drift_band"] = pm_drift_band
1906
+ except Exception: # pragma: no cover - defensive against patched functions
1907
+ validation_kwargs["pm_drift_band"] = pm_drift_band
1908
+
1909
+ try:
1910
+ if "pm_tail" in inspect.signature(_compute_validation_flags).parameters:
1911
+ validation_kwargs["pm_tail"] = pm_tail_result
1912
+ except Exception: # pragma: no cover - defensive against patched functions
1913
+ validation_kwargs["pm_tail"] = pm_tail_result
1914
+
1768
1915
  validation_flags = _compute_validation_flags(**validation_kwargs)
1916
+
1769
1917
  # Enforce validation key allow-list to prevent surface drift
1770
1918
  _allowed_validation = _load_validation_allowlist()
1771
1919
  validation_filtered = {
@@ -1797,6 +1945,7 @@ def make_certificate(
1797
1945
  "artifacts": artifacts_payload,
1798
1946
  "validation": validation_filtered,
1799
1947
  "guard_overhead": guard_overhead_section,
1948
+ "primary_metric_tail": pm_tail_result,
1800
1949
  }
1801
1950
 
1802
1951
  # Record tiny-relax provenance explicitly when active (dev-only demos)
@@ -2048,7 +2197,56 @@ def make_certificate(
2048
2197
  except Exception: # pragma: no cover
2049
2198
  pass
2050
2199
 
2051
- # Emit optional one-line telemetry summary (opt-in via INVARLOCK_TELEMETRY=1)
2200
+ # Attach/normalize primary metric block (moved to helper)
2201
+ from .primary_metric_utils import attach_primary_metric as _attach_pm
2202
+
2203
+ _attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
2204
+ try:
2205
+ if isinstance(pm_drift_band, dict) and pm_drift_band:
2206
+ pm_block = certificate.get("primary_metric")
2207
+ if isinstance(pm_block, dict):
2208
+ pm_block.setdefault("drift_band", dict(pm_drift_band))
2209
+ except Exception: # pragma: no cover
2210
+ pass
2211
+ _enforce_display_ci_alignment(
2212
+ ratio_ci_source,
2213
+ certificate.get("primary_metric"),
2214
+ logloss_delta_ci,
2215
+ window_plan_profile,
2216
+ )
2217
+
2218
+ # Ensure primary_metric has display_ci populated for schema invariants
2219
+ try:
2220
+ pm = (
2221
+ certificate.get("primary_metric", {})
2222
+ if isinstance(certificate.get("primary_metric"), dict)
2223
+ else None
2224
+ )
2225
+ if isinstance(pm, dict) and pm:
2226
+ # Prefer existing bounds; otherwise collapse to point estimate
2227
+ disp = pm.get("display_ci")
2228
+ if not (
2229
+ isinstance(disp, list | tuple)
2230
+ and len(disp) == 2
2231
+ and all(isinstance(x, int | float) for x in disp)
2232
+ ):
2233
+ point = None
2234
+ for key in ("ratio_vs_baseline", "final", "preview"):
2235
+ val = pm.get(key)
2236
+ if isinstance(val, int | float) and math.isfinite(float(val)):
2237
+ point = float(val)
2238
+ break
2239
+ if isinstance(point, float):
2240
+ pm["display_ci"] = [point, point]
2241
+ else:
2242
+ # As last resort, emit a degenerate [1.0, 1.0] to satisfy schema invariants
2243
+ pm["display_ci"] = [1.0, 1.0]
2244
+ pm.setdefault("estimated", True)
2245
+ except Exception: # pragma: no cover
2246
+ pass
2247
+
2248
+ # Emit optional one-line telemetry summary (opt-in via INVARLOCK_TELEMETRY=1).
2249
+ # This runs after primary_metric attachment so the summary can include display_ci/width.
2052
2250
  try:
2053
2251
  kind = None
2054
2252
  pm_try = (
@@ -2135,46 +2333,6 @@ def make_certificate(
2135
2333
  except Exception: # pragma: no cover
2136
2334
  pass
2137
2335
 
2138
- # Attach/normalize primary metric block (moved to helper)
2139
- from .primary_metric_utils import attach_primary_metric as _attach_pm
2140
-
2141
- _attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
2142
- _enforce_display_ci_alignment(
2143
- ratio_ci_source,
2144
- certificate.get("primary_metric"),
2145
- logloss_delta_ci,
2146
- window_plan_profile,
2147
- )
2148
-
2149
- # Ensure primary_metric has display_ci populated for schema invariants
2150
- try:
2151
- pm = (
2152
- certificate.get("primary_metric", {})
2153
- if isinstance(certificate.get("primary_metric"), dict)
2154
- else None
2155
- )
2156
- if isinstance(pm, dict) and pm:
2157
- # Prefer existing bounds; otherwise collapse to point estimate
2158
- disp = pm.get("display_ci")
2159
- if not (
2160
- isinstance(disp, list | tuple)
2161
- and len(disp) == 2
2162
- and all(isinstance(x, int | float) for x in disp)
2163
- ):
2164
- point = None
2165
- for key in ("ratio_vs_baseline", "final", "preview"):
2166
- val = pm.get(key)
2167
- if isinstance(val, int | float) and math.isfinite(float(val)):
2168
- point = float(val)
2169
- break
2170
- if isinstance(point, float):
2171
- pm["display_ci"] = [point, point]
2172
- else:
2173
- # As last resort, emit a degenerate [1.0, 1.0] to satisfy schema invariants
2174
- pm["display_ci"] = [1.0, 1.0]
2175
- except Exception: # pragma: no cover
2176
- pass
2177
-
2178
2336
  # Attach confidence label (non-gating)
2179
2337
  try:
2180
2338
  certificate["confidence"] = _compute_confidence_label(certificate)
@@ -2208,7 +2366,7 @@ def _normalize_baseline(baseline: RunReport | dict[str, Any]) -> dict[str, Any]:
2208
2366
  }
2209
2367
  # Check if it's a RunReport structure
2210
2368
  elif "meta" in baseline and "metrics" in baseline and "edit" in baseline:
2211
- # Accept both legacy ppl_* metrics and PM-first reports
2369
+ # Accept both ppl_* metrics and PM-first reports
2212
2370
  metrics_blk = baseline.get("metrics", {}) or {}
2213
2371
  ppl_final = metrics_blk.get("ppl_final")
2214
2372
  ppl_preview = metrics_blk.get("ppl_preview")
@@ -2483,12 +2641,12 @@ def _extract_edit_metadata(
2483
2641
  algorithm = edit_section.get("algorithm")
2484
2642
  if not algorithm:
2485
2643
  algorithm = edit_name or ""
2486
- # Sanitize algorithm identifiers to purge legacy/unsupported edit labels
2644
+ # Sanitize algorithm identifiers to purge unsupported edit labels
2487
2645
  try:
2488
2646
  alg_lower = str(algorithm).strip().lower()
2489
2647
  except Exception: # pragma: no cover
2490
2648
  alg_lower = ""
2491
- allowed_algorithms = {"quant_rtn", "noop"}
2649
+ allowed_algorithms = {"quant_rtn", "noop", "custom"}
2492
2650
  if alg_lower not in allowed_algorithms:
2493
2651
  algorithm = ""
2494
2652
 
@@ -3099,6 +3257,105 @@ def _resolve_pm_acceptance_range_from_report(
3099
3257
  return {"min": float(min_val), "max": float(max_val)}
3100
3258
 
3101
3259
 
3260
+ def _resolve_pm_drift_band_from_report(
3261
+ report: dict[str, Any] | None,
3262
+ ) -> dict[str, float]:
3263
+ """Resolve preview→final drift band from report context/meta/env."""
3264
+
3265
+ base_min = 0.95
3266
+ base_max = 1.05
3267
+
3268
+ def _safe_float(val: Any) -> float | None:
3269
+ try:
3270
+ if val is None:
3271
+ return None
3272
+ out = float(val)
3273
+ except Exception:
3274
+ return None
3275
+ return out if math.isfinite(out) else None
3276
+
3277
+ cfg_min = None
3278
+ cfg_max = None
3279
+
3280
+ ctx = report.get("context") if isinstance(report, dict) else None
3281
+ if isinstance(ctx, dict):
3282
+ pm_ctx = ctx.get("primary_metric")
3283
+ if isinstance(pm_ctx, dict):
3284
+ band = pm_ctx.get("drift_band")
3285
+ if isinstance(band, dict):
3286
+ cfg_min = _safe_float(band.get("min"))
3287
+ cfg_max = _safe_float(band.get("max"))
3288
+ elif isinstance(band, list | tuple) and len(band) == 2:
3289
+ cfg_min = _safe_float(band[0])
3290
+ cfg_max = _safe_float(band[1])
3291
+ if cfg_min is None or cfg_max is None:
3292
+ alt = ctx.get("pm_drift_band")
3293
+ if isinstance(alt, dict):
3294
+ cfg_min = (
3295
+ cfg_min if cfg_min is not None else _safe_float(alt.get("min"))
3296
+ )
3297
+ cfg_max = (
3298
+ cfg_max if cfg_max is not None else _safe_float(alt.get("max"))
3299
+ )
3300
+
3301
+ if (cfg_min is None or cfg_max is None) and isinstance(report, dict):
3302
+ meta = report.get("meta")
3303
+ if isinstance(meta, dict):
3304
+ meta_band = meta.get("pm_drift_band")
3305
+ if isinstance(meta_band, dict):
3306
+ cfg_min = (
3307
+ cfg_min
3308
+ if cfg_min is not None
3309
+ else _safe_float(meta_band.get("min"))
3310
+ )
3311
+ cfg_max = (
3312
+ cfg_max
3313
+ if cfg_max is not None
3314
+ else _safe_float(meta_band.get("max"))
3315
+ )
3316
+
3317
+ def _parse_env(name: str) -> float | None:
3318
+ try:
3319
+ raw = os.environ.get(name, "")
3320
+ if raw is None or str(raw).strip() == "":
3321
+ return None
3322
+ return float(raw)
3323
+ except Exception:
3324
+ return None
3325
+
3326
+ env_min = _parse_env("INVARLOCK_PM_DRIFT_MIN")
3327
+ env_max = _parse_env("INVARLOCK_PM_DRIFT_MAX")
3328
+
3329
+ has_explicit = any(v is not None for v in (cfg_min, cfg_max, env_min, env_max))
3330
+ if not has_explicit:
3331
+ return {}
3332
+
3333
+ min_val = (
3334
+ env_min if env_min is not None else cfg_min if cfg_min is not None else base_min
3335
+ )
3336
+ max_val = (
3337
+ env_max if env_max is not None else cfg_max if cfg_max is not None else base_max
3338
+ )
3339
+
3340
+ try:
3341
+ if min_val is not None and min_val <= 0:
3342
+ min_val = base_min
3343
+ except Exception:
3344
+ min_val = base_min
3345
+ try:
3346
+ if max_val is not None and max_val <= 0:
3347
+ max_val = base_max
3348
+ except Exception:
3349
+ max_val = base_max
3350
+ try:
3351
+ if min_val is not None and max_val is not None and min_val >= max_val:
3352
+ min_val, max_val = base_min, base_max
3353
+ except Exception:
3354
+ min_val, max_val = base_min, base_max
3355
+
3356
+ return {"min": float(min_val), "max": float(max_val)}
3357
+
3358
+
3102
3359
  def _compute_validation_flags(
3103
3360
  ppl: dict[str, Any],
3104
3361
  spectral: dict[str, Any],
@@ -3112,6 +3369,8 @@ def _compute_validation_flags(
3112
3369
  moe: dict[str, Any] | None = None,
3113
3370
  dataset_capacity: dict[str, Any] | None = None,
3114
3371
  pm_acceptance_range: dict[str, float] | None = None,
3372
+ pm_drift_band: dict[str, float] | None = None,
3373
+ pm_tail: dict[str, Any] | None = None,
3115
3374
  ) -> dict[str, bool]:
3116
3375
  """Compute validation flags for the certificate including canonical gates."""
3117
3376
  tier = (tier or "balanced").lower()
@@ -3174,9 +3433,27 @@ def _compute_validation_flags(
3174
3433
  ratio_limit = min(ratio_limit, float(target_ratio))
3175
3434
 
3176
3435
  # Canonical Gates
3177
- # 1. Drift gate: 0.95 ≤ final/preview ≤ 1.05
3436
+ # 1. Drift gate: by default 0.95 ≤ final/preview ≤ 1.05 (configurable)
3178
3437
  drift_ratio = ppl.get("preview_final_ratio", 1.0)
3179
- preview_final_drift_acceptable = 0.95 <= drift_ratio <= 1.05
3438
+ drift_min = 0.95
3439
+ drift_max = 1.05
3440
+ if isinstance(pm_drift_band, dict):
3441
+ try:
3442
+ cand_min = pm_drift_band.get("min")
3443
+ cand_max = pm_drift_band.get("max")
3444
+ if isinstance(cand_min, int | float) and isinstance(cand_max, int | float):
3445
+ cand_min_f = float(cand_min)
3446
+ cand_max_f = float(cand_max)
3447
+ if (
3448
+ math.isfinite(cand_min_f)
3449
+ and math.isfinite(cand_max_f)
3450
+ and 0 < cand_min_f < cand_max_f
3451
+ ):
3452
+ drift_min = cand_min_f
3453
+ drift_max = cand_max_f
3454
+ except Exception: # pragma: no cover
3455
+ pass
3456
+ preview_final_drift_acceptable = drift_min <= drift_ratio <= drift_max
3180
3457
  if _tiny_relax:
3181
3458
  # Treat drift identity as informational in tiny dev demos
3182
3459
  preview_final_drift_acceptable = True
@@ -3223,6 +3500,45 @@ def _compute_validation_flags(
3223
3500
  except Exception: # pragma: no cover
3224
3501
  pass
3225
3502
  tokens_ok = total_tokens >= eff_min_tokens
3503
+ if not tokens_ok:
3504
+ coverage_ok = False
3505
+ try:
3506
+ coverage = _ppl_metrics.get("bootstrap", {}).get("coverage")
3507
+ if isinstance(coverage, dict):
3508
+ prev_cov = coverage.get("preview")
3509
+ fin_cov = coverage.get("final")
3510
+ if isinstance(prev_cov, dict) and isinstance(fin_cov, dict):
3511
+ prev_used = prev_cov.get("used")
3512
+ prev_req = prev_cov.get("required")
3513
+ fin_used = fin_cov.get("used")
3514
+ fin_req = fin_cov.get("required")
3515
+ prev_ok = bool(prev_cov.get("ok")) or (
3516
+ isinstance(prev_used, int | float)
3517
+ and isinstance(prev_req, int | float)
3518
+ and float(prev_used) >= float(prev_req)
3519
+ )
3520
+ fin_ok = bool(fin_cov.get("ok")) or (
3521
+ isinstance(fin_used, int | float)
3522
+ and isinstance(fin_req, int | float)
3523
+ and float(fin_used) >= float(fin_req)
3524
+ )
3525
+ coverage_ok = prev_ok and fin_ok
3526
+ except Exception: # pragma: no cover
3527
+ coverage_ok = False
3528
+
3529
+ if coverage_ok:
3530
+ try:
3531
+ tolerance_ratio = float(
3532
+ pm_policy.get("min_tokens_tolerance", 0.02) or 0.0
3533
+ )
3534
+ except Exception:
3535
+ tolerance_ratio = 0.0
3536
+ if tolerance_ratio < 0.0:
3537
+ tolerance_ratio = 0.0
3538
+ relaxed_floor = int(
3539
+ math.floor(float(eff_min_tokens) * (1.0 - tolerance_ratio))
3540
+ )
3541
+ tokens_ok = total_tokens >= max(relaxed_floor, 0)
3226
3542
  except Exception: # pragma: no cover
3227
3543
  tokens_ok = True
3228
3544
  # Under tiny_relax, treat token floors as informational only
@@ -3416,6 +3732,19 @@ def _compute_validation_flags(
3416
3732
  except Exception: # pragma: no cover
3417
3733
  pass
3418
3734
 
3735
+ # Primary metric tail gate (warn/fail; default non-blocking)
3736
+ try:
3737
+ tail_ok = True
3738
+ if isinstance(pm_tail, dict) and pm_tail:
3739
+ mode = str(pm_tail.get("mode", "warn") or "warn").strip().lower()
3740
+ evaluated = bool(pm_tail.get("evaluated", False))
3741
+ passed = bool(pm_tail.get("passed", True))
3742
+ if mode == "fail" and evaluated and (not passed):
3743
+ tail_ok = False
3744
+ flags["primary_metric_tail_acceptable"] = bool(tail_ok)
3745
+ except Exception: # pragma: no cover
3746
+ flags["primary_metric_tail_acceptable"] = True
3747
+
3419
3748
  return flags
3420
3749
 
3421
3750
 
@@ -20,7 +20,7 @@ CERTIFICATE_SCHEMA_VERSION = "v1"
20
20
  # separately in metric-specific logic.
21
21
  CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
22
22
  "$schema": "https://json-schema.org/draft/2020-12/schema",
23
- "title": "InvarLock Safety Certificate",
23
+ "title": "InvarLock Evaluation Certificate",
24
24
  "type": "object",
25
25
  "required": [
26
26
  "schema_version",
@@ -148,6 +148,7 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
148
148
 
149
149
  _VALIDATION_ALLOWLIST_DEFAULT = {
150
150
  "primary_metric_acceptable",
151
+ "primary_metric_tail_acceptable",
151
152
  "preview_final_drift_acceptable",
152
153
  "guard_overhead_acceptable",
153
154
  "invariants_pass",
@@ -181,7 +182,7 @@ def _load_validation_allowlist() -> set[str]:
181
182
  def _validate_with_jsonschema(certificate: dict[str, Any]) -> bool:
182
183
  """Validate certificate with JSON Schema when available."""
183
184
  if jsonschema is None:
184
- return True # Schema library unavailable; fall back to legacy checks
185
+ return True # Schema library unavailable; fall back to minimal checks
185
186
  try:
186
187
  jsonschema.validate(instance=certificate, schema=CERTIFICATE_JSON_SCHEMA)
187
188
  return True
@@ -1,8 +1,21 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any
3
+ from typing import TYPE_CHECKING, Any
4
4
 
5
- from ..eval.data import EvaluationWindow, compute_window_hash
5
+ if TYPE_CHECKING:
6
+ from ..eval.data import EvaluationWindow
7
+
8
+
9
+ def compute_window_hash(window: EvaluationWindow, *, include_data: bool) -> str:
10
+ """Lazy wrapper around `invarlock.eval.data.compute_window_hash`.
11
+
12
+ Importing `invarlock.eval.data` pulls in optional heavy deps (HF datasets /
13
+ pyarrow). Keep that import off the module import path so that lightweight
14
+ reporting/helpers can be used without eagerly importing those deps.
15
+ """
16
+ from ..eval.data import compute_window_hash as _compute_window_hash
17
+
18
+ return _compute_window_hash(window, include_data=include_data)
6
19
 
7
20
 
8
21
  def compute_window_hashes(