invarlock 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +1 -1
- invarlock/_data/runtime/tiers.yaml +57 -30
- invarlock/adapters/__init__.py +1 -1
- invarlock/calibration/spectral_null.py +15 -10
- invarlock/calibration/variance_ve.py +0 -2
- invarlock/cli/commands/calibrate.py +6 -2
- invarlock/cli/commands/certify.py +58 -39
- invarlock/cli/commands/doctor.py +3 -1
- invarlock/cli/commands/explain_gates.py +57 -8
- invarlock/cli/commands/report.py +1 -1
- invarlock/cli/commands/run.py +159 -61
- invarlock/cli/commands/verify.py +78 -4
- invarlock/cli/config.py +21 -5
- invarlock/core/api.py +45 -5
- invarlock/core/auto_tuning.py +65 -20
- invarlock/core/contracts.py +7 -1
- invarlock/core/registry.py +2 -2
- invarlock/core/runner.py +314 -50
- invarlock/eval/bench.py +0 -13
- invarlock/eval/data.py +73 -283
- invarlock/eval/metrics.py +134 -4
- invarlock/eval/primary_metric.py +23 -0
- invarlock/eval/tail_stats.py +230 -0
- invarlock/guards/_estimators.py +154 -0
- invarlock/guards/policies.py +16 -6
- invarlock/guards/rmt.py +625 -544
- invarlock/guards/spectral.py +348 -110
- invarlock/guards/tier_config.py +32 -30
- invarlock/guards/variance.py +5 -29
- invarlock/guards_ref/rmt_ref.py +23 -23
- invarlock/model_profile.py +42 -15
- invarlock/reporting/certificate.py +225 -46
- invarlock/reporting/certificate_schema.py +2 -1
- invarlock/reporting/dataset_hashing.py +15 -2
- invarlock/reporting/guards_analysis.py +197 -274
- invarlock/reporting/normalizer.py +6 -0
- invarlock/reporting/policy_utils.py +38 -36
- invarlock/reporting/primary_metric_utils.py +71 -17
- invarlock/reporting/render.py +61 -0
- invarlock/reporting/report.py +1 -1
- invarlock/reporting/report_types.py +5 -2
- invarlock/reporting/validate.py +1 -18
- {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/METADATA +6 -6
- {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/RECORD +48 -46
- {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/WHEEL +0 -0
- {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/top_level.txt +0 -0
|
@@ -35,6 +35,7 @@ from invarlock.core.bootstrap import (
|
|
|
35
35
|
logspace_to_ratio_ci,
|
|
36
36
|
)
|
|
37
37
|
from invarlock.eval.primary_metric import compute_primary_metric_from_report, get_metric
|
|
38
|
+
from invarlock.eval.tail_stats import evaluate_metric_tail
|
|
38
39
|
from invarlock.utils.digest import hash_json
|
|
39
40
|
|
|
40
41
|
from . import certificate_schema as _cert_schema
|
|
@@ -81,7 +82,7 @@ TIER_RATIO_LIMITS: dict[str, float] = {
|
|
|
81
82
|
def _is_ppl_kind(name: Any) -> bool:
|
|
82
83
|
"""Return True if a primary_metric kind denotes a ppl-like metric.
|
|
83
84
|
|
|
84
|
-
Supports
|
|
85
|
+
Supports alternate names to stay resilient across schema variants.
|
|
85
86
|
"""
|
|
86
87
|
try:
|
|
87
88
|
n = str(name or "").lower()
|
|
@@ -100,7 +101,7 @@ def _is_ppl_kind(name: Any) -> bool:
|
|
|
100
101
|
}
|
|
101
102
|
|
|
102
103
|
|
|
103
|
-
## NOTE: Deprecated
|
|
104
|
+
## NOTE: Deprecated helper `_get_ppl_final` was removed; callers should
|
|
104
105
|
## use the normalized primary_metric block directly via make_certificate or
|
|
105
106
|
## report processing utilities.
|
|
106
107
|
|
|
@@ -391,6 +392,7 @@ def _compute_thresholds_hash(payload: dict[str, Any]) -> str:
|
|
|
391
392
|
# Allow-list loader with safe defaults for validation keys
|
|
392
393
|
_VALIDATION_ALLOWLIST_DEFAULT = {
|
|
393
394
|
"primary_metric_acceptable",
|
|
395
|
+
"primary_metric_tail_acceptable",
|
|
394
396
|
"preview_final_drift_acceptable",
|
|
395
397
|
"guard_overhead_acceptable",
|
|
396
398
|
"invariants_pass",
|
|
@@ -792,6 +794,19 @@ def make_certificate(
|
|
|
792
794
|
except Exception: # pragma: no cover
|
|
793
795
|
pass
|
|
794
796
|
|
|
797
|
+
# Execution profile provenance when available via run context.
|
|
798
|
+
try:
|
|
799
|
+
ctx = report.get("context") if isinstance(report, dict) else None
|
|
800
|
+
ctx_profile = (
|
|
801
|
+
str(ctx.get("profile") or "").strip().lower()
|
|
802
|
+
if isinstance(ctx, dict)
|
|
803
|
+
else ""
|
|
804
|
+
)
|
|
805
|
+
if ctx_profile:
|
|
806
|
+
meta["profile"] = ctx_profile
|
|
807
|
+
except Exception: # pragma: no cover
|
|
808
|
+
pass
|
|
809
|
+
|
|
795
810
|
tokenizer_hash_meta = report["meta"].get("tokenizer_hash")
|
|
796
811
|
if not tokenizer_hash_meta:
|
|
797
812
|
dataset_section = report.get("data", {})
|
|
@@ -1518,7 +1533,10 @@ def make_certificate(
|
|
|
1518
1533
|
)
|
|
1519
1534
|
overrides_list = _extract_policy_overrides(report)
|
|
1520
1535
|
resolved_digest = _compute_policy_digest(
|
|
1521
|
-
{
|
|
1536
|
+
{
|
|
1537
|
+
"resolved_policy": resolved_policy,
|
|
1538
|
+
"overrides": overrides_list,
|
|
1539
|
+
}
|
|
1522
1540
|
)
|
|
1523
1541
|
policy_provenance = {
|
|
1524
1542
|
"tier": auto.get("tier", "balanced"),
|
|
@@ -1738,6 +1756,104 @@ def make_certificate(
|
|
|
1738
1756
|
|
|
1739
1757
|
pm_acceptance_range = _resolve_pm_acceptance_range_from_report(report)
|
|
1740
1758
|
|
|
1759
|
+
# Primary metric tail evidence and gate evaluation (ΔlogNLL vs baseline, per-window).
|
|
1760
|
+
pm_tail_result: dict[str, Any] = {}
|
|
1761
|
+
try:
|
|
1762
|
+
pm_kind = None
|
|
1763
|
+
try:
|
|
1764
|
+
pm_block = (
|
|
1765
|
+
report.get("metrics", {}).get("primary_metric")
|
|
1766
|
+
if isinstance(report.get("metrics"), dict)
|
|
1767
|
+
else None
|
|
1768
|
+
)
|
|
1769
|
+
if isinstance(pm_block, dict):
|
|
1770
|
+
pm_kind = pm_block.get("kind")
|
|
1771
|
+
except Exception: # pragma: no cover
|
|
1772
|
+
pm_kind = None
|
|
1773
|
+
|
|
1774
|
+
pm_tail_policy: dict[str, Any] = {}
|
|
1775
|
+
try:
|
|
1776
|
+
metrics_pol = (
|
|
1777
|
+
resolved_policy.get("metrics", {})
|
|
1778
|
+
if isinstance(resolved_policy, dict)
|
|
1779
|
+
else {}
|
|
1780
|
+
)
|
|
1781
|
+
if isinstance(metrics_pol, dict) and isinstance(
|
|
1782
|
+
metrics_pol.get("pm_tail"), dict
|
|
1783
|
+
):
|
|
1784
|
+
pm_tail_policy = dict(metrics_pol.get("pm_tail") or {})
|
|
1785
|
+
except Exception: # pragma: no cover
|
|
1786
|
+
pm_tail_policy = {}
|
|
1787
|
+
|
|
1788
|
+
deltas: list[float] = []
|
|
1789
|
+
weights: list[float] = []
|
|
1790
|
+
if _is_ppl_kind(pm_kind):
|
|
1791
|
+
run_windows = (
|
|
1792
|
+
report.get("evaluation_windows", {}).get("final", {})
|
|
1793
|
+
if isinstance(report.get("evaluation_windows"), dict)
|
|
1794
|
+
else {}
|
|
1795
|
+
)
|
|
1796
|
+
base_windows = (
|
|
1797
|
+
baseline_normalized.get("evaluation_windows", {}).get("final", {})
|
|
1798
|
+
if isinstance(baseline_normalized.get("evaluation_windows"), dict)
|
|
1799
|
+
else {}
|
|
1800
|
+
)
|
|
1801
|
+
run_ids = (
|
|
1802
|
+
run_windows.get("window_ids") if isinstance(run_windows, dict) else None
|
|
1803
|
+
)
|
|
1804
|
+
run_ll = (
|
|
1805
|
+
run_windows.get("logloss") if isinstance(run_windows, dict) else None
|
|
1806
|
+
)
|
|
1807
|
+
run_tc = (
|
|
1808
|
+
run_windows.get("token_counts")
|
|
1809
|
+
if isinstance(run_windows, dict)
|
|
1810
|
+
else None
|
|
1811
|
+
)
|
|
1812
|
+
base_ids = (
|
|
1813
|
+
base_windows.get("window_ids")
|
|
1814
|
+
if isinstance(base_windows, dict)
|
|
1815
|
+
else None
|
|
1816
|
+
)
|
|
1817
|
+
base_ll = (
|
|
1818
|
+
base_windows.get("logloss") if isinstance(base_windows, dict) else None
|
|
1819
|
+
)
|
|
1820
|
+
if (
|
|
1821
|
+
isinstance(run_ids, list)
|
|
1822
|
+
and isinstance(run_ll, list)
|
|
1823
|
+
and isinstance(base_ids, list)
|
|
1824
|
+
and isinstance(base_ll, list)
|
|
1825
|
+
):
|
|
1826
|
+
base_map: dict[int, float] = {}
|
|
1827
|
+
for b_id, b_val in zip(base_ids, base_ll, strict=False):
|
|
1828
|
+
if isinstance(b_id, int | float) and isinstance(b_val, int | float):
|
|
1829
|
+
base_map[int(b_id)] = float(b_val)
|
|
1830
|
+
for idx, (r_id, r_val) in enumerate(zip(run_ids, run_ll, strict=False)):
|
|
1831
|
+
if not (
|
|
1832
|
+
isinstance(r_id, int | float) and isinstance(r_val, int | float)
|
|
1833
|
+
):
|
|
1834
|
+
continue
|
|
1835
|
+
key = int(r_id)
|
|
1836
|
+
if key not in base_map:
|
|
1837
|
+
continue
|
|
1838
|
+
dv = float(r_val) - base_map[key]
|
|
1839
|
+
if math.isfinite(dv):
|
|
1840
|
+
deltas.append(float(dv))
|
|
1841
|
+
if isinstance(run_tc, list) and idx < len(run_tc):
|
|
1842
|
+
try:
|
|
1843
|
+
wv = float(run_tc[idx])
|
|
1844
|
+
except Exception:
|
|
1845
|
+
wv = 0.0
|
|
1846
|
+
weights.append(float(max(wv, 0.0)))
|
|
1847
|
+
|
|
1848
|
+
pm_tail_result = evaluate_metric_tail(
|
|
1849
|
+
deltas=deltas,
|
|
1850
|
+
weights=weights if (weights and len(weights) == len(deltas)) else None,
|
|
1851
|
+
policy=pm_tail_policy,
|
|
1852
|
+
)
|
|
1853
|
+
pm_tail_result["source"] = "paired_baseline.final"
|
|
1854
|
+
except Exception: # pragma: no cover
|
|
1855
|
+
pm_tail_result = {"mode": "warn", "evaluated": False, "passed": True}
|
|
1856
|
+
|
|
1741
1857
|
validation_kwargs = {
|
|
1742
1858
|
"ppl": ppl_analysis,
|
|
1743
1859
|
"spectral": spectral,
|
|
@@ -1765,7 +1881,14 @@ def make_certificate(
|
|
|
1765
1881
|
except Exception: # pragma: no cover - defensive against patched functions
|
|
1766
1882
|
validation_kwargs["pm_acceptance_range"] = pm_acceptance_range
|
|
1767
1883
|
|
|
1884
|
+
try:
|
|
1885
|
+
if "pm_tail" in inspect.signature(_compute_validation_flags).parameters:
|
|
1886
|
+
validation_kwargs["pm_tail"] = pm_tail_result
|
|
1887
|
+
except Exception: # pragma: no cover - defensive against patched functions
|
|
1888
|
+
validation_kwargs["pm_tail"] = pm_tail_result
|
|
1889
|
+
|
|
1768
1890
|
validation_flags = _compute_validation_flags(**validation_kwargs)
|
|
1891
|
+
|
|
1769
1892
|
# Enforce validation key allow-list to prevent surface drift
|
|
1770
1893
|
_allowed_validation = _load_validation_allowlist()
|
|
1771
1894
|
validation_filtered = {
|
|
@@ -1797,6 +1920,7 @@ def make_certificate(
|
|
|
1797
1920
|
"artifacts": artifacts_payload,
|
|
1798
1921
|
"validation": validation_filtered,
|
|
1799
1922
|
"guard_overhead": guard_overhead_section,
|
|
1923
|
+
"primary_metric_tail": pm_tail_result,
|
|
1800
1924
|
}
|
|
1801
1925
|
|
|
1802
1926
|
# Record tiny-relax provenance explicitly when active (dev-only demos)
|
|
@@ -2048,7 +2172,49 @@ def make_certificate(
|
|
|
2048
2172
|
except Exception: # pragma: no cover
|
|
2049
2173
|
pass
|
|
2050
2174
|
|
|
2051
|
-
#
|
|
2175
|
+
# Attach/normalize primary metric block (moved to helper)
|
|
2176
|
+
from .primary_metric_utils import attach_primary_metric as _attach_pm
|
|
2177
|
+
|
|
2178
|
+
_attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
|
|
2179
|
+
_enforce_display_ci_alignment(
|
|
2180
|
+
ratio_ci_source,
|
|
2181
|
+
certificate.get("primary_metric"),
|
|
2182
|
+
logloss_delta_ci,
|
|
2183
|
+
window_plan_profile,
|
|
2184
|
+
)
|
|
2185
|
+
|
|
2186
|
+
# Ensure primary_metric has display_ci populated for schema invariants
|
|
2187
|
+
try:
|
|
2188
|
+
pm = (
|
|
2189
|
+
certificate.get("primary_metric", {})
|
|
2190
|
+
if isinstance(certificate.get("primary_metric"), dict)
|
|
2191
|
+
else None
|
|
2192
|
+
)
|
|
2193
|
+
if isinstance(pm, dict) and pm:
|
|
2194
|
+
# Prefer existing bounds; otherwise collapse to point estimate
|
|
2195
|
+
disp = pm.get("display_ci")
|
|
2196
|
+
if not (
|
|
2197
|
+
isinstance(disp, list | tuple)
|
|
2198
|
+
and len(disp) == 2
|
|
2199
|
+
and all(isinstance(x, int | float) for x in disp)
|
|
2200
|
+
):
|
|
2201
|
+
point = None
|
|
2202
|
+
for key in ("ratio_vs_baseline", "final", "preview"):
|
|
2203
|
+
val = pm.get(key)
|
|
2204
|
+
if isinstance(val, int | float) and math.isfinite(float(val)):
|
|
2205
|
+
point = float(val)
|
|
2206
|
+
break
|
|
2207
|
+
if isinstance(point, float):
|
|
2208
|
+
pm["display_ci"] = [point, point]
|
|
2209
|
+
else:
|
|
2210
|
+
# As last resort, emit a degenerate [1.0, 1.0] to satisfy schema invariants
|
|
2211
|
+
pm["display_ci"] = [1.0, 1.0]
|
|
2212
|
+
pm.setdefault("estimated", True)
|
|
2213
|
+
except Exception: # pragma: no cover
|
|
2214
|
+
pass
|
|
2215
|
+
|
|
2216
|
+
# Emit optional one-line telemetry summary (opt-in via INVARLOCK_TELEMETRY=1).
|
|
2217
|
+
# This runs after primary_metric attachment so the summary can include display_ci/width.
|
|
2052
2218
|
try:
|
|
2053
2219
|
kind = None
|
|
2054
2220
|
pm_try = (
|
|
@@ -2135,46 +2301,6 @@ def make_certificate(
|
|
|
2135
2301
|
except Exception: # pragma: no cover
|
|
2136
2302
|
pass
|
|
2137
2303
|
|
|
2138
|
-
# Attach/normalize primary metric block (moved to helper)
|
|
2139
|
-
from .primary_metric_utils import attach_primary_metric as _attach_pm
|
|
2140
|
-
|
|
2141
|
-
_attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
|
|
2142
|
-
_enforce_display_ci_alignment(
|
|
2143
|
-
ratio_ci_source,
|
|
2144
|
-
certificate.get("primary_metric"),
|
|
2145
|
-
logloss_delta_ci,
|
|
2146
|
-
window_plan_profile,
|
|
2147
|
-
)
|
|
2148
|
-
|
|
2149
|
-
# Ensure primary_metric has display_ci populated for schema invariants
|
|
2150
|
-
try:
|
|
2151
|
-
pm = (
|
|
2152
|
-
certificate.get("primary_metric", {})
|
|
2153
|
-
if isinstance(certificate.get("primary_metric"), dict)
|
|
2154
|
-
else None
|
|
2155
|
-
)
|
|
2156
|
-
if isinstance(pm, dict) and pm:
|
|
2157
|
-
# Prefer existing bounds; otherwise collapse to point estimate
|
|
2158
|
-
disp = pm.get("display_ci")
|
|
2159
|
-
if not (
|
|
2160
|
-
isinstance(disp, list | tuple)
|
|
2161
|
-
and len(disp) == 2
|
|
2162
|
-
and all(isinstance(x, int | float) for x in disp)
|
|
2163
|
-
):
|
|
2164
|
-
point = None
|
|
2165
|
-
for key in ("ratio_vs_baseline", "final", "preview"):
|
|
2166
|
-
val = pm.get(key)
|
|
2167
|
-
if isinstance(val, int | float) and math.isfinite(float(val)):
|
|
2168
|
-
point = float(val)
|
|
2169
|
-
break
|
|
2170
|
-
if isinstance(point, float):
|
|
2171
|
-
pm["display_ci"] = [point, point]
|
|
2172
|
-
else:
|
|
2173
|
-
# As last resort, emit a degenerate [1.0, 1.0] to satisfy schema invariants
|
|
2174
|
-
pm["display_ci"] = [1.0, 1.0]
|
|
2175
|
-
except Exception: # pragma: no cover
|
|
2176
|
-
pass
|
|
2177
|
-
|
|
2178
2304
|
# Attach confidence label (non-gating)
|
|
2179
2305
|
try:
|
|
2180
2306
|
certificate["confidence"] = _compute_confidence_label(certificate)
|
|
@@ -2208,7 +2334,7 @@ def _normalize_baseline(baseline: RunReport | dict[str, Any]) -> dict[str, Any]:
|
|
|
2208
2334
|
}
|
|
2209
2335
|
# Check if it's a RunReport structure
|
|
2210
2336
|
elif "meta" in baseline and "metrics" in baseline and "edit" in baseline:
|
|
2211
|
-
# Accept both
|
|
2337
|
+
# Accept both ppl_* metrics and PM-first reports
|
|
2212
2338
|
metrics_blk = baseline.get("metrics", {}) or {}
|
|
2213
2339
|
ppl_final = metrics_blk.get("ppl_final")
|
|
2214
2340
|
ppl_preview = metrics_blk.get("ppl_preview")
|
|
@@ -2483,7 +2609,7 @@ def _extract_edit_metadata(
|
|
|
2483
2609
|
algorithm = edit_section.get("algorithm")
|
|
2484
2610
|
if not algorithm:
|
|
2485
2611
|
algorithm = edit_name or ""
|
|
2486
|
-
# Sanitize algorithm identifiers to purge
|
|
2612
|
+
# Sanitize algorithm identifiers to purge unsupported edit labels
|
|
2487
2613
|
try:
|
|
2488
2614
|
alg_lower = str(algorithm).strip().lower()
|
|
2489
2615
|
except Exception: # pragma: no cover
|
|
@@ -3112,6 +3238,7 @@ def _compute_validation_flags(
|
|
|
3112
3238
|
moe: dict[str, Any] | None = None,
|
|
3113
3239
|
dataset_capacity: dict[str, Any] | None = None,
|
|
3114
3240
|
pm_acceptance_range: dict[str, float] | None = None,
|
|
3241
|
+
pm_tail: dict[str, Any] | None = None,
|
|
3115
3242
|
) -> dict[str, bool]:
|
|
3116
3243
|
"""Compute validation flags for the certificate including canonical gates."""
|
|
3117
3244
|
tier = (tier or "balanced").lower()
|
|
@@ -3223,6 +3350,45 @@ def _compute_validation_flags(
|
|
|
3223
3350
|
except Exception: # pragma: no cover
|
|
3224
3351
|
pass
|
|
3225
3352
|
tokens_ok = total_tokens >= eff_min_tokens
|
|
3353
|
+
if not tokens_ok:
|
|
3354
|
+
coverage_ok = False
|
|
3355
|
+
try:
|
|
3356
|
+
coverage = _ppl_metrics.get("bootstrap", {}).get("coverage")
|
|
3357
|
+
if isinstance(coverage, dict):
|
|
3358
|
+
prev_cov = coverage.get("preview")
|
|
3359
|
+
fin_cov = coverage.get("final")
|
|
3360
|
+
if isinstance(prev_cov, dict) and isinstance(fin_cov, dict):
|
|
3361
|
+
prev_used = prev_cov.get("used")
|
|
3362
|
+
prev_req = prev_cov.get("required")
|
|
3363
|
+
fin_used = fin_cov.get("used")
|
|
3364
|
+
fin_req = fin_cov.get("required")
|
|
3365
|
+
prev_ok = bool(prev_cov.get("ok")) or (
|
|
3366
|
+
isinstance(prev_used, int | float)
|
|
3367
|
+
and isinstance(prev_req, int | float)
|
|
3368
|
+
and float(prev_used) >= float(prev_req)
|
|
3369
|
+
)
|
|
3370
|
+
fin_ok = bool(fin_cov.get("ok")) or (
|
|
3371
|
+
isinstance(fin_used, int | float)
|
|
3372
|
+
and isinstance(fin_req, int | float)
|
|
3373
|
+
and float(fin_used) >= float(fin_req)
|
|
3374
|
+
)
|
|
3375
|
+
coverage_ok = prev_ok and fin_ok
|
|
3376
|
+
except Exception: # pragma: no cover
|
|
3377
|
+
coverage_ok = False
|
|
3378
|
+
|
|
3379
|
+
if coverage_ok:
|
|
3380
|
+
try:
|
|
3381
|
+
tolerance_ratio = float(
|
|
3382
|
+
pm_policy.get("min_tokens_tolerance", 0.02) or 0.0
|
|
3383
|
+
)
|
|
3384
|
+
except Exception:
|
|
3385
|
+
tolerance_ratio = 0.0
|
|
3386
|
+
if tolerance_ratio < 0.0:
|
|
3387
|
+
tolerance_ratio = 0.0
|
|
3388
|
+
relaxed_floor = int(
|
|
3389
|
+
math.floor(float(eff_min_tokens) * (1.0 - tolerance_ratio))
|
|
3390
|
+
)
|
|
3391
|
+
tokens_ok = total_tokens >= max(relaxed_floor, 0)
|
|
3226
3392
|
except Exception: # pragma: no cover
|
|
3227
3393
|
tokens_ok = True
|
|
3228
3394
|
# Under tiny_relax, treat token floors as informational only
|
|
@@ -3416,6 +3582,19 @@ def _compute_validation_flags(
|
|
|
3416
3582
|
except Exception: # pragma: no cover
|
|
3417
3583
|
pass
|
|
3418
3584
|
|
|
3585
|
+
# Primary metric tail gate (warn/fail; default non-blocking)
|
|
3586
|
+
try:
|
|
3587
|
+
tail_ok = True
|
|
3588
|
+
if isinstance(pm_tail, dict) and pm_tail:
|
|
3589
|
+
mode = str(pm_tail.get("mode", "warn") or "warn").strip().lower()
|
|
3590
|
+
evaluated = bool(pm_tail.get("evaluated", False))
|
|
3591
|
+
passed = bool(pm_tail.get("passed", True))
|
|
3592
|
+
if mode == "fail" and evaluated and (not passed):
|
|
3593
|
+
tail_ok = False
|
|
3594
|
+
flags["primary_metric_tail_acceptable"] = bool(tail_ok)
|
|
3595
|
+
except Exception: # pragma: no cover
|
|
3596
|
+
flags["primary_metric_tail_acceptable"] = True
|
|
3597
|
+
|
|
3419
3598
|
return flags
|
|
3420
3599
|
|
|
3421
3600
|
|
|
@@ -148,6 +148,7 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
|
|
|
148
148
|
|
|
149
149
|
_VALIDATION_ALLOWLIST_DEFAULT = {
|
|
150
150
|
"primary_metric_acceptable",
|
|
151
|
+
"primary_metric_tail_acceptable",
|
|
151
152
|
"preview_final_drift_acceptable",
|
|
152
153
|
"guard_overhead_acceptable",
|
|
153
154
|
"invariants_pass",
|
|
@@ -181,7 +182,7 @@ def _load_validation_allowlist() -> set[str]:
|
|
|
181
182
|
def _validate_with_jsonschema(certificate: dict[str, Any]) -> bool:
|
|
182
183
|
"""Validate certificate with JSON Schema when available."""
|
|
183
184
|
if jsonschema is None:
|
|
184
|
-
return True # Schema library unavailable; fall back to
|
|
185
|
+
return True # Schema library unavailable; fall back to minimal checks
|
|
185
186
|
try:
|
|
186
187
|
jsonschema.validate(instance=certificate, schema=CERTIFICATE_JSON_SCHEMA)
|
|
187
188
|
return True
|
|
@@ -1,8 +1,21 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import Any
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from ..eval.data import EvaluationWindow
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def compute_window_hash(window: EvaluationWindow, *, include_data: bool) -> str:
|
|
10
|
+
"""Lazy wrapper around `invarlock.eval.data.compute_window_hash`.
|
|
11
|
+
|
|
12
|
+
Importing `invarlock.eval.data` pulls in optional heavy deps (HF datasets /
|
|
13
|
+
pyarrow). Keep that import off the module import path so that lightweight
|
|
14
|
+
reporting/helpers can be used without eagerly importing those deps.
|
|
15
|
+
"""
|
|
16
|
+
from ..eval.data import compute_window_hash as _compute_window_hash
|
|
17
|
+
|
|
18
|
+
return _compute_window_hash(window, include_data=include_data)
|
|
6
19
|
|
|
7
20
|
|
|
8
21
|
def compute_window_hashes(
|