invarlock 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +2 -2
- invarlock/_data/runtime/tiers.yaml +57 -30
- invarlock/adapters/__init__.py +11 -15
- invarlock/adapters/auto.py +35 -40
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_mixin.py +25 -4
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/calibration/spectral_null.py +15 -10
- invarlock/calibration/variance_ve.py +0 -2
- invarlock/cli/adapter_auto.py +31 -21
- invarlock/cli/app.py +73 -2
- invarlock/cli/commands/calibrate.py +6 -2
- invarlock/cli/commands/certify.py +651 -91
- invarlock/cli/commands/doctor.py +11 -11
- invarlock/cli/commands/explain_gates.py +57 -8
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +233 -69
- invarlock/cli/commands/run.py +1066 -244
- invarlock/cli/commands/verify.py +154 -15
- invarlock/cli/config.py +22 -6
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +1 -1
- invarlock/core/api.py +45 -5
- invarlock/core/auto_tuning.py +65 -20
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/contracts.py +7 -1
- invarlock/core/registry.py +11 -13
- invarlock/core/runner.py +425 -75
- invarlock/edits/quant_rtn.py +65 -37
- invarlock/eval/bench.py +3 -16
- invarlock/eval/data.py +82 -51
- invarlock/eval/metrics.py +63 -2
- invarlock/eval/primary_metric.py +23 -0
- invarlock/eval/tail_stats.py +230 -0
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/_estimators.py +154 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/policies.py +16 -6
- invarlock/guards/rmt.py +627 -546
- invarlock/guards/spectral.py +348 -110
- invarlock/guards/tier_config.py +32 -30
- invarlock/guards/variance.py +7 -31
- invarlock/guards_ref/rmt_ref.py +23 -23
- invarlock/model_profile.py +90 -42
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/reporting/certificate.py +384 -55
- invarlock/reporting/certificate_schema.py +3 -2
- invarlock/reporting/dataset_hashing.py +15 -2
- invarlock/reporting/guards_analysis.py +350 -277
- invarlock/reporting/html.py +55 -5
- invarlock/reporting/normalizer.py +13 -0
- invarlock/reporting/policy_utils.py +38 -36
- invarlock/reporting/primary_metric_utils.py +71 -17
- invarlock/reporting/render.py +852 -431
- invarlock/reporting/report.py +40 -4
- invarlock/reporting/report_types.py +11 -3
- invarlock/reporting/telemetry.py +86 -0
- invarlock/reporting/validate.py +1 -18
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/METADATA +27 -13
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/RECORD +72 -65
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
"""
|
|
2
|
-
InvarLock
|
|
3
|
-
|
|
2
|
+
InvarLock Evaluation Certificate Generation
|
|
3
|
+
==========================================
|
|
4
4
|
|
|
5
|
-
Generate standardized
|
|
5
|
+
Generate standardized evaluation certificates from RunReport and baseline
|
|
6
|
+
comparison.
|
|
6
7
|
Certificates are standalone, portable verification artifacts that can be used
|
|
7
8
|
for CI/CD gates and regulatory compliance.
|
|
8
9
|
"""
|
|
@@ -35,6 +36,7 @@ from invarlock.core.bootstrap import (
|
|
|
35
36
|
logspace_to_ratio_ci,
|
|
36
37
|
)
|
|
37
38
|
from invarlock.eval.primary_metric import compute_primary_metric_from_report, get_metric
|
|
39
|
+
from invarlock.eval.tail_stats import evaluate_metric_tail
|
|
38
40
|
from invarlock.utils.digest import hash_json
|
|
39
41
|
|
|
40
42
|
from . import certificate_schema as _cert_schema
|
|
@@ -81,7 +83,7 @@ TIER_RATIO_LIMITS: dict[str, float] = {
|
|
|
81
83
|
def _is_ppl_kind(name: Any) -> bool:
|
|
82
84
|
"""Return True if a primary_metric kind denotes a ppl-like metric.
|
|
83
85
|
|
|
84
|
-
Supports
|
|
86
|
+
Supports alternate names to stay resilient across schema variants.
|
|
85
87
|
"""
|
|
86
88
|
try:
|
|
87
89
|
n = str(name or "").lower()
|
|
@@ -100,7 +102,7 @@ def _is_ppl_kind(name: Any) -> bool:
|
|
|
100
102
|
}
|
|
101
103
|
|
|
102
104
|
|
|
103
|
-
## NOTE: Deprecated
|
|
105
|
+
## NOTE: Deprecated helper `_get_ppl_final` was removed; callers should
|
|
104
106
|
## use the normalized primary_metric block directly via make_certificate or
|
|
105
107
|
## report processing utilities.
|
|
106
108
|
|
|
@@ -391,6 +393,7 @@ def _compute_thresholds_hash(payload: dict[str, Any]) -> str:
|
|
|
391
393
|
# Allow-list loader with safe defaults for validation keys
|
|
392
394
|
_VALIDATION_ALLOWLIST_DEFAULT = {
|
|
393
395
|
"primary_metric_acceptable",
|
|
396
|
+
"primary_metric_tail_acceptable",
|
|
394
397
|
"preview_final_drift_acceptable",
|
|
395
398
|
"guard_overhead_acceptable",
|
|
396
399
|
"invariants_pass",
|
|
@@ -741,7 +744,7 @@ def make_certificate(
|
|
|
741
744
|
baseline: RunReport | dict[str, Any],
|
|
742
745
|
) -> dict[str, Any]:
|
|
743
746
|
"""
|
|
744
|
-
Generate
|
|
747
|
+
Generate an evaluation certificate from a RunReport and baseline comparison.
|
|
745
748
|
|
|
746
749
|
The certificate is a standalone, portable artifact that contains all
|
|
747
750
|
essential metrics and comparisons needed for safety verification.
|
|
@@ -762,6 +765,17 @@ def make_certificate(
|
|
|
762
765
|
# Normalize baseline input
|
|
763
766
|
baseline_raw = baseline
|
|
764
767
|
baseline_normalized = _normalize_baseline(baseline_raw)
|
|
768
|
+
baseline_report: RunReport | None = None
|
|
769
|
+
try:
|
|
770
|
+
if (
|
|
771
|
+
isinstance(baseline_raw, dict)
|
|
772
|
+
and "meta" in baseline_raw
|
|
773
|
+
and "metrics" in baseline_raw
|
|
774
|
+
and "edit" in baseline_raw
|
|
775
|
+
):
|
|
776
|
+
baseline_report = _normalize_and_validate_report(baseline_raw)
|
|
777
|
+
except Exception: # pragma: no cover - baseline compare is best-effort
|
|
778
|
+
baseline_report = None
|
|
765
779
|
|
|
766
780
|
# Extract core metadata with full seed bundle
|
|
767
781
|
meta = _extract_certificate_meta(report)
|
|
@@ -792,6 +806,19 @@ def make_certificate(
|
|
|
792
806
|
except Exception: # pragma: no cover
|
|
793
807
|
pass
|
|
794
808
|
|
|
809
|
+
# Execution profile provenance when available via run context.
|
|
810
|
+
try:
|
|
811
|
+
ctx = report.get("context") if isinstance(report, dict) else None
|
|
812
|
+
ctx_profile = (
|
|
813
|
+
str(ctx.get("profile") or "").strip().lower()
|
|
814
|
+
if isinstance(ctx, dict)
|
|
815
|
+
else ""
|
|
816
|
+
)
|
|
817
|
+
if ctx_profile:
|
|
818
|
+
meta["profile"] = ctx_profile
|
|
819
|
+
except Exception: # pragma: no cover
|
|
820
|
+
pass
|
|
821
|
+
|
|
795
822
|
tokenizer_hash_meta = report["meta"].get("tokenizer_hash")
|
|
796
823
|
if not tokenizer_hash_meta:
|
|
797
824
|
dataset_section = report.get("data", {})
|
|
@@ -1425,7 +1452,7 @@ def make_certificate(
|
|
|
1425
1452
|
ppl_analysis["window_plan"] = window_plan_ctx
|
|
1426
1453
|
|
|
1427
1454
|
# Extract invariant status
|
|
1428
|
-
invariants = _extract_invariants(report)
|
|
1455
|
+
invariants = _extract_invariants(report, baseline=baseline_report)
|
|
1429
1456
|
|
|
1430
1457
|
# Extract spectral analysis
|
|
1431
1458
|
spectral = _extract_spectral_analysis(report, baseline_normalized)
|
|
@@ -1518,7 +1545,10 @@ def make_certificate(
|
|
|
1518
1545
|
)
|
|
1519
1546
|
overrides_list = _extract_policy_overrides(report)
|
|
1520
1547
|
resolved_digest = _compute_policy_digest(
|
|
1521
|
-
{
|
|
1548
|
+
{
|
|
1549
|
+
"resolved_policy": resolved_policy,
|
|
1550
|
+
"overrides": overrides_list,
|
|
1551
|
+
}
|
|
1522
1552
|
)
|
|
1523
1553
|
policy_provenance = {
|
|
1524
1554
|
"tier": auto.get("tier", "balanced"),
|
|
@@ -1540,7 +1570,13 @@ def make_certificate(
|
|
|
1540
1570
|
telemetry: dict[str, Any] = {}
|
|
1541
1571
|
metrics_section = report.get("metrics", {})
|
|
1542
1572
|
if isinstance(metrics_section, dict):
|
|
1543
|
-
for key in (
|
|
1573
|
+
for key in (
|
|
1574
|
+
"latency_ms_per_tok",
|
|
1575
|
+
"memory_mb_peak",
|
|
1576
|
+
"gpu_memory_mb_peak",
|
|
1577
|
+
"gpu_memory_reserved_mb_peak",
|
|
1578
|
+
"throughput_tok_per_s",
|
|
1579
|
+
):
|
|
1544
1580
|
value = metrics_section.get(key)
|
|
1545
1581
|
if isinstance(value, int | float) and math.isfinite(value):
|
|
1546
1582
|
telemetry[key] = float(value)
|
|
@@ -1737,6 +1773,105 @@ def make_certificate(
|
|
|
1737
1773
|
capacity_examples = None
|
|
1738
1774
|
|
|
1739
1775
|
pm_acceptance_range = _resolve_pm_acceptance_range_from_report(report)
|
|
1776
|
+
pm_drift_band = _resolve_pm_drift_band_from_report(report)
|
|
1777
|
+
|
|
1778
|
+
# Primary metric tail evidence and gate evaluation (ΔlogNLL vs baseline, per-window).
|
|
1779
|
+
pm_tail_result: dict[str, Any] = {}
|
|
1780
|
+
try:
|
|
1781
|
+
pm_kind = None
|
|
1782
|
+
try:
|
|
1783
|
+
pm_block = (
|
|
1784
|
+
report.get("metrics", {}).get("primary_metric")
|
|
1785
|
+
if isinstance(report.get("metrics"), dict)
|
|
1786
|
+
else None
|
|
1787
|
+
)
|
|
1788
|
+
if isinstance(pm_block, dict):
|
|
1789
|
+
pm_kind = pm_block.get("kind")
|
|
1790
|
+
except Exception: # pragma: no cover
|
|
1791
|
+
pm_kind = None
|
|
1792
|
+
|
|
1793
|
+
pm_tail_policy: dict[str, Any] = {}
|
|
1794
|
+
try:
|
|
1795
|
+
metrics_pol = (
|
|
1796
|
+
resolved_policy.get("metrics", {})
|
|
1797
|
+
if isinstance(resolved_policy, dict)
|
|
1798
|
+
else {}
|
|
1799
|
+
)
|
|
1800
|
+
if isinstance(metrics_pol, dict) and isinstance(
|
|
1801
|
+
metrics_pol.get("pm_tail"), dict
|
|
1802
|
+
):
|
|
1803
|
+
pm_tail_policy = dict(metrics_pol.get("pm_tail") or {})
|
|
1804
|
+
except Exception: # pragma: no cover
|
|
1805
|
+
pm_tail_policy = {}
|
|
1806
|
+
|
|
1807
|
+
deltas: list[float] = []
|
|
1808
|
+
weights: list[float] = []
|
|
1809
|
+
if _is_ppl_kind(pm_kind):
|
|
1810
|
+
run_windows = (
|
|
1811
|
+
report.get("evaluation_windows", {}).get("final", {})
|
|
1812
|
+
if isinstance(report.get("evaluation_windows"), dict)
|
|
1813
|
+
else {}
|
|
1814
|
+
)
|
|
1815
|
+
base_windows = (
|
|
1816
|
+
baseline_normalized.get("evaluation_windows", {}).get("final", {})
|
|
1817
|
+
if isinstance(baseline_normalized.get("evaluation_windows"), dict)
|
|
1818
|
+
else {}
|
|
1819
|
+
)
|
|
1820
|
+
run_ids = (
|
|
1821
|
+
run_windows.get("window_ids") if isinstance(run_windows, dict) else None
|
|
1822
|
+
)
|
|
1823
|
+
run_ll = (
|
|
1824
|
+
run_windows.get("logloss") if isinstance(run_windows, dict) else None
|
|
1825
|
+
)
|
|
1826
|
+
run_tc = (
|
|
1827
|
+
run_windows.get("token_counts")
|
|
1828
|
+
if isinstance(run_windows, dict)
|
|
1829
|
+
else None
|
|
1830
|
+
)
|
|
1831
|
+
base_ids = (
|
|
1832
|
+
base_windows.get("window_ids")
|
|
1833
|
+
if isinstance(base_windows, dict)
|
|
1834
|
+
else None
|
|
1835
|
+
)
|
|
1836
|
+
base_ll = (
|
|
1837
|
+
base_windows.get("logloss") if isinstance(base_windows, dict) else None
|
|
1838
|
+
)
|
|
1839
|
+
if (
|
|
1840
|
+
isinstance(run_ids, list)
|
|
1841
|
+
and isinstance(run_ll, list)
|
|
1842
|
+
and isinstance(base_ids, list)
|
|
1843
|
+
and isinstance(base_ll, list)
|
|
1844
|
+
):
|
|
1845
|
+
base_map: dict[int, float] = {}
|
|
1846
|
+
for b_id, b_val in zip(base_ids, base_ll, strict=False):
|
|
1847
|
+
if isinstance(b_id, int | float) and isinstance(b_val, int | float):
|
|
1848
|
+
base_map[int(b_id)] = float(b_val)
|
|
1849
|
+
for idx, (r_id, r_val) in enumerate(zip(run_ids, run_ll, strict=False)):
|
|
1850
|
+
if not (
|
|
1851
|
+
isinstance(r_id, int | float) and isinstance(r_val, int | float)
|
|
1852
|
+
):
|
|
1853
|
+
continue
|
|
1854
|
+
key = int(r_id)
|
|
1855
|
+
if key not in base_map:
|
|
1856
|
+
continue
|
|
1857
|
+
dv = float(r_val) - base_map[key]
|
|
1858
|
+
if math.isfinite(dv):
|
|
1859
|
+
deltas.append(float(dv))
|
|
1860
|
+
if isinstance(run_tc, list) and idx < len(run_tc):
|
|
1861
|
+
try:
|
|
1862
|
+
wv = float(run_tc[idx])
|
|
1863
|
+
except Exception:
|
|
1864
|
+
wv = 0.0
|
|
1865
|
+
weights.append(float(max(wv, 0.0)))
|
|
1866
|
+
|
|
1867
|
+
pm_tail_result = evaluate_metric_tail(
|
|
1868
|
+
deltas=deltas,
|
|
1869
|
+
weights=weights if (weights and len(weights) == len(deltas)) else None,
|
|
1870
|
+
policy=pm_tail_policy,
|
|
1871
|
+
)
|
|
1872
|
+
pm_tail_result["source"] = "paired_baseline.final"
|
|
1873
|
+
except Exception: # pragma: no cover
|
|
1874
|
+
pm_tail_result = {"mode": "warn", "evaluated": False, "passed": True}
|
|
1740
1875
|
|
|
1741
1876
|
validation_kwargs = {
|
|
1742
1877
|
"ppl": ppl_analysis,
|
|
@@ -1765,7 +1900,20 @@ def make_certificate(
|
|
|
1765
1900
|
except Exception: # pragma: no cover - defensive against patched functions
|
|
1766
1901
|
validation_kwargs["pm_acceptance_range"] = pm_acceptance_range
|
|
1767
1902
|
|
|
1903
|
+
try:
|
|
1904
|
+
if "pm_drift_band" in inspect.signature(_compute_validation_flags).parameters:
|
|
1905
|
+
validation_kwargs["pm_drift_band"] = pm_drift_band
|
|
1906
|
+
except Exception: # pragma: no cover - defensive against patched functions
|
|
1907
|
+
validation_kwargs["pm_drift_band"] = pm_drift_band
|
|
1908
|
+
|
|
1909
|
+
try:
|
|
1910
|
+
if "pm_tail" in inspect.signature(_compute_validation_flags).parameters:
|
|
1911
|
+
validation_kwargs["pm_tail"] = pm_tail_result
|
|
1912
|
+
except Exception: # pragma: no cover - defensive against patched functions
|
|
1913
|
+
validation_kwargs["pm_tail"] = pm_tail_result
|
|
1914
|
+
|
|
1768
1915
|
validation_flags = _compute_validation_flags(**validation_kwargs)
|
|
1916
|
+
|
|
1769
1917
|
# Enforce validation key allow-list to prevent surface drift
|
|
1770
1918
|
_allowed_validation = _load_validation_allowlist()
|
|
1771
1919
|
validation_filtered = {
|
|
@@ -1797,6 +1945,7 @@ def make_certificate(
|
|
|
1797
1945
|
"artifacts": artifacts_payload,
|
|
1798
1946
|
"validation": validation_filtered,
|
|
1799
1947
|
"guard_overhead": guard_overhead_section,
|
|
1948
|
+
"primary_metric_tail": pm_tail_result,
|
|
1800
1949
|
}
|
|
1801
1950
|
|
|
1802
1951
|
# Record tiny-relax provenance explicitly when active (dev-only demos)
|
|
@@ -2048,7 +2197,56 @@ def make_certificate(
|
|
|
2048
2197
|
except Exception: # pragma: no cover
|
|
2049
2198
|
pass
|
|
2050
2199
|
|
|
2051
|
-
#
|
|
2200
|
+
# Attach/normalize primary metric block (moved to helper)
|
|
2201
|
+
from .primary_metric_utils import attach_primary_metric as _attach_pm
|
|
2202
|
+
|
|
2203
|
+
_attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
|
|
2204
|
+
try:
|
|
2205
|
+
if isinstance(pm_drift_band, dict) and pm_drift_band:
|
|
2206
|
+
pm_block = certificate.get("primary_metric")
|
|
2207
|
+
if isinstance(pm_block, dict):
|
|
2208
|
+
pm_block.setdefault("drift_band", dict(pm_drift_band))
|
|
2209
|
+
except Exception: # pragma: no cover
|
|
2210
|
+
pass
|
|
2211
|
+
_enforce_display_ci_alignment(
|
|
2212
|
+
ratio_ci_source,
|
|
2213
|
+
certificate.get("primary_metric"),
|
|
2214
|
+
logloss_delta_ci,
|
|
2215
|
+
window_plan_profile,
|
|
2216
|
+
)
|
|
2217
|
+
|
|
2218
|
+
# Ensure primary_metric has display_ci populated for schema invariants
|
|
2219
|
+
try:
|
|
2220
|
+
pm = (
|
|
2221
|
+
certificate.get("primary_metric", {})
|
|
2222
|
+
if isinstance(certificate.get("primary_metric"), dict)
|
|
2223
|
+
else None
|
|
2224
|
+
)
|
|
2225
|
+
if isinstance(pm, dict) and pm:
|
|
2226
|
+
# Prefer existing bounds; otherwise collapse to point estimate
|
|
2227
|
+
disp = pm.get("display_ci")
|
|
2228
|
+
if not (
|
|
2229
|
+
isinstance(disp, list | tuple)
|
|
2230
|
+
and len(disp) == 2
|
|
2231
|
+
and all(isinstance(x, int | float) for x in disp)
|
|
2232
|
+
):
|
|
2233
|
+
point = None
|
|
2234
|
+
for key in ("ratio_vs_baseline", "final", "preview"):
|
|
2235
|
+
val = pm.get(key)
|
|
2236
|
+
if isinstance(val, int | float) and math.isfinite(float(val)):
|
|
2237
|
+
point = float(val)
|
|
2238
|
+
break
|
|
2239
|
+
if isinstance(point, float):
|
|
2240
|
+
pm["display_ci"] = [point, point]
|
|
2241
|
+
else:
|
|
2242
|
+
# As last resort, emit a degenerate [1.0, 1.0] to satisfy schema invariants
|
|
2243
|
+
pm["display_ci"] = [1.0, 1.0]
|
|
2244
|
+
pm.setdefault("estimated", True)
|
|
2245
|
+
except Exception: # pragma: no cover
|
|
2246
|
+
pass
|
|
2247
|
+
|
|
2248
|
+
# Emit optional one-line telemetry summary (opt-in via INVARLOCK_TELEMETRY=1).
|
|
2249
|
+
# This runs after primary_metric attachment so the summary can include display_ci/width.
|
|
2052
2250
|
try:
|
|
2053
2251
|
kind = None
|
|
2054
2252
|
pm_try = (
|
|
@@ -2135,46 +2333,6 @@ def make_certificate(
|
|
|
2135
2333
|
except Exception: # pragma: no cover
|
|
2136
2334
|
pass
|
|
2137
2335
|
|
|
2138
|
-
# Attach/normalize primary metric block (moved to helper)
|
|
2139
|
-
from .primary_metric_utils import attach_primary_metric as _attach_pm
|
|
2140
|
-
|
|
2141
|
-
_attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
|
|
2142
|
-
_enforce_display_ci_alignment(
|
|
2143
|
-
ratio_ci_source,
|
|
2144
|
-
certificate.get("primary_metric"),
|
|
2145
|
-
logloss_delta_ci,
|
|
2146
|
-
window_plan_profile,
|
|
2147
|
-
)
|
|
2148
|
-
|
|
2149
|
-
# Ensure primary_metric has display_ci populated for schema invariants
|
|
2150
|
-
try:
|
|
2151
|
-
pm = (
|
|
2152
|
-
certificate.get("primary_metric", {})
|
|
2153
|
-
if isinstance(certificate.get("primary_metric"), dict)
|
|
2154
|
-
else None
|
|
2155
|
-
)
|
|
2156
|
-
if isinstance(pm, dict) and pm:
|
|
2157
|
-
# Prefer existing bounds; otherwise collapse to point estimate
|
|
2158
|
-
disp = pm.get("display_ci")
|
|
2159
|
-
if not (
|
|
2160
|
-
isinstance(disp, list | tuple)
|
|
2161
|
-
and len(disp) == 2
|
|
2162
|
-
and all(isinstance(x, int | float) for x in disp)
|
|
2163
|
-
):
|
|
2164
|
-
point = None
|
|
2165
|
-
for key in ("ratio_vs_baseline", "final", "preview"):
|
|
2166
|
-
val = pm.get(key)
|
|
2167
|
-
if isinstance(val, int | float) and math.isfinite(float(val)):
|
|
2168
|
-
point = float(val)
|
|
2169
|
-
break
|
|
2170
|
-
if isinstance(point, float):
|
|
2171
|
-
pm["display_ci"] = [point, point]
|
|
2172
|
-
else:
|
|
2173
|
-
# As last resort, emit a degenerate [1.0, 1.0] to satisfy schema invariants
|
|
2174
|
-
pm["display_ci"] = [1.0, 1.0]
|
|
2175
|
-
except Exception: # pragma: no cover
|
|
2176
|
-
pass
|
|
2177
|
-
|
|
2178
2336
|
# Attach confidence label (non-gating)
|
|
2179
2337
|
try:
|
|
2180
2338
|
certificate["confidence"] = _compute_confidence_label(certificate)
|
|
@@ -2208,7 +2366,7 @@ def _normalize_baseline(baseline: RunReport | dict[str, Any]) -> dict[str, Any]:
|
|
|
2208
2366
|
}
|
|
2209
2367
|
# Check if it's a RunReport structure
|
|
2210
2368
|
elif "meta" in baseline and "metrics" in baseline and "edit" in baseline:
|
|
2211
|
-
# Accept both
|
|
2369
|
+
# Accept both ppl_* metrics and PM-first reports
|
|
2212
2370
|
metrics_blk = baseline.get("metrics", {}) or {}
|
|
2213
2371
|
ppl_final = metrics_blk.get("ppl_final")
|
|
2214
2372
|
ppl_preview = metrics_blk.get("ppl_preview")
|
|
@@ -2483,12 +2641,12 @@ def _extract_edit_metadata(
|
|
|
2483
2641
|
algorithm = edit_section.get("algorithm")
|
|
2484
2642
|
if not algorithm:
|
|
2485
2643
|
algorithm = edit_name or ""
|
|
2486
|
-
# Sanitize algorithm identifiers to purge
|
|
2644
|
+
# Sanitize algorithm identifiers to purge unsupported edit labels
|
|
2487
2645
|
try:
|
|
2488
2646
|
alg_lower = str(algorithm).strip().lower()
|
|
2489
2647
|
except Exception: # pragma: no cover
|
|
2490
2648
|
alg_lower = ""
|
|
2491
|
-
allowed_algorithms = {"quant_rtn", "noop"}
|
|
2649
|
+
allowed_algorithms = {"quant_rtn", "noop", "custom"}
|
|
2492
2650
|
if alg_lower not in allowed_algorithms:
|
|
2493
2651
|
algorithm = ""
|
|
2494
2652
|
|
|
@@ -3099,6 +3257,105 @@ def _resolve_pm_acceptance_range_from_report(
|
|
|
3099
3257
|
return {"min": float(min_val), "max": float(max_val)}
|
|
3100
3258
|
|
|
3101
3259
|
|
|
3260
|
+
def _resolve_pm_drift_band_from_report(
    report: dict[str, Any] | None,
) -> dict[str, float]:
    """Resolve the preview→final drift band from report context, meta, and env.

    Sources are consulted in this order, each filling only the bounds that
    are still missing:

    1. ``report["context"]["primary_metric"]["drift_band"]`` — either a
       ``{"min": ..., "max": ...}`` dict or a 2-item list/tuple.
    2. ``report["context"]["pm_drift_band"]`` (dict form only).
    3. ``report["meta"]["pm_drift_band"]`` (dict form only).

    The environment variables ``INVARLOCK_PM_DRIFT_MIN`` and
    ``INVARLOCK_PM_DRIFT_MAX`` take precedence over any report value.

    Args:
        report: Run report mapping, or ``None``. Non-dict values are treated
            as "no report".

    Returns:
        ``{}`` when no source supplies a usable bound; otherwise
        ``{"min": float, "max": float}``. A missing or non-positive bound
        falls back to the default (0.95 / 1.05), and an inverted or empty
        band (``min >= max``) is replaced by the default band as a whole.
    """
    base_min = 0.95
    base_max = 1.05

    def _safe_float(val: Any) -> float | None:
        """Coerce ``val`` to a finite float; return None when that fails."""
        if val is None:
            return None
        try:
            out = float(val)
        except Exception:  # best-effort: arbitrary objects may raise anything
            return None
        return out if math.isfinite(out) else None

    def _parse_env(name: str) -> float | None:
        """Read a bound from the environment; blank or non-finite -> None."""
        raw = os.environ.get(name, "")
        if not str(raw).strip():
            return None
        # float() accepts "nan"/"inf"; reject them here so a non-finite bound
        # never reaches the certificate. The validation gate ignores such
        # bounds anyway, so recording them would misreport the enforced band.
        return _safe_float(raw)

    cfg_min: float | None = None
    cfg_max: float | None = None

    ctx = report.get("context") if isinstance(report, dict) else None
    if isinstance(ctx, dict):
        pm_ctx = ctx.get("primary_metric")
        if isinstance(pm_ctx, dict):
            band = pm_ctx.get("drift_band")
            if isinstance(band, dict):
                cfg_min = _safe_float(band.get("min"))
                cfg_max = _safe_float(band.get("max"))
            elif isinstance(band, (list, tuple)) and len(band) == 2:
                cfg_min = _safe_float(band[0])
                cfg_max = _safe_float(band[1])
        if cfg_min is None or cfg_max is None:
            alt = ctx.get("pm_drift_band")
            if isinstance(alt, dict):
                if cfg_min is None:
                    cfg_min = _safe_float(alt.get("min"))
                if cfg_max is None:
                    cfg_max = _safe_float(alt.get("max"))

    if (cfg_min is None or cfg_max is None) and isinstance(report, dict):
        meta = report.get("meta")
        if isinstance(meta, dict):
            meta_band = meta.get("pm_drift_band")
            if isinstance(meta_band, dict):
                if cfg_min is None:
                    cfg_min = _safe_float(meta_band.get("min"))
                if cfg_max is None:
                    cfg_max = _safe_float(meta_band.get("max"))

    env_min = _parse_env("INVARLOCK_PM_DRIFT_MIN")
    env_max = _parse_env("INVARLOCK_PM_DRIFT_MAX")

    # Nothing configured anywhere: signal "no explicit band" with an empty dict
    # so callers can tell it apart from an explicitly configured default.
    if all(v is None for v in (cfg_min, cfg_max, env_min, env_max)):
        return {}

    # Environment overrides config; defaults fill whichever side is missing.
    min_val = next((v for v in (env_min, cfg_min) if v is not None), base_min)
    max_val = next((v for v in (env_max, cfg_max) if v is not None), base_max)

    if min_val <= 0:
        min_val = base_min
    if max_val <= 0:
        max_val = base_max
    if min_val >= max_val:
        min_val, max_val = base_min, base_max

    return {"min": float(min_val), "max": float(max_val)}
|
|
3357
|
+
|
|
3358
|
+
|
|
3102
3359
|
def _compute_validation_flags(
|
|
3103
3360
|
ppl: dict[str, Any],
|
|
3104
3361
|
spectral: dict[str, Any],
|
|
@@ -3112,6 +3369,8 @@ def _compute_validation_flags(
|
|
|
3112
3369
|
moe: dict[str, Any] | None = None,
|
|
3113
3370
|
dataset_capacity: dict[str, Any] | None = None,
|
|
3114
3371
|
pm_acceptance_range: dict[str, float] | None = None,
|
|
3372
|
+
pm_drift_band: dict[str, float] | None = None,
|
|
3373
|
+
pm_tail: dict[str, Any] | None = None,
|
|
3115
3374
|
) -> dict[str, bool]:
|
|
3116
3375
|
"""Compute validation flags for the certificate including canonical gates."""
|
|
3117
3376
|
tier = (tier or "balanced").lower()
|
|
@@ -3174,9 +3433,27 @@ def _compute_validation_flags(
|
|
|
3174
3433
|
ratio_limit = min(ratio_limit, float(target_ratio))
|
|
3175
3434
|
|
|
3176
3435
|
# Canonical Gates
|
|
3177
|
-
# 1. Drift gate: 0.95 ≤ final/preview ≤ 1.05
|
|
3436
|
+
# 1. Drift gate: by default 0.95 ≤ final/preview ≤ 1.05 (configurable)
|
|
3178
3437
|
drift_ratio = ppl.get("preview_final_ratio", 1.0)
|
|
3179
|
-
|
|
3438
|
+
drift_min = 0.95
|
|
3439
|
+
drift_max = 1.05
|
|
3440
|
+
if isinstance(pm_drift_band, dict):
|
|
3441
|
+
try:
|
|
3442
|
+
cand_min = pm_drift_band.get("min")
|
|
3443
|
+
cand_max = pm_drift_band.get("max")
|
|
3444
|
+
if isinstance(cand_min, int | float) and isinstance(cand_max, int | float):
|
|
3445
|
+
cand_min_f = float(cand_min)
|
|
3446
|
+
cand_max_f = float(cand_max)
|
|
3447
|
+
if (
|
|
3448
|
+
math.isfinite(cand_min_f)
|
|
3449
|
+
and math.isfinite(cand_max_f)
|
|
3450
|
+
and 0 < cand_min_f < cand_max_f
|
|
3451
|
+
):
|
|
3452
|
+
drift_min = cand_min_f
|
|
3453
|
+
drift_max = cand_max_f
|
|
3454
|
+
except Exception: # pragma: no cover
|
|
3455
|
+
pass
|
|
3456
|
+
preview_final_drift_acceptable = drift_min <= drift_ratio <= drift_max
|
|
3180
3457
|
if _tiny_relax:
|
|
3181
3458
|
# Treat drift identity as informational in tiny dev demos
|
|
3182
3459
|
preview_final_drift_acceptable = True
|
|
@@ -3223,6 +3500,45 @@ def _compute_validation_flags(
|
|
|
3223
3500
|
except Exception: # pragma: no cover
|
|
3224
3501
|
pass
|
|
3225
3502
|
tokens_ok = total_tokens >= eff_min_tokens
|
|
3503
|
+
if not tokens_ok:
|
|
3504
|
+
coverage_ok = False
|
|
3505
|
+
try:
|
|
3506
|
+
coverage = _ppl_metrics.get("bootstrap", {}).get("coverage")
|
|
3507
|
+
if isinstance(coverage, dict):
|
|
3508
|
+
prev_cov = coverage.get("preview")
|
|
3509
|
+
fin_cov = coverage.get("final")
|
|
3510
|
+
if isinstance(prev_cov, dict) and isinstance(fin_cov, dict):
|
|
3511
|
+
prev_used = prev_cov.get("used")
|
|
3512
|
+
prev_req = prev_cov.get("required")
|
|
3513
|
+
fin_used = fin_cov.get("used")
|
|
3514
|
+
fin_req = fin_cov.get("required")
|
|
3515
|
+
prev_ok = bool(prev_cov.get("ok")) or (
|
|
3516
|
+
isinstance(prev_used, int | float)
|
|
3517
|
+
and isinstance(prev_req, int | float)
|
|
3518
|
+
and float(prev_used) >= float(prev_req)
|
|
3519
|
+
)
|
|
3520
|
+
fin_ok = bool(fin_cov.get("ok")) or (
|
|
3521
|
+
isinstance(fin_used, int | float)
|
|
3522
|
+
and isinstance(fin_req, int | float)
|
|
3523
|
+
and float(fin_used) >= float(fin_req)
|
|
3524
|
+
)
|
|
3525
|
+
coverage_ok = prev_ok and fin_ok
|
|
3526
|
+
except Exception: # pragma: no cover
|
|
3527
|
+
coverage_ok = False
|
|
3528
|
+
|
|
3529
|
+
if coverage_ok:
|
|
3530
|
+
try:
|
|
3531
|
+
tolerance_ratio = float(
|
|
3532
|
+
pm_policy.get("min_tokens_tolerance", 0.02) or 0.0
|
|
3533
|
+
)
|
|
3534
|
+
except Exception:
|
|
3535
|
+
tolerance_ratio = 0.0
|
|
3536
|
+
if tolerance_ratio < 0.0:
|
|
3537
|
+
tolerance_ratio = 0.0
|
|
3538
|
+
relaxed_floor = int(
|
|
3539
|
+
math.floor(float(eff_min_tokens) * (1.0 - tolerance_ratio))
|
|
3540
|
+
)
|
|
3541
|
+
tokens_ok = total_tokens >= max(relaxed_floor, 0)
|
|
3226
3542
|
except Exception: # pragma: no cover
|
|
3227
3543
|
tokens_ok = True
|
|
3228
3544
|
# Under tiny_relax, treat token floors as informational only
|
|
@@ -3416,6 +3732,19 @@ def _compute_validation_flags(
|
|
|
3416
3732
|
except Exception: # pragma: no cover
|
|
3417
3733
|
pass
|
|
3418
3734
|
|
|
3735
|
+
# Primary metric tail gate (warn/fail; default non-blocking)
|
|
3736
|
+
try:
|
|
3737
|
+
tail_ok = True
|
|
3738
|
+
if isinstance(pm_tail, dict) and pm_tail:
|
|
3739
|
+
mode = str(pm_tail.get("mode", "warn") or "warn").strip().lower()
|
|
3740
|
+
evaluated = bool(pm_tail.get("evaluated", False))
|
|
3741
|
+
passed = bool(pm_tail.get("passed", True))
|
|
3742
|
+
if mode == "fail" and evaluated and (not passed):
|
|
3743
|
+
tail_ok = False
|
|
3744
|
+
flags["primary_metric_tail_acceptable"] = bool(tail_ok)
|
|
3745
|
+
except Exception: # pragma: no cover
|
|
3746
|
+
flags["primary_metric_tail_acceptable"] = True
|
|
3747
|
+
|
|
3419
3748
|
return flags
|
|
3420
3749
|
|
|
3421
3750
|
|
|
@@ -20,7 +20,7 @@ CERTIFICATE_SCHEMA_VERSION = "v1"
|
|
|
20
20
|
# separately in metric-specific logic.
|
|
21
21
|
CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
|
|
22
22
|
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
23
|
-
"title": "InvarLock
|
|
23
|
+
"title": "InvarLock Evaluation Certificate",
|
|
24
24
|
"type": "object",
|
|
25
25
|
"required": [
|
|
26
26
|
"schema_version",
|
|
@@ -148,6 +148,7 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
|
|
|
148
148
|
|
|
149
149
|
_VALIDATION_ALLOWLIST_DEFAULT = {
|
|
150
150
|
"primary_metric_acceptable",
|
|
151
|
+
"primary_metric_tail_acceptable",
|
|
151
152
|
"preview_final_drift_acceptable",
|
|
152
153
|
"guard_overhead_acceptable",
|
|
153
154
|
"invariants_pass",
|
|
@@ -181,7 +182,7 @@ def _load_validation_allowlist() -> set[str]:
|
|
|
181
182
|
def _validate_with_jsonschema(certificate: dict[str, Any]) -> bool:
|
|
182
183
|
"""Validate certificate with JSON Schema when available."""
|
|
183
184
|
if jsonschema is None:
|
|
184
|
-
return True # Schema library unavailable; fall back to
|
|
185
|
+
return True # Schema library unavailable; fall back to minimal checks
|
|
185
186
|
try:
|
|
186
187
|
jsonschema.validate(instance=certificate, schema=CERTIFICATE_JSON_SCHEMA)
|
|
187
188
|
return True
|
|
@@ -1,8 +1,21 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import Any
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from ..eval.data import EvaluationWindow
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def compute_window_hash(window: EvaluationWindow, *, include_data: bool) -> str:
    """Hash one evaluation window via `invarlock.eval.data.compute_window_hash`.

    The import is deferred to call time on purpose: `invarlock.eval.data`
    pulls in optional heavy dependencies (HF datasets / pyarrow), and keeping
    it off this module's import path lets the lightweight reporting helpers
    be imported without loading them.
    """
    from ..eval.data import compute_window_hash as _impl

    digest = _impl(window, include_data=include_data)
    return digest
|
|
6
19
|
|
|
7
20
|
|
|
8
21
|
def compute_window_hashes(
|