invarlock-0.3.7-py3-none-any.whl → invarlock-0.3.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +3 -3
- invarlock/adapters/auto.py +2 -10
- invarlock/adapters/hf_loading.py +7 -7
- invarlock/adapters/hf_mixin.py +28 -5
- invarlock/assurance/__init__.py +15 -23
- invarlock/calibration/spectral_null.py +1 -1
- invarlock/cli/adapter_auto.py +1 -5
- invarlock/cli/app.py +57 -27
- invarlock/cli/commands/__init__.py +2 -2
- invarlock/cli/commands/calibrate.py +48 -4
- invarlock/cli/commands/{certify.py → evaluate.py} +69 -46
- invarlock/cli/commands/explain_gates.py +94 -51
- invarlock/cli/commands/export_html.py +11 -9
- invarlock/cli/commands/report.py +121 -47
- invarlock/cli/commands/run.py +274 -66
- invarlock/cli/commands/verify.py +84 -89
- invarlock/cli/determinism.py +1 -1
- invarlock/cli/provenance.py +3 -3
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/retry.py +14 -14
- invarlock/core/runner.py +1 -1
- invarlock/edits/noop.py +2 -2
- invarlock/edits/quant_rtn.py +2 -2
- invarlock/eval/__init__.py +1 -1
- invarlock/eval/bench.py +11 -7
- invarlock/eval/primary_metric.py +1 -1
- invarlock/guards/spectral.py +2 -2
- invarlock/guards_ref/spectral_ref.py +1 -1
- invarlock/model_profile.py +16 -35
- invarlock/observability/health.py +38 -20
- invarlock/plugins/hf_bnb_adapter.py +32 -21
- invarlock/reporting/__init__.py +18 -4
- invarlock/reporting/html.py +7 -7
- invarlock/reporting/normalizer.py +2 -2
- invarlock/reporting/policy_utils.py +1 -1
- invarlock/reporting/primary_metric_utils.py +11 -11
- invarlock/reporting/render.py +126 -120
- invarlock/reporting/report.py +43 -37
- invarlock/reporting/{certificate.py → report_builder.py} +103 -99
- invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
- invarlock-0.3.9.dist-info/METADATA +303 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/RECORD +46 -46
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/WHEEL +1 -1
- invarlock-0.3.7.dist-info/METADATA +0 -602
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/top_level.txt +0 -0
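The headline change in this release is terminology: the "certificate" pipeline becomes the "evaluation report" pipeline. `invarlock/reporting/certificate.py` is renamed to `report_builder.py`, `certificate_schema.py` to `report_schema.py`, and the `certify` CLI command to `evaluate`. The diff below appears to correspond to the renamed `report_builder.py`, where `make_certificate`/`validate_certificate` become `make_report`/`validate_report` and `CERTIFICATE_SCHEMA_VERSION`/`CERTIFICATE_JSON_SCHEMA` become `REPORT_SCHEMA_VERSION`/`REPORT_JSON_SCHEMA`. A rough migration sketch for downstream callers follows; the 0.3.7 import path is inferred from the rename and the input JSON files are placeholders, so treat it as an assumption rather than documented API.

```python
# Hypothetical migration sketch for downstream callers of invarlock.reporting.
# The 0.3.7 import path/symbols are inferred from the rename in this diff;
# run_report.json and baseline.json are placeholder artifacts.
from __future__ import annotations

import json
from typing import Any


def build_evaluation_report(run_report: dict[str, Any], baseline: dict[str, Any]) -> dict[str, Any]:
    """Build and validate an evaluation report with the 0.3.9 API."""
    # 0.3.7 (assumed):
    #   from invarlock.reporting.certificate import make_certificate, validate_certificate
    #   cert = make_certificate(run_report, baseline)
    # 0.3.9:
    from invarlock.reporting.report_builder import make_report, validate_report

    evaluation_report = make_report(run_report, baseline)
    if not validate_report(evaluation_report):
        raise ValueError("evaluation report failed validation")
    return evaluation_report


if __name__ == "__main__":
    with open("run_report.json") as f, open("baseline.json") as g:
        print(build_evaluation_report(json.load(f), json.load(g)).get("run_id"))
```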
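The validator also changes shape: as the diff below shows, `validate_report` first checks `schema_version` against `REPORT_SCHEMA_VERSION`, then attempts JSON Schema validation provided by `report_schema`, and only falls back to a lenient structural check (a non-empty `run_id` plus a `primary_metric` carrying a numeric `final` or a non-empty `kind`). A minimal sketch of that fallback, using a placeholder schema version and only the field names visible in the diff:

```python
# Sketch of the lenient fallback inside validate_report (used when jsonschema is
# unavailable or too strict). REPORT_SCHEMA_VERSION is a placeholder here; the
# real value lives in invarlock.reporting.report_schema.
from __future__ import annotations

from typing import Any

REPORT_SCHEMA_VERSION = "<schema-version>"  # placeholder, not the real constant


def minimal_report_check(evaluation_report: dict[str, Any]) -> bool:
    """Mirror the fallback: schema version, non-empty run_id, plausible primary_metric."""
    if evaluation_report.get("schema_version") != REPORT_SCHEMA_VERSION:
        return False
    run_id = evaluation_report.get("run_id")
    run_id_ok = isinstance(run_id, str) and bool(run_id)
    pm = evaluation_report.get("primary_metric")
    pm_ok = isinstance(pm, dict) and (
        isinstance(pm.get("final"), (int, float))
        or (isinstance(pm.get("kind"), str) and bool(pm.get("kind")))
    )
    return run_id_ok and pm_ok
```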
@@ -1,16 +1,16 @@
 """
-InvarLock Evaluation
-
+InvarLock Evaluation Report Generation
+=====================================

-Generate standardized evaluation
+Generate standardized evaluation reports from RunReport and baseline
 comparison.
-
-for CI/CD
+Evaluation reports are standalone, portable artifacts that record statistical
+gates and evidence for CI/CD checks and audits (not formal verification).
 """

 from __future__ import annotations

-## Core
+## Core evaluation report building and analysis orchestration lives here.
 # mypy: ignore-errors
 import copy
 import hashlib
@@ -39,11 +39,7 @@ from invarlock.eval.primary_metric import compute_primary_metric_from_report, ge
 from invarlock.eval.tail_stats import evaluate_metric_tail
 from invarlock.utils.digest import hash_json

-from . import
-from .certificate_schema import (
-    CERTIFICATE_JSON_SCHEMA,
-    CERTIFICATE_SCHEMA_VERSION,
-)
+from . import report_schema as _report_schema
 from .dataset_hashing import (
     _extract_dataset_info,
 )
@@ -53,10 +49,15 @@ from .guards_analysis import (
     _extract_spectral_analysis,
     _extract_variance_analysis,
 )
-from .
+from .report_schema import (
+    REPORT_JSON_SCHEMA,
+    REPORT_SCHEMA_VERSION,
+)
+from .report_types import RunReport
+from .report_types import validate_report as validate_run_report

 # Expose compute_window_hash for tests that monkeypatch it
-# compute_window_hash used to be exposed via
+# compute_window_hash used to be exposed via the evaluation report builder; tests now patch
 # dataset_hashing.compute_window_hash directly, so this import is no longer needed.
 from .utils import (
     _coerce_int,
@@ -79,6 +80,9 @@ TIER_RATIO_LIMITS: dict[str, float] = {
     "none": 1.10,
 }

+# Canonical preview→final drift band used when not explicitly configured.
+PM_DRIFT_BAND_DEFAULT: tuple[float, float] = (0.95, 1.05)
+

 def _is_ppl_kind(name: Any) -> bool:
     """Return True if a primary_metric kind denotes a ppl-like metric.
@@ -103,7 +107,7 @@ def _is_ppl_kind(name: Any) -> bool:


 ## NOTE: Deprecated helper `_get_ppl_final` was removed; callers should
-## use the normalized primary_metric block directly via
+## use the normalized primary_metric block directly via make_report or
 ## report processing utilities.


@@ -131,8 +135,8 @@ def _compute_edit_digest(report: dict) -> dict:
     return {"family": family, "impl_hash": impl_hash, "version": 1}


-def _compute_confidence_label(
-    """Compute
+def _compute_confidence_label(evaluation_report: dict[str, Any]) -> dict[str, Any]:
+    """Compute evaluation report confidence label based on stability and CI width.

     Heuristics:
     - High: ppl_acceptable=True, unstable=False, width <= 0.03 (ratio) or <= 1.0 pp for accuracy
@@ -140,7 +144,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
     - Low: otherwise (floors unmet, failure, or missing bounds)
     Returns a dict with label, basis, width and threshold for transparency.
     """
-    validation =
+    validation = evaluation_report.get("validation", {}) or {}
     pm_ok = bool(validation.get("primary_metric_acceptable", False))
     # Basis label shown in confidence block:
     # - For ppl-like metrics, use 'ppl_ratio' to reflect ratio width threshold
@@ -149,7 +153,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
     basis = "primary_metric"
     lo = hi = float("nan")
     try:
-        pm =
+        pm = evaluation_report.get("primary_metric", {}) or {}
         kind = str(pm.get("kind", "") or "").lower()
         if isinstance(pm, dict) and pm and pm.get("display_ci"):
             dci = pm.get("display_ci")
@@ -170,7 +174,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
     thr_ratio = 0.03  # 3% width for ratio
     thr_pp = 1.0  # 1.0 percentage point for accuracy kinds
     try:
-        pol =
+        pol = evaluation_report.get("resolved_policy")
         if isinstance(pol, dict):
             conf_pol = pol.get("confidence")
             if isinstance(conf_pol, dict):
@@ -187,7 +191,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:

     # Unstable hint from primary metric (if provided)
     try:
-        unstable = bool((
+        unstable = bool((evaluation_report.get("primary_metric") or {}).get("unstable"))
     except Exception:  # pragma: no cover
         unstable = False

@@ -213,39 +217,39 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
     }


-# Minimal JSON Schema describing the canonical shape of
+# Minimal JSON Schema describing the canonical shape of an evaluation report.
 # This focuses on structural validity; numerical thresholds are validated
 # separately in metric-specific logic.
-# JSON Schema is provided by
+# JSON Schema is provided by report_schema; no duplication here.


 # Mirror jsonschema and structural validator for test monkeypatching compatibility.
-jsonschema = getattr(
+jsonschema = getattr(_report_schema, "jsonschema", None)


-def _validate_with_jsonschema(
+def _validate_with_jsonschema(evaluation_report: dict[str, Any]) -> bool:
     if jsonschema is None:
         return True
     try:
-        jsonschema.validate(instance=
+        jsonschema.validate(instance=evaluation_report, schema=REPORT_JSON_SCHEMA)
         return True
     except Exception:  # pragma: no cover
         return False


-def
-    """Validate that
+def validate_report(evaluation_report: dict[str, Any]) -> bool:
+    """Validate that an evaluation report has all required fields and valid data."""
     try:
-        if
+        if evaluation_report.get("schema_version") != REPORT_SCHEMA_VERSION:
             return False
         # Prefer JSON Schema structural validation; if unavailable or too strict,
         # fall back to a lenient minimal check used by unit tests.
-        if not _validate_with_jsonschema(
+        if not _validate_with_jsonschema(evaluation_report):
             # Minimal fallback: require schema version + run_id + primary_metric
-            run_id_ok = isinstance(
-
+            run_id_ok = isinstance(evaluation_report.get("run_id"), str) and bool(
+                evaluation_report.get("run_id")
             )
-            pm =
+            pm = evaluation_report.get("primary_metric")
             pm_ok = isinstance(pm, dict) and (
                 isinstance(pm.get("final"), int | float)
                 or (isinstance(pm.get("kind"), str) and bool(pm.get("kind")))
@@ -253,7 +257,7 @@ def validate_certificate(certificate: dict[str, Any]) -> bool:
             if not (run_id_ok and pm_ok):
                 return False

-        validation =
+        validation = evaluation_report.get("validation", {})
         for flag in [
             "preview_final_drift_acceptable",
             "primary_metric_acceptable",
@@ -428,8 +432,8 @@ def _load_validation_allowlist() -> set[str]:
 # disallow unknown validation keys at schema level.
 try:
     _vkeys = _load_validation_allowlist()
-    if isinstance(
-        vspec =
+    if isinstance(REPORT_JSON_SCHEMA.get("properties"), dict):
+        vspec = REPORT_JSON_SCHEMA["properties"].get("validation")
         if isinstance(vspec, dict):
             vspec["properties"] = {k: {"type": "boolean"} for k in _vkeys}
             vspec["additionalProperties"] = False
@@ -446,7 +450,7 @@ except Exception:  # pragma: no cover
 def _normalize_and_validate_report(report: RunReport | dict[str, Any]) -> RunReport:
     """Normalize a possibly-minimal report and validate its structure.

-    Uses the local normalizer when available, then checks `
+    Uses the local normalizer when available, then checks `validate_run_report`.
     Raises ValueError on invalid input. Returns the normalized RunReport.
     """
     try:
@@ -456,13 +460,13 @@ def _normalize_and_validate_report(report: RunReport | dict[str, Any]) -> RunRep
         report = _norm(report)
     except Exception:  # pragma: no cover
         pass
-    if not
+    if not validate_run_report(report):
         raise ValueError("Invalid RunReport structure")
     return report


-def
-    """Extract the
+def _extract_report_meta(report: RunReport) -> dict[str, Any]:
+    """Extract the evaluation report metadata block with a full seed bundle."""
     meta_section = (
         report.get("meta", {}) if isinstance(report.get("meta"), dict) else {}
     )
@@ -739,22 +743,22 @@ def _fallback_paired_windows(
     return paired_windows


-def
+def make_report(
     report: RunReport,
     baseline: RunReport | dict[str, Any],
 ) -> dict[str, Any]:
     """
-    Generate an evaluation
+    Generate an evaluation report from a RunReport and baseline comparison.

-    The
-    essential metrics and comparisons
+    The evaluation report is a standalone, portable artifact that contains all
+    essential paired metrics and comparisons used by InvarLock gates.

     Args:
-        report: The guarded run report to
+        report: The guarded run report to evaluate
         baseline: Step-0 baseline RunReport or baseline metrics dict

     Returns:
-
+        Evaluation report dictionary with all required fields

     Raises:
         ValueError: If inputs are invalid or required data is missing
@@ -778,11 +782,11 @@ def make_certificate(
         baseline_report = None

     # Extract core metadata with full seed bundle
-    meta =
+    meta = _extract_report_meta(report)

     # Propagate environment flags captured in the RunReport (e.g., deterministic algos,
     # TF32 controls, MPS/CUDA availability). This is useful for auditability and
-    # reproducibility of
+    # reproducibility of evaluation runs.
     try:
         env_flags = (
             report.get("meta", {}).get("env_flags")
@@ -1602,7 +1606,7 @@ def make_certificate(
     if device_name:
         telemetry.setdefault("device", device_name)

-    # Build the
+    # Build the evaluation report
     window_capacity_ctx = (
         report.get("metrics", {}).get("window_capacity")
         if isinstance(report.get("metrics"), dict)
@@ -1920,8 +1924,8 @@ def make_certificate(
         k: bool(v) for k, v in validation_flags.items() if k in _allowed_validation
     }

-
-    "schema_version":
+    evaluation_report = {
+        "schema_version": REPORT_SCHEMA_VERSION,
         "run_id": current_run_id,
         "meta": meta,
         "auto": auto,
@@ -1964,8 +1968,8 @@ def make_certificate(
     _tiny_relax_env = False
     if _tiny_relax_env:
         try:
-
-            prov =
+            evaluation_report.setdefault("auto", {})["tiny_relax"] = True
+            prov = evaluation_report.setdefault("provenance", {})
             flags = prov.setdefault("flags", [])
             if "tiny_relax" not in flags:
                 flags.append("tiny_relax")
@@ -1991,12 +1995,12 @@ def make_certificate(
             and "value" in qo
             and math.isfinite(float(qo.get("value", float("nan"))))
         ):
-
+            evaluation_report["quality_overhead"] = qo
     except Exception:  # pragma: no cover
         pass

     try:
-        _propagate_pairing_stats(
+        _propagate_pairing_stats(evaluation_report, ppl_analysis)
     except Exception:  # pragma: no cover
         pass

@@ -2057,7 +2061,7 @@ def make_certificate(
         (resolved_policy.get("variance") or {}).get("min_effect_lognll", 0.0) or 0.0
     )

-
+    evaluation_report["policy_digest"] = {
         "policy_version": POLICY_VERSION,
         "tier_policy_name": cur_tier,
         "thresholds_hash": thresholds_hash,
@@ -2088,7 +2092,7 @@ def make_certificate(
                     payload[key] = item[key]
                 sanitized.append(payload)
             if sanitized:
-
+                evaluation_report["secondary_metrics"] = sanitized
     except Exception:  # pragma: no cover
         pass

@@ -2136,7 +2140,7 @@ def make_certificate(
             except Exception:  # pragma: no cover
                 continue
         if out:
-
+            evaluation_report["classification"] = {"subgroups": out}
     except Exception:  # pragma: no cover
         pass

@@ -2152,7 +2156,7 @@ def make_certificate(
             if isinstance(container.get("metrics"), dict)
             else {}
         )
-        # Edited report case: also check
+        # Edited report case: also check evaluation_report telemetry keys
         telem = telemetry if isinstance(telemetry, dict) else {}
         # Prefer explicit p50/p95 throughput keys if present
         for key in ("latency_ms_p50", "latency_ms_p95", "throughput_sps"):
@@ -2193,24 +2197,24 @@ def make_certificate(
                 entry["ratio"] = float("nan")
             system_overhead[metric_key] = entry
         if system_overhead:
-
+            evaluation_report["system_overhead"] = system_overhead
     except Exception:  # pragma: no cover
         pass

     # Attach/normalize primary metric block (moved to helper)
     from .primary_metric_utils import attach_primary_metric as _attach_pm

-    _attach_pm(
+    _attach_pm(evaluation_report, report, baseline_raw, baseline_ref, ppl_analysis)
     try:
         if isinstance(pm_drift_band, dict) and pm_drift_band:
-            pm_block =
+            pm_block = evaluation_report.get("primary_metric")
             if isinstance(pm_block, dict):
                 pm_block.setdefault("drift_band", dict(pm_drift_band))
     except Exception:  # pragma: no cover
         pass
     _enforce_display_ci_alignment(
         ratio_ci_source,
-
+        evaluation_report.get("primary_metric"),
         logloss_delta_ci,
         window_plan_profile,
     )
@@ -2218,8 +2222,8 @@ def make_certificate(
     # Ensure primary_metric has display_ci populated for schema invariants
     try:
         pm = (
-
-            if isinstance(
+            evaluation_report.get("primary_metric", {})
+            if isinstance(evaluation_report.get("primary_metric"), dict)
             else None
         )
         if isinstance(pm, dict) and pm:
@@ -2259,8 +2263,8 @@ def make_certificate(
         if not kind:
             kind = "ppl"
         windows_cfg = (
-
-            if isinstance(
+            evaluation_report.get("dataset", {}).get("windows", {})
+            if isinstance(evaluation_report.get("dataset"), dict)
             else {}
         )
         n_prev = windows_cfg.get("preview")
@@ -2268,7 +2272,7 @@ def make_certificate(
         tokens_total = None
         try:
             tokens_total = (
-
+                evaluation_report.get("dataset", {}).get("hash", {}).get("total_tokens")
             )
         except Exception:  # pragma: no cover
             tokens_total = None
@@ -2276,7 +2280,7 @@ def make_certificate(
         ci_lo = None
         ci_hi = None
         ratio = None
-        pmc =
+        pmc = evaluation_report.get("primary_metric", {})
        rci = pmc.get("display_ci") or pmc.get("ci")
         if isinstance(rci, tuple | list) and len(rci) == 2:
             ci_lo, ci_hi = rci[0], rci[1]
@@ -2288,7 +2292,7 @@ def make_certificate(
         except Exception:  # pragma: no cover
             ci_w = None
         # Gate outcome
-        val =
+        val = evaluation_report.get("validation", {})
         gate_ok = None
         try:
             gate_ok = bool(val.get("primary_metric_acceptable"))
@@ -2303,10 +2307,10 @@ def make_certificate(
             f"tokens={tokens_total}",
         ]
         try:
-            split = (
+            split = (evaluation_report.get("provenance", {}) or {}).get("dataset_split")
             if not split:
                 split = (report.get("provenance", {}) or {}).get("dataset_split")
-            sf = (
+            sf = (evaluation_report.get("provenance", {}) or {}).get("split_fallback")
             if sf is None:
                 sf = (report.get("provenance", {}) or {}).get("split_fallback")
             if split:
@@ -2322,7 +2326,7 @@ def make_certificate(
         if isinstance(gate_ok, bool):
             parts.append(f"gate={'pass' if gate_ok else 'fail'}")
         summary_line = "INVARLOCK_TELEMETRY " + " ".join(parts)
-
+        evaluation_report.setdefault("telemetry", {})["summary_line"] = summary_line
         if str(os.environ.get("INVARLOCK_TELEMETRY", "")).strip().lower() in {
             "1",
             "true",
@@ -2335,17 +2339,17 @@ def make_certificate(

     # Attach confidence label (non-gating)
     try:
-
+        evaluation_report["confidence"] = _compute_confidence_label(evaluation_report)
     except Exception:  # pragma: no cover
         pass

-    return
+    return evaluation_report


 # Console Validation Block helpers have moved to invarlock.reporting.render.


-## NOTE:
+## NOTE: render_report_markdown has been moved to invarlock.reporting.render.
 ## It is re-exported at the bottom of this module to preserve the public API.
 ## Private helper functions

@@ -2623,7 +2627,7 @@ def _extract_structural_deltas(report: RunReport) -> dict[str, Any]:
 def _extract_edit_metadata(
     report: RunReport, plugin_provenance: dict[str, Any]
 ) -> dict[str, Any]:
-    """Extract edit-level provenance and configuration metadata for the
+    """Extract edit-level provenance and configuration metadata for the evaluation report."""

     edit_section = _get_mapping(report, "edit")
     if not edit_section:
@@ -3020,12 +3024,12 @@ def _compute_quality_overhead_from_guard(


 def _propagate_pairing_stats(
-
+    evaluation_report: dict[str, Any], ppl_analysis: dict[str, Any] | None
 ) -> None:
-    """Surface pairing statistics inside
-    if not isinstance(
+    """Surface pairing statistics inside evaluation_report.dataset.windows.stats."""
+    if not isinstance(evaluation_report, dict):
         return
-    ds =
+    ds = evaluation_report.get("dataset", {})
     if not isinstance(ds, dict):
         return
     windows = ds.get("windows", {})
@@ -3079,7 +3083,7 @@ def _propagate_pairing_stats(
     windows["stats"] = stats
     if windows is not ds.get("windows"):
         ds["windows"] = windows
-
+        evaluation_report["dataset"] = ds


 def _build_provenance_block(
@@ -3262,8 +3266,7 @@ def _resolve_pm_drift_band_from_report(
 ) -> dict[str, float]:
     """Resolve preview→final drift band from report context/meta/env."""

-    base_min =
-    base_max = 1.05
+    base_min, base_max = PM_DRIFT_BAND_DEFAULT

     def _safe_float(val: Any) -> float | None:
         try:
@@ -3372,7 +3375,7 @@ def _compute_validation_flags(
     pm_drift_band: dict[str, float] | None = None,
     pm_tail: dict[str, Any] | None = None,
 ) -> dict[str, bool]:
-    """Compute validation flags for the
+    """Compute validation flags for the evaluation report including canonical gates."""
     tier = (tier or "balanced").lower()
     # Dev-only tiny relax: widen gates and lower floors when explicitly requested
     import os as _os
@@ -3435,8 +3438,7 @@ def _compute_validation_flags(
     # Canonical Gates
     # 1. Drift gate: by default 0.95 ≤ final/preview ≤ 1.05 (configurable)
     drift_ratio = ppl.get("preview_final_ratio", 1.0)
-    drift_min =
-    drift_max = 1.05
+    drift_min, drift_max = PM_DRIFT_BAND_DEFAULT
     if isinstance(pm_drift_band, dict):
         try:
             cand_min = pm_drift_band.get("min")
@@ -3613,7 +3615,7 @@ def _compute_validation_flags(
         if _tiny_relax and threshold_val < 0.10:
             threshold_val = 0.10
         if not math.isfinite(ratio_val):
-            # In dev/Compare-&-
+            # In dev/Compare-&-Evaluate flows we often lack a bare run; treat missing metric as pass
             guard_overhead_pass = True
         else:
             guard_overhead_pass = ratio_val <= (1.0 + max(0.0, threshold_val))
@@ -3769,7 +3771,7 @@ def _generate_run_id(report: RunReport) -> str:
     return hashlib.sha256(base_str.encode()).hexdigest()[:16]


-## NOTE:
+## NOTE: _compute_report_hash moved to invarlock.reporting.render and is re-exported below.


 def _analyze_bitwidth_map(bitwidth_map: dict[str, Any]) -> dict[str, Any]:
@@ -4114,22 +4116,24 @@ def _extract_compression_diagnostics(

 # Re-export rendering API from dedicated module to avoid bloat/cycles
 # Rendering helpers live in invarlock.reporting.render; internal code should import there directly.
-# Tests and public API expect
-# invarlock.reporting.
+# Tests and public API expect render_report_markdown to be available from
+# invarlock.reporting.report_builder. Import lazily at module end to avoid cycles with
 # invarlock.reporting.render which imports this module as a namespace.
 try:  # pragma: no cover - simple re-export
     from .render import (
         compute_console_validation_block,  # type: ignore
-
+        render_report_markdown,  # type: ignore
     )
 except Exception:  # pragma: no cover - defensive fallback

-    def
+    def render_report_markdown(evaluation_report: dict[str, Any]) -> str:  # type: ignore
         raise ImportError(
-            "
+            "render_report_markdown is unavailable; rendering dependencies missing"
         )

-    def compute_console_validation_block(
+    def compute_console_validation_block(
+        evaluation_report: dict[str, Any],
+    ) -> dict[str, Any]:  # type: ignore
         raise ImportError(
             "compute_console_validation_block is unavailable; rendering dependencies missing"
         )
@@ -4137,12 +4141,12 @@ except Exception:  # pragma: no cover - defensive fallback

 # Export public API
 __all__ = [
-    "
-    "
+    "make_report",
+    "validate_report",
     "_validate_with_jsonschema",
     "jsonschema",
-    "
+    "render_report_markdown",
     "compute_console_validation_block",
-    "
-    "
+    "REPORT_SCHEMA_VERSION",
+    "REPORT_JSON_SCHEMA",
 ]
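One genuinely new constant in the diff above is `PM_DRIFT_BAND_DEFAULT: tuple[float, float] = (0.95, 1.05)`, which centralizes the preview→final drift band that `_resolve_pm_drift_band_from_report` and `_compute_validation_flags` previously hard-coded, with per-report overrides read from a `pm_drift_band` mapping's `min`/`max` keys. The following is a simplified re-implementation of that gate for illustration only; it is not the library's code.

```python
# Simplified sketch of the preview→final drift gate: the (0.95, 1.05) default
# and the "min"/"max" override keys are taken from the hunks above, but this
# helper itself is illustrative, not part of invarlock.
from __future__ import annotations

import math

PM_DRIFT_BAND_DEFAULT: tuple[float, float] = (0.95, 1.05)


def drift_acceptable(preview_final_ratio: float, pm_drift_band: dict[str, float] | None = None) -> bool:
    """True when the final/preview ratio stays inside the configured drift band."""
    drift_min, drift_max = PM_DRIFT_BAND_DEFAULT
    if isinstance(pm_drift_band, dict):
        cand_min = pm_drift_band.get("min")
        cand_max = pm_drift_band.get("max")
        if isinstance(cand_min, (int, float)) and math.isfinite(cand_min):
            drift_min = float(cand_min)
        if isinstance(cand_max, (int, float)) and math.isfinite(cand_max):
            drift_max = float(cand_max)
    return drift_min <= preview_final_ratio <= drift_max


assert drift_acceptable(1.03)                            # within the default 0.95–1.05 band
assert not drift_acceptable(1.08)                        # outside the default band
assert drift_acceptable(1.08, {"min": 0.9, "max": 1.1})  # per-report override widens the band
```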