invarlock-0.3.7-py3-none-any.whl → invarlock-0.3.8-py3-none-any.whl
- invarlock/__init__.py +3 -3
- invarlock/adapters/auto.py +2 -10
- invarlock/adapters/hf_loading.py +7 -7
- invarlock/adapters/hf_mixin.py +28 -5
- invarlock/assurance/__init__.py +15 -23
- invarlock/cli/adapter_auto.py +1 -5
- invarlock/cli/app.py +57 -27
- invarlock/cli/commands/__init__.py +2 -2
- invarlock/cli/commands/calibrate.py +48 -4
- invarlock/cli/commands/{certify.py → evaluate.py} +69 -46
- invarlock/cli/commands/explain_gates.py +25 -17
- invarlock/cli/commands/export_html.py +11 -9
- invarlock/cli/commands/report.py +116 -46
- invarlock/cli/commands/run.py +274 -66
- invarlock/cli/commands/verify.py +84 -89
- invarlock/cli/determinism.py +1 -1
- invarlock/cli/provenance.py +3 -3
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/retry.py +14 -14
- invarlock/core/runner.py +1 -1
- invarlock/edits/noop.py +2 -2
- invarlock/edits/quant_rtn.py +2 -2
- invarlock/eval/__init__.py +1 -1
- invarlock/eval/bench.py +11 -7
- invarlock/eval/primary_metric.py +1 -1
- invarlock/guards/spectral.py +1 -1
- invarlock/model_profile.py +16 -35
- invarlock/plugins/hf_bnb_adapter.py +32 -21
- invarlock/reporting/__init__.py +18 -4
- invarlock/reporting/html.py +7 -7
- invarlock/reporting/normalizer.py +2 -2
- invarlock/reporting/policy_utils.py +1 -1
- invarlock/reporting/primary_metric_utils.py +11 -11
- invarlock/reporting/render.py +126 -120
- invarlock/reporting/report.py +43 -37
- invarlock/reporting/{certificate.py → report_builder.py} +98 -95
- invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
- invarlock-0.3.8.dist-info/METADATA +283 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.8.dist-info}/RECORD +43 -43
- {invarlock-0.3.7.dist-info → invarlock-0.3.8.dist-info}/WHEEL +1 -1
- invarlock-0.3.7.dist-info/METADATA +0 -602
- {invarlock-0.3.7.dist-info → invarlock-0.3.8.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.8.dist-info}/top_level.txt +0 -0
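
The bulk of this release renames the public "certificate" vocabulary to "evaluation report" (make_certificate → make_report, validate_certificate → validate_report, CERTIFICATE_* → REPORT_* constants), as the renamed files above suggest. A minimal migration sketch follows, assuming the renamed symbols are importable from invarlock.reporting.report_builder as the diff below indicates; run_report and baseline_report are placeholder RunReport dicts, not real outputs:

```python
# Hypothetical migration sketch for the 0.3.8 rename (assumed usage, not from the
# package docs); the old names are the ones visible in the removed lines below.
from invarlock.reporting.report_builder import (  # formerly invarlock.reporting.certificate
    REPORT_SCHEMA_VERSION,  # formerly CERTIFICATE_SCHEMA_VERSION
    make_report,            # formerly make_certificate
    validate_report,        # formerly validate_certificate
)


def build_and_check(run_report: dict, baseline_report: dict) -> dict:
    """Build an evaluation report from a guarded run and its step-0 baseline."""
    evaluation_report = make_report(report=run_report, baseline=baseline_report)
    assert evaluation_report["schema_version"] == REPORT_SCHEMA_VERSION
    assert validate_report(evaluation_report)
    return evaluation_report
```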
invarlock/reporting/{certificate.py → report_builder.py} +98 -95

@@ -1,16 +1,16 @@
 """
-InvarLock Evaluation
-
+InvarLock Evaluation Report Generation
+=====================================

-Generate standardized evaluation
+Generate standardized evaluation reports from RunReport and baseline
 comparison.
-
-for CI/CD
+Evaluation reports are standalone, portable artifacts that record statistical
+gates and evidence for CI/CD checks and audits (not formal verification).
 """

 from __future__ import annotations

-## Core
+## Core evaluation report building and analysis orchestration lives here.
 # mypy: ignore-errors
 import copy
 import hashlib
@@ -39,11 +39,7 @@ from invarlock.eval.primary_metric import compute_primary_metric_from_report, ge
 from invarlock.eval.tail_stats import evaluate_metric_tail
 from invarlock.utils.digest import hash_json

-from . import
-from .certificate_schema import (
-    CERTIFICATE_JSON_SCHEMA,
-    CERTIFICATE_SCHEMA_VERSION,
-)
+from . import report_schema as _report_schema
 from .dataset_hashing import (
     _extract_dataset_info,
 )
@@ -53,10 +49,15 @@ from .guards_analysis import (
     _extract_spectral_analysis,
     _extract_variance_analysis,
 )
-from .
+from .report_schema import (
+    REPORT_JSON_SCHEMA,
+    REPORT_SCHEMA_VERSION,
+)
+from .report_types import RunReport
+from .report_types import validate_report as validate_run_report

 # Expose compute_window_hash for tests that monkeypatch it
-# compute_window_hash used to be exposed via
+# compute_window_hash used to be exposed via the evaluation report builder; tests now patch
 # dataset_hashing.compute_window_hash directly, so this import is no longer needed.
 from .utils import (
     _coerce_int,
@@ -103,7 +104,7 @@ def _is_ppl_kind(name: Any) -> bool:


 ## NOTE: Deprecated helper `_get_ppl_final` was removed; callers should
-## use the normalized primary_metric block directly via
+## use the normalized primary_metric block directly via make_report or
 ## report processing utilities.


@@ -131,8 +132,8 @@ def _compute_edit_digest(report: dict) -> dict:
     return {"family": family, "impl_hash": impl_hash, "version": 1}


-def _compute_confidence_label(
-    """Compute
+def _compute_confidence_label(evaluation_report: dict[str, Any]) -> dict[str, Any]:
+    """Compute evaluation report confidence label based on stability and CI width.

     Heuristics:
     - High: ppl_acceptable=True, unstable=False, width <= 0.03 (ratio) or <= 1.0 pp for accuracy
@@ -140,7 +141,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
     - Low: otherwise (floors unmet, failure, or missing bounds)
     Returns a dict with label, basis, width and threshold for transparency.
     """
-    validation =
+    validation = evaluation_report.get("validation", {}) or {}
     pm_ok = bool(validation.get("primary_metric_acceptable", False))
     # Basis label shown in confidence block:
     # - For ppl-like metrics, use 'ppl_ratio' to reflect ratio width threshold
@@ -149,7 +150,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
     basis = "primary_metric"
     lo = hi = float("nan")
     try:
-        pm =
+        pm = evaluation_report.get("primary_metric", {}) or {}
         kind = str(pm.get("kind", "") or "").lower()
         if isinstance(pm, dict) and pm and pm.get("display_ci"):
             dci = pm.get("display_ci")
@@ -170,7 +171,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
     thr_ratio = 0.03  # 3% width for ratio
     thr_pp = 1.0  # 1.0 percentage point for accuracy kinds
     try:
-        pol =
+        pol = evaluation_report.get("resolved_policy")
         if isinstance(pol, dict):
             conf_pol = pol.get("confidence")
             if isinstance(conf_pol, dict):
@@ -187,7 +188,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:

     # Unstable hint from primary metric (if provided)
     try:
-        unstable = bool((
+        unstable = bool((evaluation_report.get("primary_metric") or {}).get("unstable"))
     except Exception:  # pragma: no cover
         unstable = False

@@ -213,39 +214,39 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
     }


-# Minimal JSON Schema describing the canonical shape of
+# Minimal JSON Schema describing the canonical shape of an evaluation report.
 # This focuses on structural validity; numerical thresholds are validated
 # separately in metric-specific logic.
-# JSON Schema is provided by
+# JSON Schema is provided by report_schema; no duplication here.


 # Mirror jsonschema and structural validator for test monkeypatching compatibility.
-jsonschema = getattr(
+jsonschema = getattr(_report_schema, "jsonschema", None)


-def _validate_with_jsonschema(
+def _validate_with_jsonschema(evaluation_report: dict[str, Any]) -> bool:
     if jsonschema is None:
         return True
     try:
-        jsonschema.validate(instance=
+        jsonschema.validate(instance=evaluation_report, schema=REPORT_JSON_SCHEMA)
         return True
     except Exception:  # pragma: no cover
         return False


-def
-    """Validate that
+def validate_report(evaluation_report: dict[str, Any]) -> bool:
+    """Validate that an evaluation report has all required fields and valid data."""
     try:
-        if
+        if evaluation_report.get("schema_version") != REPORT_SCHEMA_VERSION:
             return False
         # Prefer JSON Schema structural validation; if unavailable or too strict,
         # fall back to a lenient minimal check used by unit tests.
-        if not _validate_with_jsonschema(
+        if not _validate_with_jsonschema(evaluation_report):
            # Minimal fallback: require schema version + run_id + primary_metric
-            run_id_ok = isinstance(
-
+            run_id_ok = isinstance(evaluation_report.get("run_id"), str) and bool(
+                evaluation_report.get("run_id")
            )
-            pm =
+            pm = evaluation_report.get("primary_metric")
            pm_ok = isinstance(pm, dict) and (
                isinstance(pm.get("final"), int | float)
                or (isinstance(pm.get("kind"), str) and bool(pm.get("kind")))
@@ -253,7 +254,7 @@ def validate_certificate(certificate: dict[str, Any]) -> bool:
            if not (run_id_ok and pm_ok):
                return False

-        validation =
+        validation = evaluation_report.get("validation", {})
         for flag in [
             "preview_final_drift_acceptable",
             "primary_metric_acceptable",
@@ -428,8 +429,8 @@ def _load_validation_allowlist() -> set[str]:
 # disallow unknown validation keys at schema level.
 try:
     _vkeys = _load_validation_allowlist()
-    if isinstance(
-        vspec =
+    if isinstance(REPORT_JSON_SCHEMA.get("properties"), dict):
+        vspec = REPORT_JSON_SCHEMA["properties"].get("validation")
         if isinstance(vspec, dict):
             vspec["properties"] = {k: {"type": "boolean"} for k in _vkeys}
             vspec["additionalProperties"] = False
@@ -446,7 +447,7 @@ except Exception: # pragma: no cover
 def _normalize_and_validate_report(report: RunReport | dict[str, Any]) -> RunReport:
     """Normalize a possibly-minimal report and validate its structure.

-    Uses the local normalizer when available, then checks `
+    Uses the local normalizer when available, then checks `validate_run_report`.
     Raises ValueError on invalid input. Returns the normalized RunReport.
     """
     try:
@@ -456,13 +457,13 @@ def _normalize_and_validate_report(report: RunReport | dict[str, Any]) -> RunRep
         report = _norm(report)
     except Exception:  # pragma: no cover
         pass
-    if not
+    if not validate_run_report(report):
         raise ValueError("Invalid RunReport structure")
     return report


-def
-    """Extract the
+def _extract_report_meta(report: RunReport) -> dict[str, Any]:
+    """Extract the evaluation report metadata block with a full seed bundle."""
     meta_section = (
         report.get("meta", {}) if isinstance(report.get("meta"), dict) else {}
     )
@@ -739,22 +740,22 @@ def _fallback_paired_windows(
     return paired_windows


-def
+def make_report(
     report: RunReport,
     baseline: RunReport | dict[str, Any],
 ) -> dict[str, Any]:
     """
-    Generate an evaluation
+    Generate an evaluation report from a RunReport and baseline comparison.

-    The
-    essential metrics and comparisons
+    The evaluation report is a standalone, portable artifact that contains all
+    essential paired metrics and comparisons used by InvarLock gates.

     Args:
-        report: The guarded run report to
+        report: The guarded run report to evaluate
         baseline: Step-0 baseline RunReport or baseline metrics dict

     Returns:
-
+        Evaluation report dictionary with all required fields

     Raises:
         ValueError: If inputs are invalid or required data is missing
@@ -778,11 +779,11 @@ def make_certificate(
         baseline_report = None

     # Extract core metadata with full seed bundle
-    meta =
+    meta = _extract_report_meta(report)

     # Propagate environment flags captured in the RunReport (e.g., deterministic algos,
     # TF32 controls, MPS/CUDA availability). This is useful for auditability and
-    # reproducibility of
+    # reproducibility of evaluation runs.
     try:
         env_flags = (
             report.get("meta", {}).get("env_flags")
@@ -1602,7 +1603,7 @@ make_certificate(
     if device_name:
         telemetry.setdefault("device", device_name)

-    # Build the
+    # Build the evaluation report
     window_capacity_ctx = (
         report.get("metrics", {}).get("window_capacity")
         if isinstance(report.get("metrics"), dict)
@@ -1920,8 +1921,8 @@ make_certificate(
         k: bool(v) for k, v in validation_flags.items() if k in _allowed_validation
     }

-
-        "schema_version":
+    evaluation_report = {
+        "schema_version": REPORT_SCHEMA_VERSION,
         "run_id": current_run_id,
         "meta": meta,
         "auto": auto,
@@ -1964,8 +1965,8 @@ make_certificate(
         _tiny_relax_env = False
     if _tiny_relax_env:
         try:
-
-            prov =
+            evaluation_report.setdefault("auto", {})["tiny_relax"] = True
+            prov = evaluation_report.setdefault("provenance", {})
             flags = prov.setdefault("flags", [])
             if "tiny_relax" not in flags:
                 flags.append("tiny_relax")
@@ -1991,12 +1992,12 @@ make_certificate(
            and "value" in qo
            and math.isfinite(float(qo.get("value", float("nan"))))
         ):
-
+            evaluation_report["quality_overhead"] = qo
     except Exception:  # pragma: no cover
         pass

     try:
-        _propagate_pairing_stats(
+        _propagate_pairing_stats(evaluation_report, ppl_analysis)
     except Exception:  # pragma: no cover
         pass

@@ -2057,7 +2058,7 @@ make_certificate(
         (resolved_policy.get("variance") or {}).get("min_effect_lognll", 0.0) or 0.0
     )

-
+    evaluation_report["policy_digest"] = {
         "policy_version": POLICY_VERSION,
         "tier_policy_name": cur_tier,
         "thresholds_hash": thresholds_hash,
@@ -2088,7 +2089,7 @@ make_certificate(
                    payload[key] = item[key]
                sanitized.append(payload)
         if sanitized:
-
+            evaluation_report["secondary_metrics"] = sanitized
     except Exception:  # pragma: no cover
         pass

@@ -2136,7 +2137,7 @@ make_certificate(
            except Exception:  # pragma: no cover
                continue
         if out:
-
+            evaluation_report["classification"] = {"subgroups": out}
     except Exception:  # pragma: no cover
         pass

@@ -2152,7 +2153,7 @@ make_certificate(
            if isinstance(container.get("metrics"), dict)
            else {}
         )
-        # Edited report case: also check
+        # Edited report case: also check evaluation_report telemetry keys
         telem = telemetry if isinstance(telemetry, dict) else {}
         # Prefer explicit p50/p95 throughput keys if present
         for key in ("latency_ms_p50", "latency_ms_p95", "throughput_sps"):
@@ -2193,24 +2194,24 @@ make_certificate(
                entry["ratio"] = float("nan")
            system_overhead[metric_key] = entry
         if system_overhead:
-
+            evaluation_report["system_overhead"] = system_overhead
     except Exception:  # pragma: no cover
         pass

     # Attach/normalize primary metric block (moved to helper)
     from .primary_metric_utils import attach_primary_metric as _attach_pm

-    _attach_pm(
+    _attach_pm(evaluation_report, report, baseline_raw, baseline_ref, ppl_analysis)
     try:
         if isinstance(pm_drift_band, dict) and pm_drift_band:
-            pm_block =
+            pm_block = evaluation_report.get("primary_metric")
            if isinstance(pm_block, dict):
                pm_block.setdefault("drift_band", dict(pm_drift_band))
     except Exception:  # pragma: no cover
         pass
     _enforce_display_ci_alignment(
         ratio_ci_source,
-
+        evaluation_report.get("primary_metric"),
         logloss_delta_ci,
         window_plan_profile,
     )
@@ -2218,8 +2219,8 @@ make_certificate(
     # Ensure primary_metric has display_ci populated for schema invariants
     try:
         pm = (
-
-            if isinstance(
+            evaluation_report.get("primary_metric", {})
+            if isinstance(evaluation_report.get("primary_metric"), dict)
            else None
         )
         if isinstance(pm, dict) and pm:
@@ -2259,8 +2260,8 @@ make_certificate(
         if not kind:
            kind = "ppl"
         windows_cfg = (
-
-            if isinstance(
+            evaluation_report.get("dataset", {}).get("windows", {})
+            if isinstance(evaluation_report.get("dataset"), dict)
            else {}
         )
         n_prev = windows_cfg.get("preview")
@@ -2268,7 +2269,7 @@ make_certificate(
         tokens_total = None
         try:
            tokens_total = (
-
+                evaluation_report.get("dataset", {}).get("hash", {}).get("total_tokens")
            )
         except Exception:  # pragma: no cover
            tokens_total = None
@@ -2276,7 +2277,7 @@ make_certificate(
         ci_lo = None
         ci_hi = None
         ratio = None
-        pmc =
+        pmc = evaluation_report.get("primary_metric", {})
         rci = pmc.get("display_ci") or pmc.get("ci")
         if isinstance(rci, tuple | list) and len(rci) == 2:
            ci_lo, ci_hi = rci[0], rci[1]
@@ -2288,7 +2289,7 @@ make_certificate(
         except Exception:  # pragma: no cover
            ci_w = None
         # Gate outcome
-        val =
+        val = evaluation_report.get("validation", {})
         gate_ok = None
         try:
            gate_ok = bool(val.get("primary_metric_acceptable"))
@@ -2303,10 +2304,10 @@ make_certificate(
            f"tokens={tokens_total}",
         ]
         try:
-            split = (
+            split = (evaluation_report.get("provenance", {}) or {}).get("dataset_split")
            if not split:
                split = (report.get("provenance", {}) or {}).get("dataset_split")
-            sf = (
+            sf = (evaluation_report.get("provenance", {}) or {}).get("split_fallback")
            if sf is None:
                sf = (report.get("provenance", {}) or {}).get("split_fallback")
            if split:
@@ -2322,7 +2323,7 @@ make_certificate(
         if isinstance(gate_ok, bool):
            parts.append(f"gate={'pass' if gate_ok else 'fail'}")
         summary_line = "INVARLOCK_TELEMETRY " + " ".join(parts)
-
+        evaluation_report.setdefault("telemetry", {})["summary_line"] = summary_line
         if str(os.environ.get("INVARLOCK_TELEMETRY", "")).strip().lower() in {
            "1",
            "true",
@@ -2335,17 +2336,17 @@ make_certificate(

     # Attach confidence label (non-gating)
     try:
-
+        evaluation_report["confidence"] = _compute_confidence_label(evaluation_report)
     except Exception:  # pragma: no cover
         pass

-    return
+    return evaluation_report


 # Console Validation Block helpers have moved to invarlock.reporting.render.


-## NOTE:
+## NOTE: render_report_markdown has been moved to invarlock.reporting.render.
 ## It is re-exported at the bottom of this module to preserve the public API.
 ## Private helper functions

@@ -2623,7 +2624,7 @@ def _extract_structural_deltas(report: RunReport) -> dict[str, Any]:
 def _extract_edit_metadata(
     report: RunReport, plugin_provenance: dict[str, Any]
 ) -> dict[str, Any]:
-    """Extract edit-level provenance and configuration metadata for the
+    """Extract edit-level provenance and configuration metadata for the evaluation report."""

     edit_section = _get_mapping(report, "edit")
     if not edit_section:
@@ -3020,12 +3021,12 @@ def _compute_quality_overhead_from_guard(


 def _propagate_pairing_stats(
-
+    evaluation_report: dict[str, Any], ppl_analysis: dict[str, Any] | None
 ) -> None:
-    """Surface pairing statistics inside
-    if not isinstance(
+    """Surface pairing statistics inside evaluation_report.dataset.windows.stats."""
+    if not isinstance(evaluation_report, dict):
         return
-    ds =
+    ds = evaluation_report.get("dataset", {})
     if not isinstance(ds, dict):
         return
     windows = ds.get("windows", {})
@@ -3079,7 +3080,7 @@ def _propagate_pairing_stats(
     windows["stats"] = stats
     if windows is not ds.get("windows"):
         ds["windows"] = windows
-
+        evaluation_report["dataset"] = ds


 def _build_provenance_block(
@@ -3372,7 +3373,7 @@ def _compute_validation_flags(
     pm_drift_band: dict[str, float] | None = None,
     pm_tail: dict[str, Any] | None = None,
 ) -> dict[str, bool]:
-    """Compute validation flags for the
+    """Compute validation flags for the evaluation report including canonical gates."""
     tier = (tier or "balanced").lower()
     # Dev-only tiny relax: widen gates and lower floors when explicitly requested
     import os as _os
@@ -3613,7 +3614,7 @@ def _compute_validation_flags(
         if _tiny_relax and threshold_val < 0.10:
            threshold_val = 0.10
         if not math.isfinite(ratio_val):
-            # In dev/Compare-&-
+            # In dev/Compare-&-Evaluate flows we often lack a bare run; treat missing metric as pass
            guard_overhead_pass = True
         else:
            guard_overhead_pass = ratio_val <= (1.0 + max(0.0, threshold_val))
@@ -3769,7 +3770,7 @@ def _generate_run_id(report: RunReport) -> str:
     return hashlib.sha256(base_str.encode()).hexdigest()[:16]


-## NOTE:
+## NOTE: _compute_report_hash moved to invarlock.reporting.render and is re-exported below.


 def _analyze_bitwidth_map(bitwidth_map: dict[str, Any]) -> dict[str, Any]:
@@ -4114,22 +4115,24 @@ def _extract_compression_diagnostics(

 # Re-export rendering API from dedicated module to avoid bloat/cycles
 # Rendering helpers live in invarlock.reporting.render; internal code should import there directly.
-# Tests and public API expect
-# invarlock.reporting.
+# Tests and public API expect render_report_markdown to be available from
+# invarlock.reporting.report_builder. Import lazily at module end to avoid cycles with
 # invarlock.reporting.render which imports this module as a namespace.
 try:  # pragma: no cover - simple re-export
     from .render import (
         compute_console_validation_block,  # type: ignore
-
+        render_report_markdown,  # type: ignore
     )
 except Exception:  # pragma: no cover - defensive fallback

-    def
+    def render_report_markdown(evaluation_report: dict[str, Any]) -> str:  # type: ignore
         raise ImportError(
-            "
+            "render_report_markdown is unavailable; rendering dependencies missing"
         )

-    def compute_console_validation_block(
+    def compute_console_validation_block(
+        evaluation_report: dict[str, Any],
+    ) -> dict[str, Any]:  # type: ignore
         raise ImportError(
            "compute_console_validation_block is unavailable; rendering dependencies missing"
         )
@@ -4137,12 +4140,12 @@ except Exception: # pragma: no cover - defensive fallback

 # Export public API
 __all__ = [
-    "
-    "
+    "make_report",
+    "validate_report",
     "_validate_with_jsonschema",
     "jsonschema",
-    "
+    "render_report_markdown",
     "compute_console_validation_block",
-    "
-    "
+    "REPORT_SCHEMA_VERSION",
+    "REPORT_JSON_SCHEMA",
 ]
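
Taken together, the report_builder.py hunks above assemble and return a single evaluation_report dict. A rough, non-exhaustive sketch of its top-level keys, inferred from the assignments in the diff (placeholder values only, not actual output):

```python
# Illustrative shape only; keys come from the assignments shown above, values are placeholders.
evaluation_report = {
    "schema_version": "v1",          # REPORT_SCHEMA_VERSION
    "run_id": "a1b2c3d4e5f60718",    # 16-hex digest from _generate_run_id
    "meta": {},                      # seed bundle and env_flags
    "primary_metric": {},            # kind, final, display_ci, drift_band, ...
    "validation": {},                # boolean gates, e.g. primary_metric_acceptable
    "policy_digest": {},             # policy_version, tier_policy_name, thresholds_hash
    "dataset": {},                   # windows, hash, pairing stats
    "provenance": {},                # dataset_split, split_fallback, flags
    "telemetry": {},                 # summary_line, device, latency/throughput keys
    "confidence": {},                # label, basis, width, threshold
    # plus auto, quality_overhead, secondary_metrics, classification, system_overhead
}
```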
invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22

@@ -11,16 +11,16 @@ except Exception: # pragma: no cover
     jsonschema = None


-#
-
+# Evaluation report schema version (PM-first canonical)
+REPORT_SCHEMA_VERSION = "v1"


-# Minimal JSON Schema describing the canonical shape of
+# Minimal JSON Schema describing the canonical shape of an evaluation report.
 # This focuses on structural validity; numerical thresholds are validated
 # separately in metric-specific logic.
-
+REPORT_JSON_SCHEMA: dict[str, Any] = {
     "$schema": "https://json-schema.org/draft/2020-12/schema",
-    "title": "InvarLock Evaluation
+    "title": "InvarLock Evaluation Report",
     "type": "object",
     "required": [
         "schema_version",
@@ -32,7 +32,7 @@ CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
         "primary_metric",
     ],
     "properties": {
-        "schema_version": {"const":
+        "schema_version": {"const": REPORT_SCHEMA_VERSION},
         "run_id": {"type": "string", "minLength": 4},
         "edit_name": {"type": "string"},
         "policy_digest": {
@@ -179,21 +179,21 @@ def _load_validation_allowlist() -> set[str]:
     return set(_VALIDATION_ALLOWLIST_DEFAULT)


-def _validate_with_jsonschema(
-    """Validate
+def _validate_with_jsonschema(report: dict[str, Any]) -> bool:
+    """Validate evaluation report with JSON Schema when available."""
     if jsonschema is None:
         return True  # Schema library unavailable; fall back to minimal checks
     try:
-        jsonschema.validate(instance=
+        jsonschema.validate(instance=report, schema=REPORT_JSON_SCHEMA)
         return True
     except Exception:
         return False


-def
-    """Validate
+def validate_report(report: dict[str, Any]) -> bool:
+    """Validate evaluation report structure and essential flags."""
     try:
-        if
+        if report.get("schema_version") != REPORT_SCHEMA_VERSION:
            return False

         # Prefer JSON Schema structural validation; if unavailable or too strict,
@@ -202,20 +202,20 @@ def validate_certificate(certificate: dict[str, Any]) -> bool:
         # disallow unknown validation keys at schema level.
         try:
            vkeys = _load_validation_allowlist()
-            if isinstance(
-                vspec =
+            if isinstance(REPORT_JSON_SCHEMA.get("properties"), dict):
+                vspec = REPORT_JSON_SCHEMA["properties"].get("validation")
                if isinstance(vspec, dict):
                    vspec["properties"] = {k: {"type": "boolean"} for k in vkeys}
                    vspec["additionalProperties"] = False
         except Exception:
            pass

-        if not _validate_with_jsonschema(
+        if not _validate_with_jsonschema(report):
            # Minimal fallback: require schema version + run_id + primary_metric
-            run_id_ok = isinstance(
-
+            run_id_ok = isinstance(report.get("run_id"), str) and bool(
+                report.get("run_id")
            )
-            pm =
+            pm = report.get("primary_metric")
            pm_ok = isinstance(pm, dict) and (
                isinstance(pm.get("final"), int | float)
                or (isinstance(pm.get("kind"), str) and bool(pm.get("kind")))
@@ -223,7 +223,7 @@ def validate_certificate(certificate: dict[str, Any]) -> bool:
            if not (run_id_ok and pm_ok):
                return False

-        validation =
+        validation = report.get("validation", {})
         for flag in [
            "preview_final_drift_acceptable",
            "primary_metric_acceptable",
@@ -242,7 +242,7 @@ def validate_certificate(certificate: dict[str, Any]) -> bool:


 __all__ = [
-    "
-    "
-    "
+    "REPORT_SCHEMA_VERSION",
+    "REPORT_JSON_SCHEMA",
+    "validate_report",
 ]
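
For completeness, a hypothetical smoke test against the renamed schema module. The full REPORT_JSON_SCHEMA requires more properties than the hunks above show, so strict jsonschema validation may reject this minimal dict, in which case validate_report falls back to its lenient checks:

```python
# Assumed usage of the renamed schema module; field values are illustrative.
from invarlock.reporting.report_schema import (
    REPORT_JSON_SCHEMA,
    REPORT_SCHEMA_VERSION,
    validate_report,
)

candidate = {
    "schema_version": REPORT_SCHEMA_VERSION,  # any other value fails fast
    "run_id": "a1b2c3d4",                     # string with minLength 4 per the schema
    "primary_metric": {"kind": "ppl", "final": 12.3},
}

print("schema title:", REPORT_JSON_SCHEMA["title"])   # "InvarLock Evaluation Report"
print("structurally valid:", validate_report(candidate))
```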