invarlock-0.3.6-py3-none-any.whl → invarlock-0.3.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +4 -4
- invarlock/adapters/__init__.py +10 -14
- invarlock/adapters/auto.py +37 -50
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_loading.py +7 -7
- invarlock/adapters/hf_mixin.py +53 -9
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/assurance/__init__.py +15 -23
- invarlock/cli/adapter_auto.py +32 -26
- invarlock/cli/app.py +128 -27
- invarlock/cli/commands/__init__.py +2 -2
- invarlock/cli/commands/calibrate.py +48 -4
- invarlock/cli/commands/doctor.py +8 -10
- invarlock/cli/commands/evaluate.py +986 -0
- invarlock/cli/commands/explain_gates.py +25 -17
- invarlock/cli/commands/export_html.py +11 -9
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +326 -92
- invarlock/cli/commands/run.py +1160 -228
- invarlock/cli/commands/verify.py +157 -97
- invarlock/cli/config.py +1 -1
- invarlock/cli/determinism.py +1 -1
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +4 -4
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/registry.py +9 -11
- invarlock/core/retry.py +14 -14
- invarlock/core/runner.py +112 -26
- invarlock/edits/noop.py +2 -2
- invarlock/edits/quant_rtn.py +67 -39
- invarlock/eval/__init__.py +1 -1
- invarlock/eval/bench.py +14 -10
- invarlock/eval/data.py +68 -23
- invarlock/eval/metrics.py +59 -1
- invarlock/eval/primary_metric.py +1 -1
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/rmt.py +2 -2
- invarlock/guards/spectral.py +1 -1
- invarlock/guards/variance.py +2 -2
- invarlock/model_profile.py +64 -62
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/plugins/hf_bnb_adapter.py +32 -21
- invarlock/reporting/__init__.py +18 -4
- invarlock/reporting/guards_analysis.py +154 -4
- invarlock/reporting/html.py +61 -11
- invarlock/reporting/normalizer.py +9 -2
- invarlock/reporting/policy_utils.py +1 -1
- invarlock/reporting/primary_metric_utils.py +11 -11
- invarlock/reporting/render.py +876 -510
- invarlock/reporting/report.py +72 -30
- invarlock/reporting/{certificate.py → report_builder.py} +252 -99
- invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
- invarlock/reporting/report_types.py +6 -1
- invarlock/reporting/telemetry.py +86 -0
- invarlock-0.3.8.dist-info/METADATA +283 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/RECORD +69 -64
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/WHEEL +1 -1
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- invarlock/cli/commands/certify.py +0 -422
- invarlock-0.3.6.dist-info/METADATA +0 -588
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/top_level.txt +0 -0
invarlock/cli/commands/verify.py
CHANGED

@@ -2,7 +2,7 @@
 invarlock verify command
 ====================

-Validates generated
+Validates generated evaluation reports for internal consistency. The command
 ensures schema compliance, checks that the primary metric ratio agrees with the
 baseline reference, and enforces paired-window guarantees (match=1.0,
 overlap=0.0).
@@ -26,7 +26,7 @@ from invarlock.core.exceptions import (
 from invarlock.core.exceptions import (
     ValidationError as _ValidationError,
 )
-from invarlock.reporting.
+from invarlock.reporting.report_builder import validate_report

 from .._json import emit as _emit_json
 from .._json import encode_error as _encode_error
@@ -52,26 +52,42 @@ def _coerce_int(value: Any) -> int | None:
     return out if out >= 0 else None


-def
-    """Load
+def _load_evaluation_report(path: Path) -> dict[str, Any]:
+    """Load an evaluation report JSON from disk."""
     with path.open("r", encoding="utf-8") as handle:
         return json.load(handle)


-def _validate_primary_metric(
+def _validate_primary_metric(report: dict[str, Any]) -> list[str]:
     """Validate primary metric ratio consistency with baseline reference."""
     errors: list[str] = []
-    pm =
+    pm = report.get("primary_metric", {}) or {}
     if not isinstance(pm, dict) or not pm:
-        errors.append("
+        errors.append("report missing primary_metric block.")
         return errors

+    def _is_finite_number(value: Any) -> bool:
+        return isinstance(value, (int, float)) and math.isfinite(float(value))
+
+    def _declares_invalid_primary_metric(metric: dict[str, Any]) -> bool:
+        if bool(metric.get("invalid")):
+            return True
+        reason = metric.get("degraded_reason")
+        if isinstance(reason, str):
+            r = reason.strip().lower()
+            return r.startswith("non_finite") or r in {
+                "primary_metric_invalid",
+                "evaluation_error",
+            }
+        return False
+
     kind = str(pm.get("kind", "")).lower()
     ratio_vs_baseline = pm.get("ratio_vs_baseline")
     final = pm.get("final")
+    pm_invalid = _declares_invalid_primary_metric(pm)

     if kind.startswith("ppl"):
-        baseline_ref =
+        baseline_ref = report.get("baseline_ref", {}) or {}
         baseline_pm = (
             baseline_ref.get("primary_metric")
             if isinstance(baseline_ref, dict)
@@ -82,18 +98,16 @@ def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
         bv = baseline_pm.get("final")
         if isinstance(bv, (int | float)):
             baseline_final = float(bv)
-        if
-            if baseline_final <= 0.0:
+        if _is_finite_number(final) and _is_finite_number(baseline_final):
+            if float(baseline_final) <= 0.0:
                 errors.append(
                     f"Baseline final must be > 0.0 to compute ratio (found {baseline_final})."
                 )
             else:
                 expected_ratio = float(final) / float(baseline_final)
-                if not
-                    float(ratio_vs_baseline)
-                ):
+                if not _is_finite_number(ratio_vs_baseline):
                     errors.append(
-                        "
+                        "report is missing a finite primary_metric.ratio_vs_baseline value."
                     )
                 elif not math.isclose(
                     float(ratio_vs_baseline), expected_ratio, rel_tol=1e-6, abs_tol=1e-6
@@ -102,19 +116,30 @@ def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
                         "Primary metric ratio mismatch: "
                         f"recorded={float(ratio_vs_baseline):.12f}, expected={expected_ratio:.12f}"
                     )
+        else:
+            # If the primary metric is non-finite, it must be explicitly marked invalid.
+            # This is expected for structural error-injection runs (NaN/Inf weights).
+            if (isinstance(final, (int | float)) and not _is_finite_number(final)) and (
+                not pm_invalid
+            ):
+                errors.append(
+                    "Primary metric final is non-finite but primary_metric.invalid is not set."
+                )
     else:
+        if pm_invalid:
+            return errors
         if ratio_vs_baseline is None or not isinstance(ratio_vs_baseline, int | float):
             errors.append(
-                "
+                "report missing primary_metric.ratio_vs_baseline for non-ppl metric."
             )

     return errors


-def _validate_pairing(
+def _validate_pairing(report: dict[str, Any]) -> list[str]:
     """Validate window pairing metrics (PM-only location)."""
     errors: list[str] = []
-    stats =
+    stats = report.get("dataset", {}).get("windows", {}).get("stats", {})

     match_fraction = stats.get("window_match_fraction")
     overlap_fraction = stats.get("window_overlap_fraction")
@@ -123,23 +148,23 @@ def _validate_pairing(certificate: dict[str, Any]) -> list[str]:

     if pairing_reason is not None:
         errors.append(
-            "window_pairing_reason must be null/None for paired
+            "window_pairing_reason must be null/None for paired reports "
             f"(found {pairing_reason!r})."
         )
     if paired_windows is None:
-        errors.append("
+        errors.append("report missing paired_windows metric.")
     elif paired_windows == 0:
-        errors.append("paired_windows must be > 0 for paired
+        errors.append("paired_windows must be > 0 for paired reports (found 0).")

     if match_fraction is None:
-        errors.append("
+        errors.append("report missing window_match_fraction metric.")
     elif match_fraction < 0.999999:
         errors.append(
             f"window_match_fraction must be 1.0 for paired runs (found {match_fraction:.6f})."
         )

     if overlap_fraction is None:
-        errors.append("
+        errors.append("report missing window_overlap_fraction metric.")
     elif overlap_fraction > 1e-9:
         errors.append(
             f"window_overlap_fraction must be 0.0 (found {overlap_fraction:.6f})."
@@ -148,10 +173,10 @@ def _validate_pairing(certificate: dict[str, Any]) -> list[str]:
     return errors


-def _validate_counts(
+def _validate_counts(report: dict[str, Any]) -> list[str]:
     """Validate preview/final window counts align with dataset configuration."""
     errors: list[str] = []
-    dataset =
+    dataset = report.get("dataset", {})
     dataset_windows = dataset.get("windows", {})
     expected_preview = dataset_windows.get("preview")
     expected_final = dataset_windows.get("final")
@@ -165,9 +190,7 @@ def _validate_counts(certificate: dict[str, Any]) -> list[str]:

     if expected_preview is not None:
         if preview_used is None:
-            errors.append(
-                "Certificate missing coverage.preview.used for preview windows."
-            )
+            errors.append("report missing coverage.preview.used for preview windows.")
         elif int(preview_used) != int(expected_preview):
             errors.append(
                 f"Preview window count mismatch: expected {expected_preview}, observed {preview_used}."
@@ -175,7 +198,7 @@ def _validate_counts(certificate: dict[str, Any]) -> list[str]:

     if expected_final is not None:
         if final_used is None:
-            errors.append("
+            errors.append("report missing coverage.final.used for final windows.")
         elif int(final_used) != int(expected_final):
             errors.append(
                 f"Final window count mismatch: expected {expected_final}, observed {final_used}."
@@ -193,40 +216,79 @@ def _validate_counts(certificate: dict[str, Any]) -> list[str]:
     return errors


-def _validate_drift_band(
-    """Validate preview→final drift stays within the configured band
+def _validate_drift_band(report: dict[str, Any]) -> list[str]:
+    """Validate preview→final drift stays within the configured band.
+
+    Defaults to 0.95–1.05 unless the report provides `primary_metric.drift_band`.
+    """
     errors: list[str] = []
-    pm =
+    pm = report.get("primary_metric", {}) or {}
+    if not isinstance(pm, dict) or not pm:
+        errors.append("report missing primary_metric block.")
+        return errors
+    if bool(pm.get("invalid")):
+        # Drift is undefined when the primary metric is invalid (e.g., NaN/Inf weights).
+        return errors
     drift_ratio = None
     try:
         prev = pm.get("preview")
         fin = pm.get("final")
-        if
+        if (
+            isinstance(prev, int | float)
+            and isinstance(fin, int | float)
+            and math.isfinite(float(prev))
+            and math.isfinite(float(fin))
+            and prev > 0
+        ):
             drift_ratio = float(fin) / float(prev)
     except Exception:
         drift_ratio = None

     if not isinstance(drift_ratio, int | float):
-        errors.append("
+        errors.append("report missing preview/final to compute drift ratio.")
         return errors

-
+    drift_min = 0.95
+    drift_max = 1.05
+    band = pm.get("drift_band")
+    try:
+        if isinstance(band, dict):
+            lo = band.get("min")
+            hi = band.get("max")
+            if isinstance(lo, int | float) and isinstance(hi, int | float):
+                lo_f = float(lo)
+                hi_f = float(hi)
+                if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
+                    drift_min = lo_f
+                    drift_max = hi_f
+        elif isinstance(band, list | tuple) and len(band) == 2:
+            lo_raw, hi_raw = band[0], band[1]
+            if isinstance(lo_raw, int | float) and isinstance(hi_raw, int | float):
+                lo_f = float(lo_raw)
+                hi_f = float(hi_raw)
+                if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
+                    drift_min = lo_f
+                    drift_max = hi_f
+    except Exception:
+        pass
+
+    if not drift_min <= float(drift_ratio) <= drift_max:
         errors.append(
-            f"Preview→final drift ratio out of band (
+            f"Preview→final drift ratio out of band ({drift_min:.2f}–{drift_max:.2f}): observed {drift_ratio:.6f}."
         )

     return errors


-def _validate_tokenizer_hash(
+def _validate_tokenizer_hash(report: dict[str, Any]) -> list[str]:
     """Validate tokenizer hash consistency between baseline and edited runs.

     The check is enforced only when both hashes are present. When present and
     different, the verification fails.
     """
     errors: list[str] = []
-    meta =
-    dataset =
+    meta = report.get("meta", {}) or {}
+    dataset = report.get("dataset", {}) or {}
     edited_hash = None
     try:
         # Prefer meta.tokenizer_hash; fall back to dataset.tokenizer.hash
@@ -238,7 +300,7 @@ def _validate_tokenizer_hash(certificate: dict[str, Any]) -> list[str]:
     except Exception:
         edited_hash = None

-    baseline_ref =
+    baseline_ref = report.get("baseline_ref", {}) or {}
     baseline_hash = baseline_ref.get("tokenizer_hash")

     if isinstance(edited_hash, str) and isinstance(baseline_hash, str):
@@ -270,15 +332,15 @@ def _measurement_contract_digest(contract: Any) -> str | None:


 def _validate_measurement_contracts(
-
+    report: dict[str, Any], *, profile: str
 ) -> list[str]:
     """Enforce measurement-contract presence and baseline pairing for guards."""
     errors: list[str] = []
     prof = (profile or "").strip().lower()
-    resolved_policy =
+    resolved_policy = report.get("resolved_policy") or {}

     for guard_key in ("spectral", "rmt"):
-        block =
+        block = report.get(guard_key) or {}
         if not isinstance(block, dict):
             continue
         evaluated = bool(block.get("evaluated", True))
@@ -289,14 +351,14 @@ def _validate_measurement_contracts(
         mc_hash = _measurement_contract_digest(mc)
         expected_hash = block.get("measurement_contract_hash")
         if not isinstance(mc, dict) or not mc:
-            errors.append(f"
+            errors.append(f"report missing {guard_key}.measurement_contract.")
         elif isinstance(expected_hash, str) and expected_hash:
             if mc_hash and mc_hash != expected_hash:
                 errors.append(
                     f"{guard_key}.measurement_contract_hash mismatch: expected={expected_hash}, computed={mc_hash}."
                 )
         else:
-            errors.append(f"
+            errors.append(f"report missing {guard_key}.measurement_contract_hash.")

         rp_guard = (
             resolved_policy.get(guard_key)
@@ -309,7 +371,7 @@ def _validate_measurement_contracts(
         rp_hash = _measurement_contract_digest(rp_mc)
         if not isinstance(rp_mc, dict) or not rp_mc:
             errors.append(
-                f"
+                f"report missing resolved_policy.{guard_key}.measurement_contract."
             )
         elif mc_hash and rp_hash and mc_hash != rp_hash:
             errors.append(
@@ -327,10 +389,10 @@ def _validate_measurement_contracts(
     return errors


-def _apply_profile_lints(
-    """Apply model-profile specific lint rules embedded in the
+def _apply_profile_lints(report: dict[str, Any]) -> list[str]:
+    """Apply model-profile specific lint rules embedded in the report."""
     errors: list[str] = []
-    meta =
+    meta = report.get("meta", {})
     profile = meta.get("model_profile") if isinstance(meta, dict) else None
     if not isinstance(profile, dict):
         return errors
@@ -346,7 +408,7 @@ def _apply_profile_lints(certificate: dict[str, Any]) -> list[str]:
         path = lint.get("path")
         expected = lint.get("value")
         message = lint.get("message") or "Model profile lint failed."
-        actual = _resolve_path(
+        actual = _resolve_path(report, path) if isinstance(path, str) else None

         if lint_type == "equals":
             if actual != expected:
@@ -383,21 +445,21 @@ def _apply_profile_lints(certificate: dict[str, Any]) -> list[str]:
     return errors


-def
+def _validate_evaluation_report_payload(
     path: Path, *, profile: str | None = None
 ) -> list[str]:
-    """Run all verification checks for a single
+    """Run all verification checks for a single evaluation report."""
     errors: list[str] = []
-
+    report = _load_evaluation_report(path)

     # Always surface schema validation failures for this payload
-    if not
-        errors.append("
+    if not validate_report(report):
+        errors.append("report schema validation failed.")
         return errors

-    errors.extend(_validate_primary_metric(
-    errors.extend(_validate_pairing(
-    errors.extend(_validate_counts(
+    errors.extend(_validate_primary_metric(report))
+    errors.extend(_validate_pairing(report))
+    errors.extend(_validate_counts(report))
     try:
         prof = (
             (profile or "").strip().lower()
@@ -406,24 +468,25 @@ def _validate_certificate_payload(
         )
     except Exception:
         prof = "dev"
-    #
+    # Drift band is a CI/Release enforcement check; dev profile should not
+    # fail verification due to preview→final drift.
     if prof in {"ci", "release"}:
-        errors.extend(_validate_drift_band(
-    errors.extend(_apply_profile_lints(
-    errors.extend(_validate_tokenizer_hash(
+        errors.extend(_validate_drift_band(report))
+    errors.extend(_apply_profile_lints(report))
+    errors.extend(_validate_tokenizer_hash(report))
     if prof in {"ci", "release"}:
-        errors.extend(_validate_measurement_contracts(
+        errors.extend(_validate_measurement_contracts(report, profile=prof))

     # strict/fast assurance mode checks were removed; verification gates rely on
     # structural schema + guard metric contracts instead.

     # Release-only enforcement: guard overhead must be measured or explicitly skipped.
     if prof == "release":
-        go =
+        go = report.get("guard_overhead")
         if not isinstance(go, dict) or not go:
             errors.append(
                 "Release verification requires guard_overhead (missing). "
-                "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during
+                "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during evaluation."
             )
         else:
             skipped = bool(go.get("skipped", False)) or (
@@ -434,7 +497,7 @@ def _validate_certificate_payload(
             if evaluated is not True:
                 errors.append(
                     "Release verification requires evaluated guard_overhead (not evaluated). "
-                    "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during
+                    "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during evaluation."
                 )
             ratio = go.get("overhead_ratio")
             if ratio is None:
@@ -446,14 +509,14 @@ def _validate_certificate_payload(
     return errors


-def _warn_adapter_family_mismatch(cert_path: Path,
+def _warn_adapter_family_mismatch(cert_path: Path, report: dict[str, Any]) -> None:
     """Emit a soft warning if adapter families differ between baseline and edited.

     This is a non-fatal hint to catch inadvertent cross-family comparisons.
-    Tries to load the baseline report referenced in the
+    Tries to load the baseline report referenced in the report provenance.
     """
     try:
-        plugins =
+        plugins = report.get("plugins") or {}
         adapter_meta = plugins.get("adapter") if isinstance(plugins, dict) else None
         edited_family = None
         edited_lib = None
@@ -466,8 +529,8 @@ def _warn_adapter_family_mismatch(cert_path: Path, certificate: dict[str, Any])
         edited_ver = prov.get("version") or None

         baseline_prov = (
-
-            if isinstance(
+            report.get("provenance")
+            if isinstance(report.get("provenance"), dict)
             else {}
         )
         baseline_report_path = None
@@ -517,7 +580,7 @@ def _warn_adapter_family_mismatch(cert_path: Path, certificate: dict[str, Any])
             f"[yellow] • edited : family={edited_family}, backend={edited_backend} {edited_version}[/yellow]"
         )
         console.print(
-            "[yellow] Ensure this cross-family comparison is intentional (Compare &
+            "[yellow] Ensure this cross-family comparison is intentional (Compare & Evaluate flows should normally match families).[/yellow]"
         )
     except Exception:
         # Non-fatal and best-effort; suppress errors
@@ -525,18 +588,18 @@ def _warn_adapter_family_mismatch(cert_path: Path, certificate: dict[str, Any])


 def verify_command(
-
+    reports: list[Path] = typer.Argument(
         ...,
         exists=True,
         dir_okay=False,
         readable=True,
         resolve_path=True,
-        help="One or more
+        help="One or more evaluation report JSON files to verify.",
     ),
     baseline: Path | None = typer.Option(
         None,
         "--baseline",
-        help="Optional baseline
+        help="Optional baseline evaluation report (or run report) JSON to enforce provider parity.",
     ),
     tolerance: float = typer.Option(
         1e-9,
@@ -555,9 +618,9 @@ def verify_command(
     ),
 ) -> None:
     """
-    Verify
+    Verify evaluation report integrity.

-    Ensures each
+    Ensures each evaluation report passes schema validation, ratio consistency checks,
     and strict pairing requirements (match=1.0, overlap=0.0).
     """

@@ -573,7 +636,7 @@ def verify_command(
     try:
         if baseline is not None:
             bdata = json.loads(baseline.read_text(encoding="utf-8"))
-            # Accept either
+            # Accept either an evaluation report or a run report (report.json); look under provenance when present.
             prov = bdata.get("provenance") if isinstance(bdata, dict) else None
             if isinstance(prov, dict):
                 pd = prov.get("provider_digest")
@@ -585,8 +648,8 @@ def verify_command(

     malformed_any = False
     try:
-        for cert_path in
-            cert_obj =
+        for cert_path in reports:
+            cert_obj = _load_evaluation_report(cert_path)

             # Enforce provider digest presence in CI/Release profiles
             try:
@@ -614,24 +677,21 @@ def verify_command(
                 )

             # Structural checks
-            errors =
+            errors = _validate_evaluation_report_payload(cert_path, profile=profile)
             # JSON path: emit a typed ValidationError for schema failures to include error.code
             if json_out and any(
                 "schema validation failed" in str(e).lower() for e in errors
             ):
                 raise _ValidationError(
                     code="E601",
-                    message="
+                    message="REPORT-SCHEMA-INVALID: schema validation failed",
                     details={"path": str(cert_path)},
                 )
             # Determine malformed vs policy-fail for this cert
             is_malformed = any(
                 ("schema validation failed" in e.lower())
                 or ("missing primary_metric.ratio_vs_baseline" in e)
-                or (
-                    "Certificate is missing a finite primary_metric.ratio_vs_baseline"
-                    in e
-                )
+                or ("report is missing a finite primary_metric.ratio_vs_baseline" in e)
                 for e in errors
             )
             malformed_any = malformed_any or is_malformed
@@ -748,7 +808,7 @@ def verify_command(
                 )
                 raise _MetricsError(
                     code="E602",
-                    message="RECOMPUTE-MISMATCH:
+                    message="RECOMPUTE-MISMATCH: report values disagree with recomputation",
                     details={"example": str(first)},
                 )

@@ -770,11 +830,11 @@ def verify_command(
     if not overall_ok:
         code = 2 if malformed_any else 1
         if json_out:
-            # Build per-
+            # Build per-report results payload
             results: list[dict[str, Any]] = []
-            for cert_path in
+            for cert_path in reports:
                 try:
-                    cert_obj =
+                    cert_obj = _load_evaluation_report(cert_path)
                 except Exception:
                     cert_obj = {}
                 pm = (
@@ -915,7 +975,7 @@ def verify_command(
                     "ok": False,
                     "reason": "malformed" if malformed_any else "policy_fail",
                 },
-                "
+                "evaluation_report": {"count": len(reports)},
                 "results": results,
                 "resolution": {"exit_code": code},
             }
@@ -924,11 +984,11 @@ def verify_command(

     # Success emission
     if json_out:
-        # Build per-
+        # Build per-report success results payload
         results: list[dict[str, Any]] = []
-        for cert_path in
+        for cert_path in reports:
             try:
-                cert_obj =
+                cert_obj = _load_evaluation_report(cert_path)
             except Exception:
                 cert_obj = {}
             pm = (
@@ -1057,7 +1117,7 @@ def verify_command(
         payload = {
             "format_version": FORMAT_VERIFY,
             "summary": {"ok": True, "reason": "ok"},
-            "
+            "evaluation_report": {"count": len(reports)},
             "results": results,
             "resolution": {"exit_code": 0},
         }
@@ -1065,7 +1125,7 @@ def verify_command(
     else:
         # Human-friendly success line
        try:
-            last =
+            last = _load_evaluation_report(reports[-1]) if reports else {}
             pm = last.get("primary_metric", {}) if isinstance(last, dict) else {}
             kind = str(pm.get("kind") or "").strip()
             ppl = last.get("ppl", {}) if isinstance(last, dict) else {}
@@ -1116,7 +1176,7 @@ def verify_command(
             "summary": {"ok": False, "reason": reason},
             "results": [
                 {
-                    "id": str(
+                    "id": str(reports[0]) if reports else "",
                     "schema_version": "v1",
                     "kind": "",
                     "ok": False,
@@ -1148,7 +1208,7 @@ def verify_command(
             "summary": {"ok": False, "reason": reason},
             "results": [
                 {
-                    "id": str(
+                    "id": str(reports[0]) if reports else "",
                     "schema_version": "v1",
                     "kind": "",
                     "ok": False,
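Taken together, the reworked verify command recomputes the primary-metric ratio from the baseline reference, enforces the paired-window guarantees, and (in CI/Release profiles) bounds preview-to-final drift. A minimal sketch of those recomputations against a hand-built report fragment; the numbers, the exact "kind" string, and the placement of paired_windows under stats are illustrative assumptions rather than the package's documented schema:

import math

# Hypothetical evaluation-report fragment; field names follow the accessors
# visible in the diff above (primary_metric.*, baseline_ref.primary_metric.final,
# dataset.windows.stats.*). Values are made up for illustration.
report = {
    "primary_metric": {
        "kind": "ppl",
        "preview": 12.30,
        "final": 12.36,
        "ratio_vs_baseline": 1.03,
    },
    "baseline_ref": {"primary_metric": {"final": 12.00}},
    "dataset": {
        "windows": {
            "stats": {
                "window_match_fraction": 1.0,
                "window_overlap_fraction": 0.0,
                "paired_windows": 256,  # placement under stats is an assumption
            }
        }
    },
}

pm = report["primary_metric"]
stats = report["dataset"]["windows"]["stats"]

# Ratio consistency: recorded ratio must match final / baseline_final to ~1e-6.
expected = pm["final"] / report["baseline_ref"]["primary_metric"]["final"]
assert math.isclose(pm["ratio_vs_baseline"], expected, rel_tol=1e-6, abs_tol=1e-6)

# Pairing guarantees: match == 1.0, overlap == 0.0, at least one paired window.
assert stats["window_match_fraction"] >= 0.999999
assert stats["window_overlap_fraction"] <= 1e-9
assert stats["paired_windows"] > 0

# Drift band (CI/Release profiles): preview-to-final ratio inside 0.95-1.05
# unless the report overrides primary_metric.drift_band.
drift = pm["final"] / pm["preview"]
assert 0.95 <= drift <= 1.05
print("fragment passes the recomputed checks")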
invarlock/cli/config.py
CHANGED

@@ -415,7 +415,7 @@ def _deep_merge_dicts(a: dict, b: dict) -> dict: # pragma: no cover - trivial a

 def create_example_config() -> InvarLockConfig: # pragma: no cover - test helper
     return InvarLockConfig(
-        model={"id": "gpt2", "adapter": "
+        model={"id": "gpt2", "adapter": "hf_causal", "device": "auto"},
         edit={"name": "quant_rtn", "plan": {}},
         dataset={"provider": "wikitext2", "seq_len": 512, "stride": 512},
         output={"dir": "runs"},
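The example config now points the model at the consolidated hf_causal adapter (the per-family hf_gpt2.py and hf_llama.py adapter modules are removed in this release, per the file list above). A rough sketch of an equivalent user-side config dict, mirroring only the keys shown in the diff; any additional fields a real config requires are not shown:

# Hypothetical config dict mirroring create_example_config() from the diff.
# Only the keys visible there are used; real configs may need more fields.
example_config = {
    "model": {"id": "gpt2", "adapter": "hf_causal", "device": "auto"},
    "edit": {"name": "quant_rtn", "plan": {}},
    "dataset": {"provider": "wikitext2", "seq_len": 512, "stride": 512},
    "output": {"dir": "runs"},
}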
invarlock/cli/determinism.py
CHANGED

@@ -5,7 +5,7 @@ Centralizes:
 - Thread caps (OMP/MKL/etc + torch threads)
 - TF32 policy
 - torch deterministic algorithms
-- A structured "determinism level" for
+- A structured "determinism level" for evaluation report provenance
 """

 from __future__ import annotations