invarlock 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- invarlock/__init__.py +3 -3
- invarlock/adapters/auto.py +2 -10
- invarlock/adapters/hf_loading.py +7 -7
- invarlock/adapters/hf_mixin.py +28 -5
- invarlock/assurance/__init__.py +15 -23
- invarlock/calibration/spectral_null.py +1 -1
- invarlock/cli/adapter_auto.py +1 -5
- invarlock/cli/app.py +57 -27
- invarlock/cli/commands/__init__.py +2 -2
- invarlock/cli/commands/calibrate.py +48 -4
- invarlock/cli/commands/{certify.py → evaluate.py} +69 -46
- invarlock/cli/commands/explain_gates.py +94 -51
- invarlock/cli/commands/export_html.py +11 -9
- invarlock/cli/commands/report.py +121 -47
- invarlock/cli/commands/run.py +274 -66
- invarlock/cli/commands/verify.py +84 -89
- invarlock/cli/determinism.py +1 -1
- invarlock/cli/provenance.py +3 -3
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/retry.py +14 -14
- invarlock/core/runner.py +1 -1
- invarlock/edits/noop.py +2 -2
- invarlock/edits/quant_rtn.py +2 -2
- invarlock/eval/__init__.py +1 -1
- invarlock/eval/bench.py +11 -7
- invarlock/eval/primary_metric.py +1 -1
- invarlock/guards/spectral.py +2 -2
- invarlock/guards_ref/spectral_ref.py +1 -1
- invarlock/model_profile.py +16 -35
- invarlock/observability/health.py +38 -20
- invarlock/plugins/hf_bnb_adapter.py +32 -21
- invarlock/reporting/__init__.py +18 -4
- invarlock/reporting/html.py +7 -7
- invarlock/reporting/normalizer.py +2 -2
- invarlock/reporting/policy_utils.py +1 -1
- invarlock/reporting/primary_metric_utils.py +11 -11
- invarlock/reporting/render.py +126 -120
- invarlock/reporting/report.py +43 -37
- invarlock/reporting/{certificate.py → report_builder.py} +103 -99
- invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
- invarlock-0.3.9.dist-info/METADATA +303 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/RECORD +46 -46
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/WHEEL +1 -1
- invarlock-0.3.7.dist-info/METADATA +0 -602
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/top_level.txt +0 -0
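Note: the renames in this listing (certify.py → evaluate.py, certificate.py → report_builder.py, certificate_schema.py → report_schema.py) carry the main theme of this release: "certificate" terminology becomes "evaluation report" throughout the CLI and reporting modules. A minimal sketch of the import change downstream code would see, based on the new import visible in the verify.py diff below; the pre-0.3.9 symbol name is truncated in this diff and is only assumed here:

    # 0.3.7 (module path per the rename above; the exact old symbol name is assumed)
    # from invarlock.reporting.certificate import validate_certificate
    # 0.3.9 (import as it appears in invarlock/cli/commands/verify.py)
    from invarlock.reporting.report_builder import validate_report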
invarlock/cli/commands/verify.py
CHANGED
@@ -2,7 +2,7 @@
 invarlock verify command
 ====================

-Validates generated evaluation
+Validates generated evaluation reports for internal consistency. The command
 ensures schema compliance, checks that the primary metric ratio agrees with the
 baseline reference, and enforces paired-window guarantees (match=1.0,
 overlap=0.0).
@@ -26,7 +26,7 @@ from invarlock.core.exceptions import (
 from invarlock.core.exceptions import (
     ValidationError as _ValidationError,
 )
-from invarlock.reporting.
+from invarlock.reporting.report_builder import validate_report

 from .._json import emit as _emit_json
 from .._json import encode_error as _encode_error
@@ -52,18 +52,18 @@ def _coerce_int(value: Any) -> int | None:
     return out if out >= 0 else None


-def
-    """Load
+def _load_evaluation_report(path: Path) -> dict[str, Any]:
+    """Load an evaluation report JSON from disk."""
     with path.open("r", encoding="utf-8") as handle:
         return json.load(handle)


-def _validate_primary_metric(
+def _validate_primary_metric(report: dict[str, Any]) -> list[str]:
     """Validate primary metric ratio consistency with baseline reference."""
     errors: list[str] = []
-    pm =
+    pm = report.get("primary_metric", {}) or {}
     if not isinstance(pm, dict) or not pm:
-        errors.append("
+        errors.append("report missing primary_metric block.")
         return errors

     def _is_finite_number(value: Any) -> bool:
@@ -87,7 +87,7 @@ def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
     pm_invalid = _declares_invalid_primary_metric(pm)

     if kind.startswith("ppl"):
-        baseline_ref =
+        baseline_ref = report.get("baseline_ref", {}) or {}
         baseline_pm = (
             baseline_ref.get("primary_metric")
             if isinstance(baseline_ref, dict)
@@ -107,7 +107,7 @@ def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
         expected_ratio = float(final) / float(baseline_final)
         if not _is_finite_number(ratio_vs_baseline):
             errors.append(
-                "
+                "report is missing a finite primary_metric.ratio_vs_baseline value."
             )
         elif not math.isclose(
             float(ratio_vs_baseline), expected_ratio, rel_tol=1e-6, abs_tol=1e-6
@@ -130,16 +130,16 @@ def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
         return errors
     if ratio_vs_baseline is None or not isinstance(ratio_vs_baseline, int | float):
         errors.append(
-            "
+            "report missing primary_metric.ratio_vs_baseline for non-ppl metric."
         )

     return errors


-def _validate_pairing(
+def _validate_pairing(report: dict[str, Any]) -> list[str]:
     """Validate window pairing metrics (PM-only location)."""
     errors: list[str] = []
-    stats =
+    stats = report.get("dataset", {}).get("windows", {}).get("stats", {})

     match_fraction = stats.get("window_match_fraction")
     overlap_fraction = stats.get("window_overlap_fraction")
@@ -148,23 +148,23 @@ def _validate_pairing(certificate: dict[str, Any]) -> list[str]:

     if pairing_reason is not None:
         errors.append(
-            "window_pairing_reason must be null/None for paired
+            "window_pairing_reason must be null/None for paired reports "
             f"(found {pairing_reason!r})."
         )
     if paired_windows is None:
-        errors.append("
+        errors.append("report missing paired_windows metric.")
     elif paired_windows == 0:
-        errors.append("paired_windows must be > 0 for paired
+        errors.append("paired_windows must be > 0 for paired reports (found 0).")

     if match_fraction is None:
-        errors.append("
+        errors.append("report missing window_match_fraction metric.")
     elif match_fraction < 0.999999:
         errors.append(
             f"window_match_fraction must be 1.0 for paired runs (found {match_fraction:.6f})."
         )

     if overlap_fraction is None:
-        errors.append("
+        errors.append("report missing window_overlap_fraction metric.")
     elif overlap_fraction > 1e-9:
         errors.append(
             f"window_overlap_fraction must be 0.0 (found {overlap_fraction:.6f})."
@@ -173,10 +173,10 @@ def _validate_pairing(certificate: dict[str, Any]) -> list[str]:
     return errors


-def _validate_counts(
+def _validate_counts(report: dict[str, Any]) -> list[str]:
     """Validate preview/final window counts align with dataset configuration."""
     errors: list[str] = []
-    dataset =
+    dataset = report.get("dataset", {})
     dataset_windows = dataset.get("windows", {})
     expected_preview = dataset_windows.get("preview")
     expected_final = dataset_windows.get("final")
@@ -190,9 +190,7 @@ def _validate_counts(certificate: dict[str, Any]) -> list[str]:

     if expected_preview is not None:
         if preview_used is None:
-            errors.append(
-                "Certificate missing coverage.preview.used for preview windows."
-            )
+            errors.append("report missing coverage.preview.used for preview windows.")
         elif int(preview_used) != int(expected_preview):
             errors.append(
                 f"Preview window count mismatch: expected {expected_preview}, observed {preview_used}."
@@ -200,7 +198,7 @@ def _validate_counts(certificate: dict[str, Any]) -> list[str]:

     if expected_final is not None:
         if final_used is None:
-            errors.append("
+            errors.append("report missing coverage.final.used for final windows.")
         elif int(final_used) != int(expected_final):
             errors.append(
                 f"Final window count mismatch: expected {expected_final}, observed {final_used}."
@@ -218,15 +216,15 @@ def _validate_counts(certificate: dict[str, Any]) -> list[str]:
     return errors


-def _validate_drift_band(
+def _validate_drift_band(report: dict[str, Any]) -> list[str]:
     """Validate preview→final drift stays within the configured band.

-    Defaults to 0.95–1.05 unless the
+    Defaults to 0.95–1.05 unless the report provides `primary_metric.drift_band`.
     """
     errors: list[str] = []
-    pm =
+    pm = report.get("primary_metric", {}) or {}
     if not isinstance(pm, dict) or not pm:
-        errors.append("
+        errors.append("report missing primary_metric block.")
         return errors
     if bool(pm.get("invalid")):
         # Drift is undefined when the primary metric is invalid (e.g., NaN/Inf weights).
@@ -247,7 +245,7 @@ def _validate_drift_band(certificate: dict[str, Any]) -> list[str]:
         drift_ratio = None

     if not isinstance(drift_ratio, int | float):
-        errors.append("
+        errors.append("report missing preview/final to compute drift ratio.")
         return errors

     drift_min = 0.95
@@ -282,15 +280,15 @@ def _validate_drift_band(certificate: dict[str, Any]) -> list[str]:
     return errors


-def _validate_tokenizer_hash(
+def _validate_tokenizer_hash(report: dict[str, Any]) -> list[str]:
     """Validate tokenizer hash consistency between baseline and edited runs.

     The check is enforced only when both hashes are present. When present and
     different, the verification fails.
     """
     errors: list[str] = []
-    meta =
-    dataset =
+    meta = report.get("meta", {}) or {}
+    dataset = report.get("dataset", {}) or {}
     edited_hash = None
     try:
         # Prefer meta.tokenizer_hash; fall back to dataset.tokenizer.hash
@@ -302,7 +300,7 @@ def _validate_tokenizer_hash(certificate: dict[str, Any]) -> list[str]:
     except Exception:
         edited_hash = None

-    baseline_ref =
+    baseline_ref = report.get("baseline_ref", {}) or {}
     baseline_hash = baseline_ref.get("tokenizer_hash")

     if isinstance(edited_hash, str) and isinstance(baseline_hash, str):
@@ -334,15 +332,15 @@ def _measurement_contract_digest(contract: Any) -> str | None:


 def _validate_measurement_contracts(
-
+    report: dict[str, Any], *, profile: str
 ) -> list[str]:
     """Enforce measurement-contract presence and baseline pairing for guards."""
     errors: list[str] = []
     prof = (profile or "").strip().lower()
-    resolved_policy =
+    resolved_policy = report.get("resolved_policy") or {}

     for guard_key in ("spectral", "rmt"):
-        block =
+        block = report.get(guard_key) or {}
         if not isinstance(block, dict):
             continue
         evaluated = bool(block.get("evaluated", True))
@@ -353,14 +351,14 @@ def _validate_measurement_contracts(
         mc_hash = _measurement_contract_digest(mc)
         expected_hash = block.get("measurement_contract_hash")
         if not isinstance(mc, dict) or not mc:
-            errors.append(f"
+            errors.append(f"report missing {guard_key}.measurement_contract.")
         elif isinstance(expected_hash, str) and expected_hash:
             if mc_hash and mc_hash != expected_hash:
                 errors.append(
                     f"{guard_key}.measurement_contract_hash mismatch: expected={expected_hash}, computed={mc_hash}."
                 )
         else:
-            errors.append(f"
+            errors.append(f"report missing {guard_key}.measurement_contract_hash.")

         rp_guard = (
             resolved_policy.get(guard_key)
@@ -373,7 +371,7 @@ def _validate_measurement_contracts(
         rp_hash = _measurement_contract_digest(rp_mc)
         if not isinstance(rp_mc, dict) or not rp_mc:
             errors.append(
-                f"
+                f"report missing resolved_policy.{guard_key}.measurement_contract."
             )
         elif mc_hash and rp_hash and mc_hash != rp_hash:
             errors.append(
@@ -391,10 +389,10 @@ def _validate_measurement_contracts(
     return errors


-def _apply_profile_lints(
-    """Apply model-profile specific lint rules embedded in the
+def _apply_profile_lints(report: dict[str, Any]) -> list[str]:
+    """Apply model-profile specific lint rules embedded in the report."""
     errors: list[str] = []
-    meta =
+    meta = report.get("meta", {})
     profile = meta.get("model_profile") if isinstance(meta, dict) else None
     if not isinstance(profile, dict):
         return errors
@@ -410,7 +408,7 @@ def _apply_profile_lints(certificate: dict[str, Any]) -> list[str]:
         path = lint.get("path")
         expected = lint.get("value")
         message = lint.get("message") or "Model profile lint failed."
-        actual = _resolve_path(
+        actual = _resolve_path(report, path) if isinstance(path, str) else None

         if lint_type == "equals":
             if actual != expected:
@@ -447,21 +445,21 @@ def _apply_profile_lints(certificate: dict[str, Any]) -> list[str]:
     return errors


-def
+def _validate_evaluation_report_payload(
     path: Path, *, profile: str | None = None
 ) -> list[str]:
-    """Run all verification checks for a single
+    """Run all verification checks for a single evaluation report."""
     errors: list[str] = []
-
+    report = _load_evaluation_report(path)

     # Always surface schema validation failures for this payload
-    if not
-        errors.append("
+    if not validate_report(report):
+        errors.append("report schema validation failed.")
         return errors

-    errors.extend(_validate_primary_metric(
-    errors.extend(_validate_pairing(
-    errors.extend(_validate_counts(
+    errors.extend(_validate_primary_metric(report))
+    errors.extend(_validate_pairing(report))
+    errors.extend(_validate_counts(report))
     try:
         prof = (
             (profile or "").strip().lower()
@@ -473,22 +471,22 @@ def _validate_certificate_payload(
     # Drift band is a CI/Release enforcement check; dev profile should not
     # fail verification due to preview→final drift.
     if prof in {"ci", "release"}:
-        errors.extend(_validate_drift_band(
-    errors.extend(_apply_profile_lints(
-    errors.extend(_validate_tokenizer_hash(
+        errors.extend(_validate_drift_band(report))
+    errors.extend(_apply_profile_lints(report))
+    errors.extend(_validate_tokenizer_hash(report))
     if prof in {"ci", "release"}:
-        errors.extend(_validate_measurement_contracts(
+        errors.extend(_validate_measurement_contracts(report, profile=prof))

     # strict/fast assurance mode checks were removed; verification gates rely on
     # structural schema + guard metric contracts instead.

     # Release-only enforcement: guard overhead must be measured or explicitly skipped.
     if prof == "release":
-        go =
+        go = report.get("guard_overhead")
         if not isinstance(go, dict) or not go:
             errors.append(
                 "Release verification requires guard_overhead (missing). "
-                "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during
+                "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during evaluation."
             )
         else:
             skipped = bool(go.get("skipped", False)) or (
@@ -499,7 +497,7 @@ def _validate_certificate_payload(
             if evaluated is not True:
                 errors.append(
                     "Release verification requires evaluated guard_overhead (not evaluated). "
-                    "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during
+                    "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during evaluation."
                 )
             ratio = go.get("overhead_ratio")
             if ratio is None:
@@ -511,14 +509,14 @@ def _validate_certificate_payload(
     return errors


-def _warn_adapter_family_mismatch(cert_path: Path,
+def _warn_adapter_family_mismatch(cert_path: Path, report: dict[str, Any]) -> None:
     """Emit a soft warning if adapter families differ between baseline and edited.

     This is a non-fatal hint to catch inadvertent cross-family comparisons.
-    Tries to load the baseline report referenced in the
+    Tries to load the baseline report referenced in the report provenance.
     """
     try:
-        plugins =
+        plugins = report.get("plugins") or {}
         adapter_meta = plugins.get("adapter") if isinstance(plugins, dict) else None
         edited_family = None
         edited_lib = None
@@ -531,8 +529,8 @@ def _warn_adapter_family_mismatch(cert_path: Path, certificate: dict[str, Any])
         edited_ver = prov.get("version") or None

         baseline_prov = (
-
-            if isinstance(
+            report.get("provenance")
+            if isinstance(report.get("provenance"), dict)
             else {}
         )
         baseline_report_path = None
@@ -582,7 +580,7 @@ def _warn_adapter_family_mismatch(cert_path: Path, certificate: dict[str, Any])
             f"[yellow] • edited : family={edited_family}, backend={edited_backend} {edited_version}[/yellow]"
         )
         console.print(
-            "[yellow] Ensure this cross-family comparison is intentional (Compare &
+            "[yellow] Ensure this cross-family comparison is intentional (Compare & Evaluate flows should normally match families).[/yellow]"
         )
     except Exception:
         # Non-fatal and best-effort; suppress errors
@@ -590,18 +588,18 @@ def _warn_adapter_family_mismatch(cert_path: Path, certificate: dict[str, Any])


 def verify_command(
-
+    reports: list[Path] = typer.Argument(
         ...,
         exists=True,
         dir_okay=False,
         readable=True,
         resolve_path=True,
-        help="One or more
+        help="One or more evaluation report JSON files to verify.",
     ),
     baseline: Path | None = typer.Option(
         None,
         "--baseline",
-        help="Optional baseline
+        help="Optional baseline evaluation report (or run report) JSON to enforce provider parity.",
     ),
     tolerance: float = typer.Option(
         1e-9,
@@ -620,9 +618,9 @@ def verify_command(
     ),
 ) -> None:
     """
-    Verify
+    Verify evaluation report integrity.

-    Ensures each
+    Ensures each evaluation report passes schema validation, ratio consistency checks,
     and strict pairing requirements (match=1.0, overlap=0.0).
     """

@@ -638,7 +636,7 @@ def verify_command(
     try:
         if baseline is not None:
             bdata = json.loads(baseline.read_text(encoding="utf-8"))
-            # Accept either
+            # Accept either an evaluation report or a run report (report.json); look under provenance when present.
             prov = bdata.get("provenance") if isinstance(bdata, dict) else None
             if isinstance(prov, dict):
                 pd = prov.get("provider_digest")
@@ -650,8 +648,8 @@

     malformed_any = False
     try:
-        for cert_path in
-            cert_obj =
+        for cert_path in reports:
+            cert_obj = _load_evaluation_report(cert_path)

             # Enforce provider digest presence in CI/Release profiles
             try:
@@ -679,24 +677,21 @@
                 )

             # Structural checks
-            errors =
+            errors = _validate_evaluation_report_payload(cert_path, profile=profile)
             # JSON path: emit a typed ValidationError for schema failures to include error.code
             if json_out and any(
                 "schema validation failed" in str(e).lower() for e in errors
             ):
                 raise _ValidationError(
                     code="E601",
-                    message="
+                    message="REPORT-SCHEMA-INVALID: schema validation failed",
                     details={"path": str(cert_path)},
                 )
             # Determine malformed vs policy-fail for this cert
             is_malformed = any(
                 ("schema validation failed" in e.lower())
                 or ("missing primary_metric.ratio_vs_baseline" in e)
-                or (
-                    "Certificate is missing a finite primary_metric.ratio_vs_baseline"
-                    in e
-                )
+                or ("report is missing a finite primary_metric.ratio_vs_baseline" in e)
                 for e in errors
             )
             malformed_any = malformed_any or is_malformed
@@ -813,7 +808,7 @@
                 )
                 raise _MetricsError(
                     code="E602",
-                    message="RECOMPUTE-MISMATCH:
+                    message="RECOMPUTE-MISMATCH: report values disagree with recomputation",
                     details={"example": str(first)},
                 )
@@ -835,11 +830,11 @@
     if not overall_ok:
         code = 2 if malformed_any else 1
         if json_out:
-            # Build per-
+            # Build per-report results payload
            results: list[dict[str, Any]] = []
-            for cert_path in
+            for cert_path in reports:
                 try:
-                    cert_obj =
+                    cert_obj = _load_evaluation_report(cert_path)
                 except Exception:
                     cert_obj = {}
                 pm = (
@@ -980,7 +975,7 @@
                     "ok": False,
                     "reason": "malformed" if malformed_any else "policy_fail",
                 },
-                "
+                "evaluation_report": {"count": len(reports)},
                 "results": results,
                 "resolution": {"exit_code": code},
             }
@@ -989,11 +984,11 @@

     # Success emission
     if json_out:
-        # Build per-
+        # Build per-report success results payload
         results: list[dict[str, Any]] = []
-        for cert_path in
+        for cert_path in reports:
             try:
-                cert_obj =
+                cert_obj = _load_evaluation_report(cert_path)
             except Exception:
                 cert_obj = {}
             pm = (
@@ -1122,7 +1117,7 @@
         payload = {
             "format_version": FORMAT_VERIFY,
             "summary": {"ok": True, "reason": "ok"},
-            "
+            "evaluation_report": {"count": len(reports)},
             "results": results,
             "resolution": {"exit_code": 0},
         }
@@ -1130,7 +1125,7 @@
     else:
         # Human-friendly success line
         try:
-            last =
+            last = _load_evaluation_report(reports[-1]) if reports else {}
             pm = last.get("primary_metric", {}) if isinstance(last, dict) else {}
             kind = str(pm.get("kind") or "").strip()
             ppl = last.get("ppl", {}) if isinstance(last, dict) else {}
@@ -1181,7 +1176,7 @@
             "summary": {"ok": False, "reason": reason},
             "results": [
                 {
-                    "id": str(
+                    "id": str(reports[0]) if reports else "",
                     "schema_version": "v1",
                     "kind": "",
                     "ok": False,
@@ -1213,7 +1208,7 @@
             "summary": {"ok": False, "reason": reason},
             "results": [
                 {
-                    "id": str(
+                    "id": str(reports[0]) if reports else "",
                     "schema_version": "v1",
                     "kind": "",
                     "ok": False,
invarlock/cli/determinism.py
CHANGED
@@ -5,7 +5,7 @@ Centralizes:
 - Thread caps (OMP/MKL/etc + torch threads)
 - TF32 policy
 - torch deterministic algorithms
-- A structured "determinism level" for
+- A structured "determinism level" for evaluation report provenance
 """

 from __future__ import annotations
invarlock/cli/provenance.py
CHANGED
@@ -2,7 +2,7 @@

 Provides a tiny, versioned schema describing the adapter family and the
 underlying library versions. This does not perform any edits; it only reads
-environment and import metadata to annotate
+environment and import metadata to annotate evaluation artifacts.
 """

 from __future__ import annotations
@@ -46,12 +46,12 @@ def extract_adapter_provenance(adapter_name: str) -> AdapterProvenance:
         msg = (
             None
             if supported
-            else f"Use Compare &
+            else f"Use Compare & Evaluate (BYOE); {library} version unsupported (tested: {tested})"
         )
     except Exception:  # Package not installed or version unknown
         ver = None
         supported = False
-        msg = f"{library} not available; prefer Compare &
+        msg = f"{library} not available; prefer Compare & Evaluate (BYOE) or install extras."

     return AdapterProvenance(
         family=family,
invarlock/core/bootstrap.py
CHANGED
@@ -6,7 +6,7 @@ Numerically stable bootstrap helpers for evaluation metrics.

 This module provides bias-corrected and accelerated (BCa) confidence
 intervals tailored for paired log-loss statistics used by the runner
-and evaluation
+and evaluation reports.
 """

 from __future__ import annotations
invarlock/core/retry.py
CHANGED
@@ -2,11 +2,11 @@
 InvarLock Retry Controller
 =====================

-Manages retry logic for automated
+Manages retry logic for automated evaluation workflows with:
 - Attempt budgets (max 3 attempts default)
 - Time budgets (optional timeout)
 - Parameter adjustment strategies per edit type
--
+- Gate-driven retry decisions
 """

 from __future__ import annotations
@@ -19,7 +19,7 @@ __all__ = ["RetryController", "adjust_edit_params"]

 class RetryController:
     """
-    Controls retry logic for
+    Controls retry logic for evaluation-report-driven automation.

     Features:
     - Attempt budget enforcement (default 3 max)
@@ -45,18 +45,18 @@ class RetryController:
         self.start_time = time.time()
         self.attempt_history: list[dict[str, Any]] = []

-    def should_retry(self,
+    def should_retry(self, report_passed: bool) -> bool:
         """
         Determine if retry should be attempted.

         Args:
-
+            report_passed: Whether evaluation report gates passed

         Returns:
             True if retry should be attempted, False otherwise
         """
-        # If
-        if
+        # If report passed, no retry needed
+        if report_passed:
             return False

         # Check attempt budget (attempt count equals history length)
@@ -81,21 +81,21 @@ class RetryController:
     def record_attempt(
         self,
         attempt_num: int,
-
+        report_result: dict[str, Any],
         edit_params: dict[str, Any],
     ) -> None:
         """Record details of an attempt for tracking."""
-
+        report_result = report_result or {}
         edit_params = edit_params or {}

         self.attempt_history.append(
             {
                 "attempt": attempt_num,
                 "timestamp": time.time(),
-                "
+                "report_passed": report_result.get("passed", False),
                 "edit_params": edit_params.copy(),
-                "failures":
-                "validation":
+                "failures": report_result.get("failures", []),
+                "validation": report_result.get("validation", {}),
             }
         )

@@ -114,7 +114,7 @@ def adjust_edit_params(
     edit_name: str,
     edit_params: dict[str, Any],
     attempt: int,
-
+    report_result: dict[str, Any] | None = None,
 ) -> dict[str, Any]:
     """
     Adjust edit parameters for retry attempt based on edit type and failure mode.
@@ -126,7 +126,7 @@ def adjust_edit_params(
         edit_name: Name of the edit operation
         edit_params: Current edit parameters
         attempt: Attempt number (1-indexed)
-
+        report_result: Optional evaluation report result for failure analysis

     Returns:
         Adjusted parameters for next attempt