invarlock 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. invarlock/__init__.py +4 -4
  2. invarlock/adapters/__init__.py +10 -14
  3. invarlock/adapters/auto.py +37 -50
  4. invarlock/adapters/capabilities.py +2 -2
  5. invarlock/adapters/hf_causal.py +418 -0
  6. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  7. invarlock/adapters/hf_loading.py +7 -7
  8. invarlock/adapters/hf_mixin.py +53 -9
  9. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  10. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  11. invarlock/assurance/__init__.py +15 -23
  12. invarlock/cli/adapter_auto.py +32 -26
  13. invarlock/cli/app.py +128 -27
  14. invarlock/cli/commands/__init__.py +2 -2
  15. invarlock/cli/commands/calibrate.py +48 -4
  16. invarlock/cli/commands/doctor.py +8 -10
  17. invarlock/cli/commands/evaluate.py +986 -0
  18. invarlock/cli/commands/explain_gates.py +25 -17
  19. invarlock/cli/commands/export_html.py +11 -9
  20. invarlock/cli/commands/plugins.py +13 -9
  21. invarlock/cli/commands/report.py +326 -92
  22. invarlock/cli/commands/run.py +1160 -228
  23. invarlock/cli/commands/verify.py +157 -97
  24. invarlock/cli/config.py +1 -1
  25. invarlock/cli/determinism.py +1 -1
  26. invarlock/cli/doctor_helpers.py +4 -5
  27. invarlock/cli/output.py +193 -0
  28. invarlock/cli/provenance.py +4 -4
  29. invarlock/core/bootstrap.py +1 -1
  30. invarlock/core/registry.py +9 -11
  31. invarlock/core/retry.py +14 -14
  32. invarlock/core/runner.py +112 -26
  33. invarlock/edits/noop.py +2 -2
  34. invarlock/edits/quant_rtn.py +67 -39
  35. invarlock/eval/__init__.py +1 -1
  36. invarlock/eval/bench.py +14 -10
  37. invarlock/eval/data.py +68 -23
  38. invarlock/eval/metrics.py +59 -1
  39. invarlock/eval/primary_metric.py +1 -1
  40. invarlock/eval/tasks/__init__.py +12 -0
  41. invarlock/eval/tasks/classification.py +48 -0
  42. invarlock/eval/tasks/qa.py +36 -0
  43. invarlock/eval/tasks/text_generation.py +102 -0
  44. invarlock/guards/invariants.py +19 -10
  45. invarlock/guards/rmt.py +2 -2
  46. invarlock/guards/spectral.py +1 -1
  47. invarlock/guards/variance.py +2 -2
  48. invarlock/model_profile.py +64 -62
  49. invarlock/observability/health.py +6 -6
  50. invarlock/observability/metrics.py +108 -0
  51. invarlock/plugins/hf_bnb_adapter.py +32 -21
  52. invarlock/reporting/__init__.py +18 -4
  53. invarlock/reporting/guards_analysis.py +154 -4
  54. invarlock/reporting/html.py +61 -11
  55. invarlock/reporting/normalizer.py +9 -2
  56. invarlock/reporting/policy_utils.py +1 -1
  57. invarlock/reporting/primary_metric_utils.py +11 -11
  58. invarlock/reporting/render.py +876 -510
  59. invarlock/reporting/report.py +72 -30
  60. invarlock/reporting/{certificate.py → report_builder.py} +252 -99
  61. invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
  62. invarlock/reporting/report_types.py +6 -1
  63. invarlock/reporting/telemetry.py +86 -0
  64. invarlock-0.3.8.dist-info/METADATA +283 -0
  65. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/RECORD +69 -64
  66. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/WHEEL +1 -1
  67. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/entry_points.txt +5 -3
  68. invarlock/adapters/hf_gpt2.py +0 -404
  69. invarlock/adapters/hf_llama.py +0 -487
  70. invarlock/cli/commands/certify.py +0 -422
  71. invarlock-0.3.6.dist-info/METADATA +0 -588
  72. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/licenses/LICENSE +0 -0
  73. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/top_level.txt +0 -0
invarlock/cli/commands/verify.py CHANGED
@@ -2,7 +2,7 @@
  invarlock verify command
  ====================
 
- Validates generated safety certificates for internal consistency. The command
+ Validates generated evaluation reports for internal consistency. The command
  ensures schema compliance, checks that the primary metric ratio agrees with the
  baseline reference, and enforces paired-window guarantees (match=1.0,
  overlap=0.0).
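For orientation, a minimal evaluation-report payload that satisfies these checks could look like the sketch below. Field names are taken from the validators in this file where visible; the `kind` value and the exact key spellings inside `stats` are assumptions, and all values are hypothetical.

    # Hypothetical minimal payload mirroring the checks verify performs.
    report = {
        "primary_metric": {
            "kind": "ppl",                       # any kind starting with "ppl" triggers the ratio check
            "preview": 12.30,
            "final": 12.34,
            "ratio_vs_baseline": 12.34 / 12.00,  # must equal final / baseline final
        },
        "baseline_ref": {
            "primary_metric": {"final": 12.00},  # must be > 0.0
            "tokenizer_hash": "sha256:…",
        },
        "dataset": {
            "windows": {
                "preview": 32,
                "final": 128,
                "stats": {
                    "window_match_fraction": 1.0,    # paired runs require 1.0
                    "window_overlap_fraction": 0.0,  # and 0.0 overlap
                    "paired_windows": 128,           # must be > 0 (key spelling assumed)
                    "window_pairing_reason": None,   # must stay null for paired reports
                },
            },
        },
    }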
@@ -26,7 +26,7 @@ from invarlock.core.exceptions import (
  from invarlock.core.exceptions import (
  ValidationError as _ValidationError,
  )
- from invarlock.reporting.certificate import validate_certificate
+ from invarlock.reporting.report_builder import validate_report
 
  from .._json import emit as _emit_json
  from .._json import encode_error as _encode_error
@@ -52,26 +52,42 @@ def _coerce_int(value: Any) -> int | None:
  return out if out >= 0 else None
 
 
- def _load_certificate(path: Path) -> dict[str, Any]:
- """Load certificate JSON from disk."""
+ def _load_evaluation_report(path: Path) -> dict[str, Any]:
+ """Load an evaluation report JSON from disk."""
  with path.open("r", encoding="utf-8") as handle:
  return json.load(handle)
 
 
- def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
+ def _validate_primary_metric(report: dict[str, Any]) -> list[str]:
  """Validate primary metric ratio consistency with baseline reference."""
  errors: list[str] = []
- pm = certificate.get("primary_metric", {}) or {}
+ pm = report.get("primary_metric", {}) or {}
  if not isinstance(pm, dict) or not pm:
- errors.append("Certificate missing primary_metric block.")
+ errors.append("report missing primary_metric block.")
  return errors
 
+ def _is_finite_number(value: Any) -> bool:
+ return isinstance(value, (int, float)) and math.isfinite(float(value))
+
+ def _declares_invalid_primary_metric(metric: dict[str, Any]) -> bool:
+ if bool(metric.get("invalid")):
+ return True
+ reason = metric.get("degraded_reason")
+ if isinstance(reason, str):
+ r = reason.strip().lower()
+ return r.startswith("non_finite") or r in {
+ "primary_metric_invalid",
+ "evaluation_error",
+ }
+ return False
+
  kind = str(pm.get("kind", "")).lower()
  ratio_vs_baseline = pm.get("ratio_vs_baseline")
  final = pm.get("final")
+ pm_invalid = _declares_invalid_primary_metric(pm)
 
  if kind.startswith("ppl"):
- baseline_ref = certificate.get("baseline_ref", {}) or {}
+ baseline_ref = report.get("baseline_ref", {}) or {}
  baseline_pm = (
  baseline_ref.get("primary_metric")
  if isinstance(baseline_ref, dict)
@@ -82,18 +98,16 @@ def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
  bv = baseline_pm.get("final")
  if isinstance(bv, (int | float)):
  baseline_final = float(bv)
- if isinstance(final, int | float) and isinstance(baseline_final, int | float):
- if baseline_final <= 0.0:
+ if _is_finite_number(final) and _is_finite_number(baseline_final):
+ if float(baseline_final) <= 0.0:
  errors.append(
  f"Baseline final must be > 0.0 to compute ratio (found {baseline_final})."
  )
  else:
  expected_ratio = float(final) / float(baseline_final)
- if not isinstance(ratio_vs_baseline, int | float) or not math.isfinite(
- float(ratio_vs_baseline)
- ):
+ if not _is_finite_number(ratio_vs_baseline):
  errors.append(
- "Certificate is missing a finite primary_metric.ratio_vs_baseline value."
+ "report is missing a finite primary_metric.ratio_vs_baseline value."
  )
  elif not math.isclose(
  float(ratio_vs_baseline), expected_ratio, rel_tol=1e-6, abs_tol=1e-6
@@ -102,19 +116,30 @@ def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
  "Primary metric ratio mismatch: "
  f"recorded={float(ratio_vs_baseline):.12f}, expected={expected_ratio:.12f}"
  )
+ else:
+ # If the primary metric is non-finite, it must be explicitly marked invalid.
+ # This is expected for structural error-injection runs (NaN/Inf weights).
+ if (isinstance(final, (int | float)) and not _is_finite_number(final)) and (
+ not pm_invalid
+ ):
+ errors.append(
+ "Primary metric final is non-finite but primary_metric.invalid is not set."
+ )
  else:
+ if pm_invalid:
+ return errors
  if ratio_vs_baseline is None or not isinstance(ratio_vs_baseline, int | float):
  errors.append(
- "Certificate missing primary_metric.ratio_vs_baseline for non-ppl metric."
+ "report missing primary_metric.ratio_vs_baseline for non-ppl metric."
  )
 
  return errors
 
 
- def _validate_pairing(certificate: dict[str, Any]) -> list[str]:
+ def _validate_pairing(report: dict[str, Any]) -> list[str]:
  """Validate window pairing metrics (PM-only location)."""
  errors: list[str] = []
- stats = certificate.get("dataset", {}).get("windows", {}).get("stats", {})
+ stats = report.get("dataset", {}).get("windows", {}).get("stats", {})
 
  match_fraction = stats.get("window_match_fraction")
  overlap_fraction = stats.get("window_overlap_fraction")
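A minimal sketch of the two primary-metric paths handled above: finite values must reproduce the recorded ratio within a 1e-6 tolerance, and a non-finite final must be declared invalid (the `degraded_reason` value here is illustrative; the validator accepts any reason starting with "non_finite"):

    import math

    # Finite path: recorded ratio must match final / baseline within 1e-6.
    pm = {"kind": "ppl", "final": 12.34, "ratio_vs_baseline": 12.34 / 12.00}
    expected = pm["final"] / 12.00
    assert math.isclose(pm["ratio_vs_baseline"], expected, rel_tol=1e-6, abs_tol=1e-6)

    # Non-finite path (e.g. NaN/Inf-weight error-injection runs): must be flagged.
    pm_nan = {
        "kind": "ppl",
        "final": float("nan"),
        "invalid": True,  # or, equivalently, a degraded_reason such as:
        "degraded_reason": "non_finite_evaluation",
    }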
@@ -123,23 +148,23 @@ def _validate_pairing(certificate: dict[str, Any]) -> list[str]:
 
  if pairing_reason is not None:
  errors.append(
- "window_pairing_reason must be null/None for paired certificates "
+ "window_pairing_reason must be null/None for paired reports "
  f"(found {pairing_reason!r})."
  )
  if paired_windows is None:
- errors.append("Certificate missing paired_windows metric.")
+ errors.append("report missing paired_windows metric.")
  elif paired_windows == 0:
- errors.append("paired_windows must be > 0 for paired certificates (found 0).")
+ errors.append("paired_windows must be > 0 for paired reports (found 0).")
 
  if match_fraction is None:
- errors.append("Certificate missing window_match_fraction metric.")
+ errors.append("report missing window_match_fraction metric.")
  elif match_fraction < 0.999999:
  errors.append(
  f"window_match_fraction must be 1.0 for paired runs (found {match_fraction:.6f})."
  )
 
  if overlap_fraction is None:
- errors.append("Certificate missing window_overlap_fraction metric.")
+ errors.append("report missing window_overlap_fraction metric.")
  elif overlap_fraction > 1e-9:
  errors.append(
  f"window_overlap_fraction must be 0.0 (found {overlap_fraction:.6f})."
@@ -148,10 +173,10 @@ def _validate_pairing(certificate: dict[str, Any]) -> list[str]:
  return errors
 
 
- def _validate_counts(certificate: dict[str, Any]) -> list[str]:
+ def _validate_counts(report: dict[str, Any]) -> list[str]:
  """Validate preview/final window counts align with dataset configuration."""
  errors: list[str] = []
- dataset = certificate.get("dataset", {})
+ dataset = report.get("dataset", {})
  dataset_windows = dataset.get("windows", {})
  expected_preview = dataset_windows.get("preview")
  expected_final = dataset_windows.get("final")
@@ -165,9 +190,7 @@ def _validate_counts(certificate: dict[str, Any]) -> list[str]:
 
  if expected_preview is not None:
  if preview_used is None:
- errors.append(
- "Certificate missing coverage.preview.used for preview windows."
- )
+ errors.append("report missing coverage.preview.used for preview windows.")
  elif int(preview_used) != int(expected_preview):
  errors.append(
  f"Preview window count mismatch: expected {expected_preview}, observed {preview_used}."
@@ -175,7 +198,7 @@ def _validate_counts(certificate: dict[str, Any]) -> list[str]:
 
  if expected_final is not None:
  if final_used is None:
- errors.append("Certificate missing coverage.final.used for final windows.")
+ errors.append("report missing coverage.final.used for final windows.")
  elif int(final_used) != int(expected_final):
  errors.append(
  f"Final window count mismatch: expected {expected_final}, observed {final_used}."
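A sketch of the count alignment `_validate_counts` enforces: the declared window counts under `dataset.windows` must match the observed `coverage.*.used` values (the location of the `coverage` block is inferred from the error messages above):

    report = {
        "dataset": {"windows": {"preview": 32, "final": 128}},
        "coverage": {
            "preview": {"used": 32},   # must equal dataset.windows.preview
            "final": {"used": 128},    # must equal dataset.windows.final
        },
    }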
@@ -193,40 +216,79 @@ def _validate_counts(certificate: dict[str, Any]) -> list[str]:
  return errors
 
 
- def _validate_drift_band(certificate: dict[str, Any]) -> list[str]:
- """Validate preview→final drift stays within the configured band (0.95–1.05)."""
+ def _validate_drift_band(report: dict[str, Any]) -> list[str]:
+ """Validate preview→final drift stays within the configured band.
+
+ Defaults to 0.95–1.05 unless the report provides `primary_metric.drift_band`.
+ """
  errors: list[str] = []
- pm = certificate.get("primary_metric", {}) or {}
+ pm = report.get("primary_metric", {}) or {}
+ if not isinstance(pm, dict) or not pm:
+ errors.append("report missing primary_metric block.")
+ return errors
+ if bool(pm.get("invalid")):
+ # Drift is undefined when the primary metric is invalid (e.g., NaN/Inf weights).
+ return errors
  drift_ratio = None
  try:
  prev = pm.get("preview")
  fin = pm.get("final")
- if isinstance(prev, int | float) and isinstance(fin, int | float) and prev > 0:
+ if (
+ isinstance(prev, int | float)
+ and isinstance(fin, int | float)
+ and math.isfinite(float(prev))
+ and math.isfinite(float(fin))
+ and prev > 0
+ ):
  drift_ratio = float(fin) / float(prev)
  except Exception:
  drift_ratio = None
 
  if not isinstance(drift_ratio, int | float):
- errors.append("Certificate missing preview/final to compute drift ratio.")
+ errors.append("report missing preview/final to compute drift ratio.")
  return errors
 
- if not 0.95 <= float(drift_ratio) <= 1.05:
+ drift_min = 0.95
+ drift_max = 1.05
+ band = pm.get("drift_band")
+ try:
+ if isinstance(band, dict):
+ lo = band.get("min")
+ hi = band.get("max")
+ if isinstance(lo, int | float) and isinstance(hi, int | float):
+ lo_f = float(lo)
+ hi_f = float(hi)
+ if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
+ drift_min = lo_f
+ drift_max = hi_f
+ elif isinstance(band, list | tuple) and len(band) == 2:
+ lo_raw, hi_raw = band[0], band[1]
+ if isinstance(lo_raw, int | float) and isinstance(hi_raw, int | float):
+ lo_f = float(lo_raw)
+ hi_f = float(hi_raw)
+ if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
+ drift_min = lo_f
+ drift_max = hi_f
+ except Exception:
+ pass
+
+ if not drift_min <= float(drift_ratio) <= drift_max:
  errors.append(
- f"Preview→final drift ratio out of band (0.95–1.05): observed {drift_ratio:.6f}."
+ f"Preview→final drift ratio out of band ({drift_min:.2f}–{drift_max:.2f}): observed {drift_ratio:.6f}."
  )
 
  return errors
 
 
- def _validate_tokenizer_hash(certificate: dict[str, Any]) -> list[str]:
+ def _validate_tokenizer_hash(report: dict[str, Any]) -> list[str]:
  """Validate tokenizer hash consistency between baseline and edited runs.
 
  The check is enforced only when both hashes are present. When present and
  different, the verification fails.
  """
  errors: list[str] = []
- meta = certificate.get("meta", {}) or {}
- dataset = certificate.get("dataset", {}) or {}
+ meta = report.get("meta", {}) or {}
+ dataset = report.get("dataset", {}) or {}
  edited_hash = None
  try:
  # Prefer meta.tokenizer_hash; fall back to dataset.tokenizer.hash
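The rewritten `_validate_drift_band` accepts a per-report override of the default 0.95–1.05 band in either of two shapes; a sketch with hypothetical values:

    # Mapping form: both bounds must be finite with 0 < min < max,
    # otherwise the default 0.95–1.05 band is silently kept.
    pm = {"preview": 12.0, "final": 12.3, "drift_band": {"min": 0.90, "max": 1.10}}

    # Sequence form: a two-element [lo, hi] is accepted under the same rules.
    pm = {"preview": 12.0, "final": 12.3, "drift_band": [0.90, 1.10]}

    drift_ratio = pm["final"] / pm["preview"]  # 1.025 → inside either band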
@@ -238,7 +300,7 @@ def _validate_tokenizer_hash(certificate: dict[str, Any]) -> list[str]:
  except Exception:
  edited_hash = None
 
- baseline_ref = certificate.get("baseline_ref", {}) or {}
+ baseline_ref = report.get("baseline_ref", {}) or {}
  baseline_hash = baseline_ref.get("tokenizer_hash")
 
  if isinstance(edited_hash, str) and isinstance(baseline_hash, str):
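The tokenizer-hash check is deliberately lenient: it only fails when both hashes are present and differ. A sketch of the comparison (hash sources as named in the code above; the failure branch is paraphrased):

    edited_hash = report.get("meta", {}).get("tokenizer_hash")  # preferred source
    baseline_hash = report.get("baseline_ref", {}).get("tokenizer_hash")

    if isinstance(edited_hash, str) and isinstance(baseline_hash, str):
        if edited_hash != baseline_hash:
            pass  # verification error: baseline and edited runs used different tokenizers
    # If either hash is absent, the check is skipped rather than failed.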
@@ -270,15 +332,15 @@ def _measurement_contract_digest(contract: Any) -> str | None:
 
 
  def _validate_measurement_contracts(
- certificate: dict[str, Any], *, profile: str
+ report: dict[str, Any], *, profile: str
  ) -> list[str]:
  """Enforce measurement-contract presence and baseline pairing for guards."""
  errors: list[str] = []
  prof = (profile or "").strip().lower()
- resolved_policy = certificate.get("resolved_policy") or {}
+ resolved_policy = report.get("resolved_policy") or {}
 
  for guard_key in ("spectral", "rmt"):
- block = certificate.get(guard_key) or {}
+ block = report.get(guard_key) or {}
  if not isinstance(block, dict):
  continue
  evaluated = bool(block.get("evaluated", True))
@@ -289,14 +351,14 @@ def _validate_measurement_contracts(
  mc_hash = _measurement_contract_digest(mc)
  expected_hash = block.get("measurement_contract_hash")
  if not isinstance(mc, dict) or not mc:
- errors.append(f"Certificate missing {guard_key}.measurement_contract.")
+ errors.append(f"report missing {guard_key}.measurement_contract.")
  elif isinstance(expected_hash, str) and expected_hash:
  if mc_hash and mc_hash != expected_hash:
  errors.append(
  f"{guard_key}.measurement_contract_hash mismatch: expected={expected_hash}, computed={mc_hash}."
  )
  else:
- errors.append(f"Certificate missing {guard_key}.measurement_contract_hash.")
+ errors.append(f"report missing {guard_key}.measurement_contract_hash.")
 
  rp_guard = (
  resolved_policy.get(guard_key)
@@ -309,7 +371,7 @@ def _validate_measurement_contracts(
  rp_hash = _measurement_contract_digest(rp_mc)
  if not isinstance(rp_mc, dict) or not rp_mc:
  errors.append(
- f"Certificate missing resolved_policy.{guard_key}.measurement_contract."
+ f"report missing resolved_policy.{guard_key}.measurement_contract."
  )
  elif mc_hash and rp_hash and mc_hash != rp_hash:
  errors.append(
@@ -327,10 +389,10 @@ def _validate_measurement_contracts(
  return errors
 
 
- def _apply_profile_lints(certificate: dict[str, Any]) -> list[str]:
- """Apply model-profile specific lint rules embedded in the certificate."""
+ def _apply_profile_lints(report: dict[str, Any]) -> list[str]:
+ """Apply model-profile specific lint rules embedded in the report."""
  errors: list[str] = []
- meta = certificate.get("meta", {})
+ meta = report.get("meta", {})
  profile = meta.get("model_profile") if isinstance(meta, dict) else None
  if not isinstance(profile, dict):
  return errors
@@ -346,7 +408,7 @@ def _apply_profile_lints(certificate: dict[str, Any]) -> list[str]:
  path = lint.get("path")
  expected = lint.get("value")
  message = lint.get("message") or "Model profile lint failed."
- actual = _resolve_path(certificate, path) if isinstance(path, str) else None
+ actual = _resolve_path(report, path) if isinstance(path, str) else None
 
  if lint_type == "equals":
  if actual != expected:
@@ -383,21 +445,21 @@ def _apply_profile_lints(certificate: dict[str, Any]) -> list[str]:
  return errors
 
 
- def _validate_certificate_payload(
+ def _validate_evaluation_report_payload(
  path: Path, *, profile: str | None = None
  ) -> list[str]:
- """Run all verification checks for a single certificate."""
+ """Run all verification checks for a single evaluation report."""
  errors: list[str] = []
- certificate = _load_certificate(path)
+ report = _load_evaluation_report(path)
 
  # Always surface schema validation failures for this payload
- if not validate_certificate(certificate):
- errors.append("Certificate schema validation failed.")
+ if not validate_report(report):
+ errors.append("report schema validation failed.")
  return errors
 
- errors.extend(_validate_primary_metric(certificate))
- errors.extend(_validate_pairing(certificate))
- errors.extend(_validate_counts(certificate))
+ errors.extend(_validate_primary_metric(report))
+ errors.extend(_validate_pairing(report))
+ errors.extend(_validate_counts(report))
  try:
  prof = (
  (profile or "").strip().lower()
@@ -406,24 +468,25 @@ def _validate_certificate_payload(
  )
  except Exception:
  prof = "dev"
- # Enforce drift band only for CI/Release; skip in dev profile
+ # Drift band is a CI/Release enforcement check; dev profile should not
+ # fail verification due to preview→final drift.
  if prof in {"ci", "release"}:
- errors.extend(_validate_drift_band(certificate))
- errors.extend(_apply_profile_lints(certificate))
- errors.extend(_validate_tokenizer_hash(certificate))
+ errors.extend(_validate_drift_band(report))
+ errors.extend(_apply_profile_lints(report))
+ errors.extend(_validate_tokenizer_hash(report))
  if prof in {"ci", "release"}:
- errors.extend(_validate_measurement_contracts(certificate, profile=prof))
+ errors.extend(_validate_measurement_contracts(report, profile=prof))
 
  # strict/fast assurance mode checks were removed; verification gates rely on
  # structural schema + guard metric contracts instead.
 
  # Release-only enforcement: guard overhead must be measured or explicitly skipped.
  if prof == "release":
- go = certificate.get("guard_overhead")
+ go = report.get("guard_overhead")
  if not isinstance(go, dict) or not go:
  errors.append(
  "Release verification requires guard_overhead (missing). "
- "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during certification."
+ "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during evaluation."
  )
  else:
  skipped = bool(go.get("skipped", False)) or (
@@ -434,7 +497,7 @@ def _validate_certificate_payload(
  if evaluated is not True:
  errors.append(
  "Release verification requires evaluated guard_overhead (not evaluated). "
- "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during certification."
+ "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during evaluation."
  )
  ratio = go.get("overhead_ratio")
  if ratio is None:
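A `guard_overhead` block that satisfies the release-profile checks above might look like this (keys as referenced in these hunks; values hypothetical):

    guard_overhead = {
        "evaluated": True,       # must be True unless the check is explicitly skipped
        "skipped": False,        # skipping requires INVARLOCK_SKIP_OVERHEAD_CHECK=1
        "overhead_ratio": 1.03,  # must be present when the overhead was measured
    }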
@@ -446,14 +509,14 @@ def _validate_certificate_payload(
  return errors
 
 
- def _warn_adapter_family_mismatch(cert_path: Path, certificate: dict[str, Any]) -> None:
+ def _warn_adapter_family_mismatch(cert_path: Path, report: dict[str, Any]) -> None:
  """Emit a soft warning if adapter families differ between baseline and edited.
 
  This is a non-fatal hint to catch inadvertent cross-family comparisons.
- Tries to load the baseline report referenced in the certificate provenance.
+ Tries to load the baseline report referenced in the report provenance.
  """
  try:
- plugins = certificate.get("plugins") or {}
+ plugins = report.get("plugins") or {}
  adapter_meta = plugins.get("adapter") if isinstance(plugins, dict) else None
  edited_family = None
  edited_lib = None
@@ -466,8 +529,8 @@ def _warn_adapter_family_mismatch(cert_path: Path, certificate: dict[str, Any])
  edited_ver = prov.get("version") or None
 
  baseline_prov = (
- certificate.get("provenance")
- if isinstance(certificate.get("provenance"), dict)
+ report.get("provenance")
+ if isinstance(report.get("provenance"), dict)
  else {}
  )
  baseline_report_path = None
@@ -517,7 +580,7 @@ def _warn_adapter_family_mismatch(cert_path: Path, certificate: dict[str, Any])
  f"[yellow] • edited : family={edited_family}, backend={edited_backend} {edited_version}[/yellow]"
  )
  console.print(
- "[yellow] Ensure this cross-family comparison is intentional (Compare & Certify flows should normally match families).[/yellow]"
+ "[yellow] Ensure this cross-family comparison is intentional (Compare & Evaluate flows should normally match families).[/yellow]"
  )
  except Exception:
  # Non-fatal and best-effort; suppress errors
@@ -525,18 +588,18 @@ def _warn_adapter_family_mismatch(cert_path: Path, certificate: dict[str, Any])
 
 
  def verify_command(
- certificates: list[Path] = typer.Argument(
+ reports: list[Path] = typer.Argument(
  ...,
  exists=True,
  dir_okay=False,
  readable=True,
  resolve_path=True,
- help="One or more certificate JSON files to verify.",
+ help="One or more evaluation report JSON files to verify.",
  ),
  baseline: Path | None = typer.Option(
  None,
  "--baseline",
- help="Optional baseline certificate/report JSON to enforce provider parity.",
+ help="Optional baseline evaluation report (or run report) JSON to enforce provider parity.",
  ),
  tolerance: float = typer.Option(
  1e-9,
@@ -555,9 +618,9 @@ def verify_command(
  ),
  ) -> None:
  """
- Verify certificate integrity.
+ Verify evaluation report integrity.
 
- Ensures each certificate passes schema validation, ratio consistency checks,
+ Ensures each evaluation report passes schema validation, ratio consistency checks,
  and strict pairing requirements (match=1.0, overlap=0.0).
  """
 
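Assuming the `invarlock` console entry point is installed, the command can also be driven programmatically; a minimal sketch using only options visible in this diff (exit codes per this file: 0 = ok, 1 = policy failure, 2 = malformed report):

    import subprocess

    result = subprocess.run(
        ["invarlock", "verify", "runs/report.json",
         "--baseline", "runs/baseline.json",
         "--tolerance", "1e-9"],
        capture_output=True,
        text=True,
    )
    print(result.returncode)  # 0, 1, or 2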
@@ -573,7 +636,7 @@ def verify_command(
  try:
  if baseline is not None:
  bdata = json.loads(baseline.read_text(encoding="utf-8"))
- # Accept either a certificate or a raw report; look under provenance when present
+ # Accept either an evaluation report or a run report (report.json); look under provenance when present.
  prov = bdata.get("provenance") if isinstance(bdata, dict) else None
  if isinstance(prov, dict):
  pd = prov.get("provider_digest")
@@ -585,8 +648,8 @@ def verify_command(
 
  malformed_any = False
  try:
- for cert_path in certificates:
- cert_obj = _load_certificate(cert_path)
+ for cert_path in reports:
+ cert_obj = _load_evaluation_report(cert_path)
 
  # Enforce provider digest presence in CI/Release profiles
  try:
@@ -614,24 +677,21 @@ def verify_command(
  )
 
  # Structural checks
- errors = _validate_certificate_payload(cert_path, profile=profile)
+ errors = _validate_evaluation_report_payload(cert_path, profile=profile)
  # JSON path: emit a typed ValidationError for schema failures to include error.code
  if json_out and any(
  "schema validation failed" in str(e).lower() for e in errors
  ):
  raise _ValidationError(
  code="E601",
- message="CERTIFICATE-SCHEMA-INVALID: schema validation failed",
+ message="REPORT-SCHEMA-INVALID: schema validation failed",
  details={"path": str(cert_path)},
  )
  # Determine malformed vs policy-fail for this cert
  is_malformed = any(
  ("schema validation failed" in e.lower())
  or ("missing primary_metric.ratio_vs_baseline" in e)
- or (
- "Certificate is missing a finite primary_metric.ratio_vs_baseline"
- in e
- )
+ or ("report is missing a finite primary_metric.ratio_vs_baseline" in e)
  for e in errors
  )
  malformed_any = malformed_any or is_malformed
@@ -748,7 +808,7 @@ def verify_command(
  )
  raise _MetricsError(
  code="E602",
- message="RECOMPUTE-MISMATCH: certificate values disagree with recomputation",
+ message="RECOMPUTE-MISMATCH: report values disagree with recomputation",
  details={"example": str(first)},
  )
 
@@ -770,11 +830,11 @@ def verify_command(
  if not overall_ok:
  code = 2 if malformed_any else 1
  if json_out:
- # Build per-certificate results payload
+ # Build per-report results payload
  results: list[dict[str, Any]] = []
- for cert_path in certificates:
+ for cert_path in reports:
  try:
- cert_obj = _load_certificate(cert_path)
+ cert_obj = _load_evaluation_report(cert_path)
  except Exception:
  cert_obj = {}
  pm = (
@@ -915,7 +975,7 @@ def verify_command(
  "ok": False,
  "reason": "malformed" if malformed_any else "policy_fail",
  },
- "certificate": {"count": len(certificates)},
+ "evaluation_report": {"count": len(reports)},
  "results": results,
  "resolution": {"exit_code": code},
  }
@@ -924,11 +984,11 @@ def verify_command(
 
  # Success emission
  if json_out:
- # Build per-certificate success results payload
+ # Build per-report success results payload
  results: list[dict[str, Any]] = []
- for cert_path in certificates:
+ for cert_path in reports:
  try:
- cert_obj = _load_certificate(cert_path)
+ cert_obj = _load_evaluation_report(cert_path)
  except Exception:
  cert_obj = {}
  pm = (
@@ -1057,7 +1117,7 @@ def verify_command(
  payload = {
  "format_version": FORMAT_VERIFY,
  "summary": {"ok": True, "reason": "ok"},
- "certificate": {"count": len(certificates)},
+ "evaluation_report": {"count": len(reports)},
  "results": results,
  "resolution": {"exit_code": 0},
  }
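On success with JSON output enabled, the payload assembled above takes this shape (a sketch; `FORMAT_VERIFY` is the format-version constant this module uses, and the per-report `results` entries are elided):

    payload = {
        "format_version": FORMAT_VERIFY,               # version constant from the CLI
        "summary": {"ok": True, "reason": "ok"},       # failures: ok=False, "malformed"/"policy_fail"
        "evaluation_report": {"count": len(reports)},
        "results": results,                            # one entry per verified report
        "resolution": {"exit_code": 0},                # 1 or 2 on failure, as above
    }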
@@ -1065,7 +1125,7 @@ def verify_command(
  else:
  # Human-friendly success line
  try:
- last = _load_certificate(certificates[-1]) if certificates else {}
+ last = _load_evaluation_report(reports[-1]) if reports else {}
  pm = last.get("primary_metric", {}) if isinstance(last, dict) else {}
  kind = str(pm.get("kind") or "").strip()
  ppl = last.get("ppl", {}) if isinstance(last, dict) else {}
@@ -1116,7 +1176,7 @@ def verify_command(
  "summary": {"ok": False, "reason": reason},
  "results": [
  {
- "id": str(certificates[0]) if certificates else "",
+ "id": str(reports[0]) if reports else "",
  "schema_version": "v1",
  "kind": "",
  "ok": False,
@@ -1148,7 +1208,7 @@ def verify_command(
  "summary": {"ok": False, "reason": reason},
  "results": [
  {
- "id": str(certificates[0]) if certificates else "",
+ "id": str(reports[0]) if reports else "",
  "schema_version": "v1",
  "kind": "",
  "ok": False,
invarlock/cli/config.py CHANGED
@@ -415,7 +415,7 @@ def _deep_merge_dicts(a: dict, b: dict) -> dict: # pragma: no cover - trivial a
 
  def create_example_config() -> InvarLockConfig: # pragma: no cover - test helper
  return InvarLockConfig(
- model={"id": "gpt2", "adapter": "hf_gpt2", "device": "auto"},
+ model={"id": "gpt2", "adapter": "hf_causal", "device": "auto"},
  edit={"name": "quant_rtn", "plan": {}},
  dataset={"provider": "wikitext2", "seq_len": 512, "stride": 512},
  output={"dir": "runs"},
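The per-family adapters were removed in this release (hf_gpt2.py and hf_llama.py in the file list above), so configs that pinned `adapter: "hf_gpt2"` should move to the consolidated causal adapter; a sketch, assuming InvarLockConfig is importable from this module:

    from invarlock.cli.config import InvarLockConfig

    # 0.3.6 example configs used adapter="hf_gpt2"; 0.3.8 uses "hf_causal".
    config = InvarLockConfig(
        model={"id": "gpt2", "adapter": "hf_causal", "device": "auto"},
        edit={"name": "quant_rtn", "plan": {}},
        dataset={"provider": "wikitext2", "seq_len": 512, "stride": 512},
        output={"dir": "runs"},
    )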
invarlock/cli/determinism.py CHANGED
@@ -5,7 +5,7 @@ Centralizes:
  - Thread caps (OMP/MKL/etc + torch threads)
  - TF32 policy
  - torch deterministic algorithms
- - A structured "determinism level" for certificate provenance
+ - A structured "determinism level" for evaluation report provenance
  """
 
  from __future__ import annotations