invarlock-0.3.6-py3-none-any.whl → invarlock-0.3.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. invarlock/__init__.py +4 -4
  2. invarlock/adapters/__init__.py +10 -14
  3. invarlock/adapters/auto.py +37 -50
  4. invarlock/adapters/capabilities.py +2 -2
  5. invarlock/adapters/hf_causal.py +418 -0
  6. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  7. invarlock/adapters/hf_loading.py +7 -7
  8. invarlock/adapters/hf_mixin.py +53 -9
  9. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  10. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  11. invarlock/assurance/__init__.py +15 -23
  12. invarlock/cli/adapter_auto.py +32 -26
  13. invarlock/cli/app.py +128 -27
  14. invarlock/cli/commands/__init__.py +2 -2
  15. invarlock/cli/commands/calibrate.py +48 -4
  16. invarlock/cli/commands/doctor.py +8 -10
  17. invarlock/cli/commands/evaluate.py +986 -0
  18. invarlock/cli/commands/explain_gates.py +25 -17
  19. invarlock/cli/commands/export_html.py +11 -9
  20. invarlock/cli/commands/plugins.py +13 -9
  21. invarlock/cli/commands/report.py +326 -92
  22. invarlock/cli/commands/run.py +1160 -228
  23. invarlock/cli/commands/verify.py +157 -97
  24. invarlock/cli/config.py +1 -1
  25. invarlock/cli/determinism.py +1 -1
  26. invarlock/cli/doctor_helpers.py +4 -5
  27. invarlock/cli/output.py +193 -0
  28. invarlock/cli/provenance.py +4 -4
  29. invarlock/core/bootstrap.py +1 -1
  30. invarlock/core/registry.py +9 -11
  31. invarlock/core/retry.py +14 -14
  32. invarlock/core/runner.py +112 -26
  33. invarlock/edits/noop.py +2 -2
  34. invarlock/edits/quant_rtn.py +67 -39
  35. invarlock/eval/__init__.py +1 -1
  36. invarlock/eval/bench.py +14 -10
  37. invarlock/eval/data.py +68 -23
  38. invarlock/eval/metrics.py +59 -1
  39. invarlock/eval/primary_metric.py +1 -1
  40. invarlock/eval/tasks/__init__.py +12 -0
  41. invarlock/eval/tasks/classification.py +48 -0
  42. invarlock/eval/tasks/qa.py +36 -0
  43. invarlock/eval/tasks/text_generation.py +102 -0
  44. invarlock/guards/invariants.py +19 -10
  45. invarlock/guards/rmt.py +2 -2
  46. invarlock/guards/spectral.py +1 -1
  47. invarlock/guards/variance.py +2 -2
  48. invarlock/model_profile.py +64 -62
  49. invarlock/observability/health.py +6 -6
  50. invarlock/observability/metrics.py +108 -0
  51. invarlock/plugins/hf_bnb_adapter.py +32 -21
  52. invarlock/reporting/__init__.py +18 -4
  53. invarlock/reporting/guards_analysis.py +154 -4
  54. invarlock/reporting/html.py +61 -11
  55. invarlock/reporting/normalizer.py +9 -2
  56. invarlock/reporting/policy_utils.py +1 -1
  57. invarlock/reporting/primary_metric_utils.py +11 -11
  58. invarlock/reporting/render.py +876 -510
  59. invarlock/reporting/report.py +72 -30
  60. invarlock/reporting/{certificate.py → report_builder.py} +252 -99
  61. invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
  62. invarlock/reporting/report_types.py +6 -1
  63. invarlock/reporting/telemetry.py +86 -0
  64. invarlock-0.3.8.dist-info/METADATA +283 -0
  65. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/RECORD +69 -64
  66. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/WHEEL +1 -1
  67. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/entry_points.txt +5 -3
  68. invarlock/adapters/hf_gpt2.py +0 -404
  69. invarlock/adapters/hf_llama.py +0 -487
  70. invarlock/cli/commands/certify.py +0 -422
  71. invarlock-0.3.6.dist-info/METADATA +0 -588
  72. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/licenses/LICENSE +0 -0
  73. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/top_level.txt +0 -0
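The headline change in this release is the certificate → evaluation-report rename: `certificate.py → report_builder.py`, `certificate_schema.py → report_schema.py`, and, in the `render.py` hunks below, `render_certificate_markdown → render_report_markdown` with `validate_report` replacing the aliased certificate helper. A minimal before/after sketch of a downstream caller, assuming the absolute import paths implied by the file list (only symbols that appear in this diff are used):

```python
# 0.3.6 (removed): from invarlock.reporting import certificate as C
#                  markdown = render_certificate_markdown(cert_dict)
# 0.3.8 sketch -- module paths inferred from this diff, not from package docs:
from invarlock.reporting.render import render_report_markdown
from invarlock.reporting.report_schema import validate_report


def render_if_valid(evaluation_report: dict) -> str | None:
    # render_report_markdown raises ValueError on an invalid structure,
    # so gate on validate_report first, mirroring the renderer's own check.
    if not validate_report(evaluation_report):
        return None
    return render_report_markdown(evaluation_report)
```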
@@ -9,8 +9,7 @@ from typing import Any
 
 import yaml
 
-# Import certificate module for helper access without creating hard cycles
-from . import certificate as C
+from .report_schema import validate_report
 
 # Console Validation Block helpers (allow-list driven)
 _CONSOLE_LABELS_DEFAULT = [
@@ -37,8 +36,10 @@ def _load_console_labels() -> list[str]:
     return list(_CONSOLE_LABELS_DEFAULT)
 
 
-def compute_console_validation_block(certificate: dict[str, Any]) -> dict[str, Any]:
-    """Produce a normalized console validation block from a certificate.
+def compute_console_validation_block(
+    evaluation_report: dict[str, Any],
+) -> dict[str, Any]:
+    """Produce a normalized console validation block from an evaluation report.
 
     Returns a dict with keys:
     - labels: the canonical label list
@@ -47,8 +48,8 @@ def compute_console_validation_block(certificate: dict[str, Any]) -> dict[str, A
       counted only when evaluated.
     """
     labels = _load_console_labels()
-    validation = certificate.get("validation", {}) or {}
-    guard_ctx = certificate.get("guard_overhead", {}) or {}
+    validation = evaluation_report.get("validation", {}) or {}
+    guard_ctx = evaluation_report.get("guard_overhead", {}) or {}
     guard_evaluated = (
         bool(guard_ctx.get("evaluated")) if isinstance(guard_ctx, dict) else False
     )
@@ -113,6 +114,462 @@ def _short_digest(v: str) -> str:
     return v if len(v) <= 16 else (v[:8] + "…" + v[-8:])
 
 
+def _render_executive_dashboard(cert: dict[str, Any]) -> str:
+    """Render executive summary dashboard table."""
+    lines: list[str] = []
+    _append_safety_dashboard_section(lines, cert)
+    return "\n".join(lines).rstrip()
+
+
+def _append_safety_dashboard_section(
+    lines: list[str], evaluation_report: dict[str, Any]
+) -> None:
+    """Append a concise, first-screen dashboard for the evaluation report."""
+    block = compute_console_validation_block(evaluation_report)
+    overall_pass = bool(block.get("overall_pass"))
+    overall_status = (
+        f"{'✅' if overall_pass else '❌'} {'PASS' if overall_pass else 'FAIL'}"
+    )
+
+    validation = evaluation_report.get("validation", {}) or {}
+    pm = evaluation_report.get("primary_metric", {}) or {}
+    auto = evaluation_report.get("auto", {}) or {}
+    tier = str(auto.get("tier") or "balanced").lower()
+
+    # Primary metric summary
+    pm_kind = str(pm.get("kind", "")).lower()
+    pm_basis = pm.get("gating_basis") or pm.get("basis") or "point"
+    pm_ok: bool | None
+    if isinstance(validation, dict) and "primary_metric_acceptable" in validation:
+        pm_ok = bool(validation.get("primary_metric_acceptable"))
+    else:
+        pm_ok = None
+    pm_value = pm.get("ratio_vs_baseline")
+
+    if pm_kind in {"accuracy", "vqa_accuracy"}:
+        measured = f"{pm_value:+.2f} pp" if isinstance(pm_value, int | float) else "N/A"
+        th_map = {
+            "conservative": -0.5,
+            "balanced": -1.0,
+            "aggressive": -2.0,
+            "none": -1.0,
+        }
+        th = th_map.get(tier, -1.0)
+        threshold = f"≥ {th:+.2f} pp ({pm_basis})"
+    else:
+        measured = f"{pm_value:.3f}×" if isinstance(pm_value, int | float) else "N/A"
+        tier_thresholds = {
+            "conservative": 1.05,
+            "balanced": 1.10,
+            "aggressive": 1.20,
+            "none": 1.10,
+        }
+        ratio_limit = tier_thresholds.get(tier, 1.10)
+        target_ratio = auto.get("target_pm_ratio")
+        if isinstance(target_ratio, int | float) and target_ratio > 0:
+            ratio_limit = min(ratio_limit, float(target_ratio))
+        threshold = f"≤ {ratio_limit:.2f}× ({pm_basis})"
+
+    pm_status = (
+        f"{'✅' if pm_ok else '❌'} {measured}"
+        if isinstance(pm_ok, bool)
+        else f"ℹ️ {measured}"
+    )
+
+    # Drift summary (final/preview ratio) when preview/final are numeric
+    drift_ok: bool | None
+    if isinstance(validation, dict) and "preview_final_drift_acceptable" in validation:
+        drift_ok = bool(validation.get("preview_final_drift_acceptable"))
+    else:
+        drift_ok = None
+    drift_val = "N/A"
+    try:
+        pv = (
+            float(pm.get("preview"))
+            if isinstance(pm.get("preview"), int | float)
+            else float("nan")
+        )
+        fv = (
+            float(pm.get("final"))
+            if isinstance(pm.get("final"), int | float)
+            else float("nan")
+        )
+        drift = (
+            fv / pv
+            if (math.isfinite(pv) and pv > 0 and math.isfinite(fv))
+            else float("nan")
+        )
+        if math.isfinite(drift):
+            drift_val = f"{drift:.3f}×"
+    except Exception:
+        drift_val = "N/A"
+    drift_status = (
+        f"{'✅' if drift_ok else '❌'} {drift_val}"
+        if isinstance(drift_ok, bool)
+        else f"ℹ️ {drift_val}"
+    )
+
+    def _gate_cell(key: str, ok_default: bool | None = None) -> str:
+        ok: bool | None
+        if not isinstance(validation, dict):
+            ok = ok_default
+        elif key not in validation:
+            ok = ok_default
+        else:
+            ok = bool(validation.get(key))
+        if ok is None:
+            return "ℹ️ N/A"
+        return "✅ PASS" if ok else "❌ FAIL"
+
+    overhead_ctx = evaluation_report.get("guard_overhead", {}) or {}
+    overhead_evaluated = (
+        bool(overhead_ctx.get("evaluated")) if isinstance(overhead_ctx, dict) else False
+    )
+    overhead_row: tuple[str, str, str] | None = None
+    if overhead_evaluated:
+        overhead_pct = overhead_ctx.get("overhead_percent")
+        overhead_ratio = overhead_ctx.get("overhead_ratio")
+        if isinstance(overhead_pct, int | float) and math.isfinite(float(overhead_pct)):
+            overhead_measured = f"{float(overhead_pct):+.2f}%"
+        elif isinstance(overhead_ratio, int | float) and math.isfinite(
+            float(overhead_ratio)
+        ):
+            overhead_measured = f"{float(overhead_ratio):.3f}×"
+        else:
+            overhead_measured = "N/A"
+        threshold_pct = overhead_ctx.get("threshold_percent")
+        if isinstance(threshold_pct, int | float) and math.isfinite(
+            float(threshold_pct)
+        ):
+            threshold_str = f"≤ +{float(threshold_pct):.1f}%"
+        else:
+            threshold_str = "≤ +1.0%"
+        overhead_row = (
+            "Overhead",
+            f"{'✅' if bool(validation.get('guard_overhead_acceptable', True)) else '❌'} {overhead_measured}"
+            if isinstance(validation, dict)
+            else f"ℹ️ {overhead_measured}",
+            threshold_str,
+        )
+
+    lines.append("## Evaluation Dashboard")
+    lines.append("")
+    lines.append("| Check | Status | Quick Summary |")
+    lines.append("|-------|--------|---------------|")
+    lines.append(f"| Overall | {overall_status} | Canonical gate outcomes |")
+    lines.append(f"| Primary Metric | {pm_status} | {threshold} |")
+    lines.append(f"| Drift | {drift_status} | 0.95–1.05× band |")
+    lines.append(
+        f"| Invariants | {_gate_cell('invariants_pass')} | Model integrity checks |"
+    )
+    lines.append(
+        f"| Spectral | {_gate_cell('spectral_stable')} | Weight matrix spectral norms |"
+    )
+    lines.append(f"| RMT | {_gate_cell('rmt_stable')} | Random Matrix Theory guard |")
+    if overhead_row:
+        lines.append(f"| {overhead_row[0]} | {overhead_row[1]} | {overhead_row[2]} |")
+    lines.append("")
+
+
+def _append_primary_metric_section(
+    lines: list[str], evaluation_report: dict[str, Any]
+) -> None:
+    """Append the Primary Metric section early for quick triage."""
+    pm = evaluation_report.get("primary_metric")
+    if not isinstance(pm, dict) or not pm:
+        return
+
+    kind = pm.get("kind", "unknown")
+    lines.append("## Primary Metric")
+    lines.append("")
+    unit = pm.get("unit", "-")
+    paired = pm.get("paired", False)
+
+    estimated_flag = False
+    try:
+        if bool(pm.get("estimated")):
+            estimated_flag = True
+        elif str(pm.get("counts_source", "")).lower() == "pseudo_config":
+            estimated_flag = True
+    except Exception:
+        estimated_flag = False
+    est_suffix = " (estimated)" if estimated_flag else ""
+
+    lines.append(f"- Kind: {kind} (unit: {unit}){est_suffix}")
+    gating_basis = pm.get("gating_basis") or pm.get("basis")
+    if gating_basis:
+        lines.append(f"- Basis: {gating_basis}")
+    if isinstance(paired, bool):
+        lines.append(f"- Paired: {paired}")
+    reps = pm.get("reps")
+    if isinstance(reps, int | float):
+        lines.append(f"- Bootstrap Reps: {int(reps)}")
+    ci = pm.get("ci") or pm.get("display_ci")
+    if (
+        isinstance(ci, list | tuple)
+        and len(ci) == 2
+        and all(isinstance(x, int | float) for x in ci)
+    ):
+        lines.append(f"- CI: {ci[0]:.3f}–{ci[1]:.3f}")
+
+    prev = pm.get("preview")
+    fin = pm.get("final")
+    ratio = pm.get("ratio_vs_baseline")
+
+    lines.append("")
+    if estimated_flag and str(kind).lower() in {"accuracy", "vqa_accuracy"}:
+        lines.append(
+            "- Note: Accuracy derived from pseudo counts (quick dev preset); use a labeled preset for measured accuracy."
+        )
+    lines.append("| Field | Value |")
+    lines.append("|-------|-------|")
+    lines.append(f"| Preview | {_fmt_by_kind(prev, str(kind))} |")
+    lines.append(f"| Final | {_fmt_by_kind(fin, str(kind))} |")
+
+    if kind in {"accuracy", "vqa_accuracy"}:
+        lines.append(f"| Δ vs Baseline | {_fmt_by_kind(ratio, str(kind))} |")
+        try:
+            base_pt = pm.get("baseline_point")
+            if isinstance(base_pt, int | float) and base_pt < 0.05:
+                lines.append("- Note: baseline < 5%; ratio suppressed; showing Δpp")
+        except Exception:
+            pass
+    else:
+        try:
+            lines.append(f"| Ratio vs Baseline | {float(ratio):.3f} |")
+        except Exception:
+            lines.append("| Ratio vs Baseline | N/A |")
+    lines.append("")
+
+    # Secondary metrics (informational)
+    try:
+        secs = evaluation_report.get("secondary_metrics")
+        if isinstance(secs, list) and secs:
+            lines.append("## Secondary Metrics (informational)")
+            lines.append("")
+            lines.append("| Kind | Preview | Final | vs Baseline | CI |")
+            lines.append("|------|---------|-------|-------------|----|")
+            for m in secs:
+                if not isinstance(m, dict):
+                    continue
+                k = m.get("kind", "?")
+                pv = _fmt_by_kind(m.get("preview"), str(k))
+                fv = _fmt_by_kind(m.get("final"), str(k))
+                rb = m.get("ratio_vs_baseline")
+                try:
+                    rb_str = (
+                        f"{float(rb):.3f}"
+                        if (str(k).startswith("ppl"))
+                        else _fmt_by_kind(rb, str(k))
+                    )
+                except Exception:
+                    rb_str = "N/A"
+                ci = m.get("display_ci") or m.get("ci")
+                if isinstance(ci, tuple | list) and len(ci) == 2:
+                    ci_str = f"{float(ci[0]):.3f}-{float(ci[1]):.3f}"
+                else:
+                    ci_str = "–"
+                lines.append(f"| {k} | {pv} | {fv} | {rb_str} | {ci_str} |")
+            lines.append("")
+    except Exception:
+        pass
+
+
+def _append_policy_configuration_section(
+    lines: list[str], evaluation_report: dict[str, Any]
+) -> None:
+    resolved_policy = evaluation_report.get("resolved_policy")
+    policy_provenance = evaluation_report.get("policy_provenance", {}) or {}
+    has_prov = isinstance(policy_provenance, dict) and bool(policy_provenance)
+    has_resolved = isinstance(resolved_policy, dict) and bool(resolved_policy)
+    if not (has_prov or has_resolved):
+        return
+
+    lines.append("## Policy Configuration")
+    lines.append("")
+
+    tier = None
+    if has_prov:
+        tier = policy_provenance.get("tier")
+    if not tier:
+        tier = (evaluation_report.get("auto", {}) or {}).get("tier")
+    digest_value = None
+    if has_prov:
+        digest_value = policy_provenance.get("policy_digest")
+    if not digest_value:
+        digest_value = (evaluation_report.get("policy_digest", {}) or {}).get(
+            "thresholds_hash"
+        )
+
+    summary_parts: list[str] = []
+    if tier:
+        summary_parts.append(f"**Tier:** {tier}")
+    if digest_value:
+        summary_parts.append(f"**Digest:** `{_short_digest(str(digest_value))}`")
+    if summary_parts:
+        lines.append(" | ".join(summary_parts))
+
+    if has_prov:
+        overrides_list = policy_provenance.get("overrides") or []
+        if overrides_list:
+            lines.append(f"- **Overrides:** {', '.join(overrides_list)}")
+        else:
+            lines.append("- **Overrides:** (none)")
+        if policy_provenance.get("resolved_at"):
+            lines.append(f"- **Resolved At:** {policy_provenance.get('resolved_at')}")
+
+    if has_resolved:
+        lines.append("")
+        lines.append("<details>")
+        lines.append("<summary>Resolved Policy YAML</summary>")
+        lines.append("")
+        lines.append("```yaml")
+        resolved_yaml = yaml.safe_dump(
+            resolved_policy, sort_keys=True, width=80, default_flow_style=False
+        ).strip()
+        for line in resolved_yaml.splitlines():
+            lines.append(line)
+        lines.append("```")
+        lines.append("")
+        lines.append("</details>")
+
+    lines.append("")
+
+
+def _append_dataset_and_provenance_section(
+    lines: list[str], evaluation_report: dict[str, Any]
+) -> None:
+    dataset = evaluation_report.get("dataset", {}) or {}
+    provenance_info = evaluation_report.get("provenance", {}) or {}
+
+    has_dataset = isinstance(dataset, dict) and bool(dataset)
+    has_provenance = isinstance(provenance_info, dict) and bool(provenance_info)
+    if not (has_dataset or has_provenance):
+        return
+
+    lines.append("## Dataset and Provenance")
+    lines.append("")
+
+    if has_dataset:
+        prov = dataset.get("provider") or "unknown"
+        lines.append(f"- **Provider:** {prov}")
+        try:
+            seq_len_val = (
+                int(dataset.get("seq_len"))
+                if isinstance(dataset.get("seq_len"), int | float)
+                else dataset.get("seq_len")
+            )
+        except Exception:  # pragma: no cover - defensive
+            seq_len_val = dataset.get("seq_len")
+        if seq_len_val is not None:
+            lines.append(f"- **Sequence Length:** {seq_len_val}")
+        windows_blk = (
+            dataset.get("windows", {})
+            if isinstance(dataset.get("windows"), dict)
+            else {}
+        )
+        win_prev = windows_blk.get("preview")
+        win_final = windows_blk.get("final")
+        if win_prev is not None and win_final is not None:
+            lines.append(f"- **Windows:** {win_prev} preview + {win_final} final")
+        if windows_blk.get("seed") is not None:
+            lines.append(f"- **Seed:** {windows_blk.get('seed')}")
+        hash_blk = (
+            dataset.get("hash", {}) if isinstance(dataset.get("hash"), dict) else {}
+        )
+        if hash_blk.get("preview_tokens") is not None:
+            lines.append(f"- **Preview Tokens:** {hash_blk.get('preview_tokens'):,}")
+        if hash_blk.get("final_tokens") is not None:
+            lines.append(f"- **Final Tokens:** {hash_blk.get('final_tokens'):,}")
+        if hash_blk.get("total_tokens") is not None:
+            lines.append(f"- **Total Tokens:** {hash_blk.get('total_tokens'):,}")
+        if hash_blk.get("dataset"):
+            lines.append(f"- **Dataset Hash:** {hash_blk.get('dataset')}")
+        tokenizer = dataset.get("tokenizer", {})
+        if isinstance(tokenizer, dict) and (
+            tokenizer.get("name") or tokenizer.get("hash")
+        ):
+            vocab_size = tokenizer.get("vocab_size")
+            vocab_suffix = (
+                f" (vocab {vocab_size})" if isinstance(vocab_size, int) else ""
+            )
+            lines.append(
+                f"- **Tokenizer:** {tokenizer.get('name', 'unknown')}{vocab_suffix}"
+            )
+            if tokenizer.get("hash"):
+                lines.append(f"  - Hash: {tokenizer['hash']}")
+            lines.append(
+                f"  - BOS/EOS: {tokenizer.get('bos_token')} / {tokenizer.get('eos_token')}"
+            )
+            if tokenizer.get("pad_token") is not None:
+                lines.append(f"  - PAD: {tokenizer.get('pad_token')}")
+            if tokenizer.get("add_prefix_space") is not None:
+                lines.append(
+                    f"  - add_prefix_space: {tokenizer.get('add_prefix_space')}"
+                )
+
+    if has_provenance:
+        baseline_info = provenance_info.get("baseline", {}) or {}
+        edited_info = provenance_info.get("edited", {}) or {}
+
+        if baseline_info or edited_info:
+            lines.append("")
+        if baseline_info:
+            lines.append(f"- **Baseline Run ID:** {baseline_info.get('run_id')}")
+            if baseline_info.get("report_hash"):
+                lines.append(f"  - Report Hash: `{baseline_info.get('report_hash')}`")
+            if baseline_info.get("report_path"):
+                lines.append(f"  - Report Path: {baseline_info.get('report_path')}")
+        if edited_info:
+            lines.append(f"- **Edited Run ID:** {edited_info.get('run_id')}")
+            if edited_info.get("report_hash"):
+                lines.append(f"  - Report Hash: `{edited_info.get('report_hash')}`")
+            if edited_info.get("report_path"):
+                lines.append(f"  - Report Path: {edited_info.get('report_path')}")
+
+        provider_digest = provenance_info.get("provider_digest")
+        if isinstance(provider_digest, dict) and provider_digest:
+            ids_d = provider_digest.get("ids_sha256")
+            tok_d = provider_digest.get("tokenizer_sha256")
+            mask_d = provider_digest.get("masking_sha256")
+
+            lines.append("- **Provider Digest:**")
+            if tok_d:
+                lines.append(
+                    f"  - tokenizer_sha256: `{_short_digest(tok_d)}` (full in JSON)"
+                )
+            if ids_d:
+                lines.append(f"  - ids_sha256: `{_short_digest(ids_d)}` (full in JSON)")
+            if mask_d:
+                lines.append(
+                    f"  - masking_sha256: `{_short_digest(mask_d)}` (full in JSON)"
+                )
+
+    try:
+        conf = evaluation_report.get("confidence", {}) or {}
+        if isinstance(conf, dict) and conf.get("label"):
+            lines.append(f"- **Confidence:** {conf.get('label')}")
+    except Exception:
+        pass
+
+    try:
+        pd = evaluation_report.get("policy_digest", {}) or {}
+        if isinstance(pd, dict) and pd:
+            pv = pd.get("policy_version")
+            th = pd.get("thresholds_hash")
+            if pv:
+                lines.append(f"- **Policy Version:** {pv}")
+            if isinstance(th, str) and th:
+                short = th if len(th) <= 16 else (th[:8] + "…" + th[-8:])
+                lines.append(f"- **Thresholds Digest:** `{short}` (full in JSON)")
+            if pd.get("changed"):
+                lines.append("- Note: policy changed")
+    except Exception:
+        pass
+
+    lines.append("")
+
+
 def _fmt_by_kind(x: Any, k: str) -> str:
     try:
         xv = float(x)
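The dashboard's Drift cell added above reduces to a guarded final/preview ratio. A self-contained sketch of that arithmetic (the function name is hypothetical; the NaN guards mirror the added code):

```python
import math


def drift_ratio(preview: object, final: object) -> float:
    # Mirrors the guarded computation in _append_safety_dashboard_section:
    # a non-numeric or non-positive preview yields NaN instead of raising.
    pv = float(preview) if isinstance(preview, int | float) else float("nan")
    fv = float(final) if isinstance(final, int | float) else float("nan")
    if math.isfinite(pv) and pv > 0 and math.isfinite(fv):
        return fv / pv
    return float("nan")


assert f"{drift_ratio(12.0, 12.6):.3f}" == "1.050"  # within the default 0.95-1.05x band
assert math.isnan(drift_ratio(0.0, 12.6))  # zero preview: no usable ratio
```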
@@ -215,13 +672,13 @@ def _append_accuracy_subgroups(lines: list[str], subgroups: dict[str, Any]) -> N
     lines.append("")
 
 
-def _compute_certificate_hash(certificate: dict[str, Any]) -> str:
-    """Compute integrity hash for the certificate.
+def _compute_report_hash(evaluation_report: dict[str, Any]) -> str:
+    """Compute integrity hash for the evaluation_report.
 
     Hash ignores the `artifacts` section for stability across saves.
     """
     # Create a copy without the artifacts section for stable hashing
-    cert_copy = dict(certificate or {})
+    cert_copy = dict(evaluation_report or {})
     cert_copy.pop("artifacts", None)
 
     # Sort keys for deterministic hashing
@@ -231,8 +688,8 @@ def _compute_certificate_hash(certificate: dict[str, Any]) -> str:
     return _hash.sha256(cert_str.encode()).hexdigest()[:16]
 
 
-def build_console_summary_pack(certificate: dict[str, Any]) -> dict[str, Any]:
-    """Build a small, reusable console summary pack from a certificate.
+def build_console_summary_pack(evaluation_report: dict[str, Any]) -> dict[str, Any]:
+    """Build a small, reusable console summary pack from a evaluation_report.
 
     Returns a dict with:
     - overall_pass: bool
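`_compute_report_hash` keeps the digest stable across saves by excluding the `artifacts` section before hashing. A sketch of that contract, assuming a JSON sorted-key serialization (the actual `cert_str` construction sits outside these hunks):

```python
import hashlib
import json


def report_hash(evaluation_report: dict) -> str:
    # Drop the volatile "artifacts" section, serialize deterministically,
    # and keep the first 16 hex chars of SHA-256, as the diffed docstring states.
    body = {k: v for k, v in (evaluation_report or {}).items() if k != "artifacts"}
    blob = json.dumps(body, sort_keys=True, default=str)
    return hashlib.sha256(blob.encode()).hexdigest()[:16]


# Two saves that differ only in artifacts hash identically:
a = {"run_id": "r1", "artifacts": {"generated_at": "t0"}}
b = {"run_id": "r1", "artifacts": {"generated_at": "t1"}}
assert report_hash(a) == report_hash(b)
```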
@@ -240,7 +697,7 @@ def build_console_summary_pack(certificate: dict[str, Any]) -> dict[str, Any]:
     - gate_lines: list of "<Label>: <Status>" strings for each evaluated gate
     - labels: the canonical label list used
     """
-    block = compute_console_validation_block(certificate)
+    block = compute_console_validation_block(evaluation_report)
     overall_pass = bool(block.get("overall_pass"))
     emoji = "✅" if overall_pass else "❌"
     overall_line = f"Overall Status: {emoji} {'PASS' if overall_pass else 'FAIL'}"
@@ -261,38 +718,38 @@ def build_console_summary_pack(certificate: dict[str, Any]) -> dict[str, Any]:
     }
 
 
-def render_certificate_markdown(certificate: dict[str, Any]) -> str:
+def render_report_markdown(evaluation_report: dict[str, Any]) -> str:
     """
-    Render a certificate as a formatted Markdown report with pretty tables.
+    Render an evaluation report as a formatted Markdown report with pretty tables.
 
-    This implementation is moved from certificate.py to keep that module lean.
-    To avoid circular import issues, we alias helpers from the certificate
-    module inside the function body.
+    This implementation is moved from report_builder.py to keep that module lean.
     """
-    # Alias frequently used helpers locally to avoid editing the large body
-    validate_certificate = C.validate_certificate
-
-    if not validate_certificate(certificate):
-        raise ValueError("Invalid certificate structure")
+    if not validate_report(evaluation_report):
+        raise ValueError("Invalid evaluation report structure")
 
-    lines = []
-    edit_name = str(certificate.get("edit_name") or "").lower()
+    lines: list[str] = []
+    appendix_lines: list[str] = []
+    edit_name = str(evaluation_report.get("edit_name") or "").lower()
 
     # Header
-    lines.append("# InvarLock Safety Certificate")
+    lines.append("# InvarLock Evaluation Report")
     lines.append("")
     lines.append(
         "> *Basis: “point” gates check the point estimate; “upper” gates check the CI "
         "upper bound; “point & upper” requires both to pass.*"
     )
     lines.append("")
-    lines.append(f"**Schema Version:** {certificate['schema_version']}")
-    lines.append(f"**Run ID:** `{certificate['run_id']}`")
-    lines.append(f"**Generated:** {certificate['artifacts']['generated_at']}")
-    lines.append(f"**Edit Type:** {certificate.get('edit_name', 'Unknown')}")
+    lines.append(f"**Schema Version:** {evaluation_report['schema_version']}")
+    lines.append(f"**Run ID:** `{evaluation_report['run_id']}`")
+    lines.append(f"**Generated:** {evaluation_report['artifacts']['generated_at']}")
+    lines.append(f"**Edit Type:** {evaluation_report.get('edit_name', 'Unknown')}")
+    lines.append("")
+    lines.append(
+        "> Full evidence: see [`evaluation.report.json`](evaluation.report.json) for complete provenance, digests, and raw measurements."
+    )
     lines.append("")
 
-    plugins = certificate.get("plugins", {})
+    plugins = evaluation_report.get("plugins", {})
     if isinstance(plugins, dict) and plugins:
         lines.append("## Plugin Provenance")
         lines.append("")
@@ -314,12 +771,12 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         ]
         if guard_entries:
             lines.append("- Guards:\n  - " + "\n  - ".join(guard_entries))
-            lines.append("")
+        lines.append("")
 
     # Executive Summary with validation status (canonical, from console block)
     lines.append("## Executive Summary")
     lines.append("")
-    _block = compute_console_validation_block(certificate)
+    _block = compute_console_validation_block(evaluation_report)
     overall_pass = bool(_block.get("overall_pass"))
     status_emoji = "✅" if overall_pass else "❌"
     lines.append(
@@ -328,13 +785,13 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     # Window Plan one-liner for quick audit
     try:
         plan_ctx = (
-            certificate.get("window_plan")
-            or certificate.get("dataset", {}).get("windows", {})
-            or certificate.get("ppl", {}).get("window_plan")
+            evaluation_report.get("window_plan")
+            or evaluation_report.get("dataset", {}).get("windows", {})
+            or evaluation_report.get("ppl", {}).get("window_plan")
         )
-        seq_len = certificate.get("dataset", {}).get("seq_len") or certificate.get(
-            "dataset", {}
-        ).get("sequence_length")
+        seq_len = evaluation_report.get("dataset", {}).get(
+            "seq_len"
+        ) or evaluation_report.get("dataset", {}).get("sequence_length")
         if isinstance(plan_ctx, dict):
             profile = plan_ctx.get("profile")
             preview_n = (
@@ -354,15 +811,34 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         pass
     lines.append("")
 
+    dashboard = _render_executive_dashboard(evaluation_report)
+    if dashboard:
+        lines.extend(dashboard.splitlines())
+        lines.append("")
+
+    lines.append("## Contents")
+    lines.append("")
+    lines.append("- [Evaluation Dashboard](#evaluation-dashboard)")
+    lines.append("- [Quality Gates](#quality-gates)")
+    lines.append("- [Guard Check Details](#guard-check-details)")
+    lines.append("- [Primary Metric](#primary-metric)")
+    lines.append("- [Guard Observability](#guard-observability)")
+    lines.append("- [Model Information](#model-information)")
+    lines.append("- [Dataset and Provenance](#dataset-and-provenance)")
+    lines.append("- [Policy Configuration](#policy-configuration)")
+    lines.append("- [Appendix](#appendix)")
+    lines.append("- [Evaluation Report Integrity](#evaluation-report-integrity)")
+    lines.append("")
+
     # Validation table with canonical gates (mirrors console allow-list)
     lines.append("## Quality Gates")
     lines.append("")
     lines.append("| Gate | Status | Measured | Threshold | Basis | Description |")
     lines.append("|------|--------|----------|-----------|-------|-------------|")
 
-    pm_block = certificate.get("primary_metric", {}) or {}
+    pm_block = evaluation_report.get("primary_metric", {}) or {}
     has_pm = isinstance(pm_block, dict) and bool(pm_block)
-    auto_info = certificate.get("auto", {})
+    auto_info = evaluation_report.get("auto", {})
     tier = (auto_info.get("tier") or "balanced").lower()
 
     # Helper to emit Primary Metric Acceptable row
@@ -371,7 +847,9 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         value = pm_block.get("ratio_vs_baseline")
         gating_basis = pm_block.get("gating_basis") or "point"
         ok = bool(
-            certificate.get("validation", {}).get("primary_metric_acceptable", True)
+            evaluation_report.get("validation", {}).get(
+                "primary_metric_acceptable", True
+            )
         )
         status = "✅ PASS" if ok else "❌ FAIL"
         if pm_kind in {"accuracy", "vqa_accuracy"}:
@@ -405,11 +883,36 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     # Helper to emit Preview Final Drift Acceptable row
     def _emit_drift_gate_row() -> None:
         ok = bool(
-            certificate.get("validation", {}).get(
+            evaluation_report.get("validation", {}).get(
                 "preview_final_drift_acceptable", True
             )
         )
         status = "✅ PASS" if ok else "❌ FAIL"
+        drift_min = 0.95
+        drift_max = 1.05
+        try:
+            drift_band = (
+                pm_block.get("drift_band") if isinstance(pm_block, dict) else None
+            )
+            if isinstance(drift_band, dict):
+                lo = drift_band.get("min")
+                hi = drift_band.get("max")
+                if isinstance(lo, int | float) and isinstance(hi, int | float):
+                    lo_f = float(lo)
+                    hi_f = float(hi)
+                    if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
+                        drift_min = lo_f
+                        drift_max = hi_f
+            elif isinstance(drift_band, list | tuple) and len(drift_band) == 2:
+                lo_raw, hi_raw = drift_band[0], drift_band[1]
+                if isinstance(lo_raw, int | float) and isinstance(hi_raw, int | float):
+                    lo_f = float(lo_raw)
+                    hi_f = float(hi_raw)
+                    if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
+                        drift_min = lo_f
+                        drift_max = hi_f
+        except Exception:
+            pass
         # Compute drift from PM preview/final when available
         try:
             pv = (
@@ -430,18 +933,21 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         except Exception:
             drift = float("nan")
         measured = f"{drift:.3f}x" if math.isfinite(drift) else "N/A"
+        band_label = f"{drift_min:.2f}–{drift_max:.2f}x"
         lines.append(
-            f"| Preview Final Drift Acceptable | {status} | {measured} | 0.95–1.05x | point | Final/Preview ratio stability |"
+            f"| Preview Final Drift Acceptable | {status} | {measured} | {band_label} | point | Final/Preview ratio stability |"
         )
 
     # Helper to emit Guard Overhead Acceptable row (only when evaluated)
     def _emit_overhead_gate_row() -> None:
-        guard_overhead = certificate.get("guard_overhead", {}) or {}
+        guard_overhead = evaluation_report.get("guard_overhead", {}) or {}
         evaluated = bool(guard_overhead.get("evaluated"))
         if not evaluated:
             return
         ok = bool(
-            certificate.get("validation", {}).get("guard_overhead_acceptable", True)
+            evaluation_report.get("validation", {}).get(
+                "guard_overhead_acceptable", True
+            )
         )
         status = "✅ PASS" if ok else "❌ FAIL"
         overhead_pct = guard_overhead.get("overhead_percent")
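The reworked drift gate above no longer hard-codes the 0.95–1.05× band: it accepts an optional `drift_band` override from the primary-metric block, in either mapping or pair form, and rejects malformed bands. A standalone sketch of that resolution (function name hypothetical):

```python
import math


def resolve_drift_band(drift_band: object) -> tuple[float, float]:
    # Accept {"min": ..., "max": ...} or a two-element list/tuple; fall back
    # to the default 0.95-1.05 band when the override is absent or malformed.
    lo, hi = 0.95, 1.05
    if isinstance(drift_band, dict):
        cand = (drift_band.get("min"), drift_band.get("max"))
    elif isinstance(drift_band, list | tuple) and len(drift_band) == 2:
        cand = (drift_band[0], drift_band[1])
    else:
        return lo, hi
    if all(isinstance(x, int | float) for x in cand):
        lo_f, hi_f = float(cand[0]), float(cand[1])
        if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
            return lo_f, hi_f
    return lo, hi


assert resolve_drift_band({"min": 0.9, "max": 1.1}) == (0.9, 1.1)
assert resolve_drift_band([1.1, 0.9]) == (0.95, 1.05)  # inverted band is rejected
```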
@@ -469,7 +975,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         )
 
     def _emit_pm_tail_gate_row() -> None:
-        pm_tail = certificate.get("primary_metric_tail", {}) or {}
+        pm_tail = evaluation_report.get("primary_metric_tail", {}) or {}
         if not isinstance(pm_tail, dict) or not pm_tail:
             return
 
@@ -479,7 +985,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         warned = bool(pm_tail.get("warned", False))
 
         if not evaluated:
-            status = "🛈 INFO"
+            status = "ℹ️ INFO"
         elif passed:
             status = "✅ PASS"
         elif mode == "fail":
@@ -536,17 +1042,17 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     _emit_overhead_gate_row()
 
     # Annotate hysteresis usage if applied
-    if certificate.get("validation", {}).get("hysteresis_applied"):
+    if evaluation_report.get("validation", {}).get("hysteresis_applied"):
         lines.append("- Note: hysteresis applied to gate boundary")
 
     lines.append("")
-    lines.append("## Safety Check Details")
+    lines.append("## Guard Check Details")
     lines.append("")
-    lines.append("| Safety Check | Status | Measured | Threshold | Description |")
+    lines.append("| Guard Check | Status | Measured | Threshold | Description |")
     lines.append("|--------------|--------|----------|-----------|-------------|")
 
-    inv_summary = certificate["invariants"]
-    validation = certificate.get("validation", {})
+    inv_summary = evaluation_report["invariants"]
+    validation = evaluation_report.get("validation", {})
     inv_status = "✅ PASS" if validation.get("invariants_pass", False) else "❌ FAIL"
     inv_counts = inv_summary.get("summary", {}) or {}
     inv_measure = inv_summary.get("status", "pass").upper()
@@ -578,23 +1084,23 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         lines.append(f"- Non-fatal: {non_fatal_message}")
 
     spec_status = "✅ PASS" if validation.get("spectral_stable", False) else "❌ FAIL"
-    caps_applied = certificate["spectral"]["caps_applied"]
+    caps_applied = evaluation_report["spectral"]["caps_applied"]
     lines.append(
         f"| Spectral Stability | {spec_status} | {caps_applied} violations | < 5 | Weight matrix spectral norms |"
     )
 
     # Catastrophic spike safety stop row is now driven by primary metric flags
-    if isinstance(certificate.get("primary_metric"), dict):
+    if isinstance(evaluation_report.get("primary_metric"), dict):
         pm_ok = bool(validation.get("primary_metric_acceptable", True))
-        pm_ratio = certificate.get("primary_metric", {}).get("ratio_vs_baseline")
+        pm_ratio = evaluation_report.get("primary_metric", {}).get("ratio_vs_baseline")
         if isinstance(pm_ratio, int | float):
             lines.append(
-                f"| Catastrophic Spike Gate (safety stop) | {'✅ PASS' if pm_ok else '❌ FAIL'} | {pm_ratio:.3f}x | ≤ 2.0x | Hard stop @ 2.0× |"
+                f"| Catastrophic Spike Gate (hard stop) | {'✅ PASS' if pm_ok else '❌ FAIL'} | {pm_ratio:.3f}x | ≤ 2.0x | Hard stop @ 2.0× |"
             )
 
     # Include RMT Health row for compatibility and clarity
     rmt_status = "✅ PASS" if validation.get("rmt_stable", False) else "❌ FAIL"
-    rmt_state = certificate.get("rmt", {}).get("status", "unknown").title()
+    rmt_state = evaluation_report.get("rmt", {}).get("status", "unknown").title()
     lines.append(
         f"| RMT Health | {rmt_status} | {rmt_state} | ε-rule | Random Matrix Theory guard status |"
     )
@@ -602,8 +1108,8 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     # Pairing + Bootstrap snapshot (quick audit surface)
     try:
         stats = (
-            certificate.get("dataset", {}).get("windows", {}).get("stats", {})
-            or certificate.get("ppl", {}).get("stats", {})
+            evaluation_report.get("dataset", {}).get("windows", {}).get("stats", {})
+            or evaluation_report.get("ppl", {}).get("stats", {})
             or {}
         )
         paired_windows = stats.get("paired_windows")
@@ -616,24 +1122,51 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
             or overlap_frac is not None
         ):
             lines.append("")
-            lines.append(
-                f"- Pairing: paired={paired_windows}, match={match_frac:.3f}, overlap={overlap_frac:.3f}"
-            )
+            parts: list[str] = []
+            if paired_windows is not None:
+                try:
+                    parts.append(f"{int(paired_windows)} windows")
+                except Exception:
+                    parts.append(f"windows={paired_windows}")
+            if isinstance(match_frac, int | float) and math.isfinite(float(match_frac)):
+                parts.append(f"{float(match_frac) * 100.0:.1f}% match")
+            elif match_frac is not None:
+                parts.append(f"match={match_frac}")
+            if isinstance(overlap_frac, int | float) and math.isfinite(
+                float(overlap_frac)
+            ):
+                parts.append(f"{float(overlap_frac) * 100.0:.1f}% overlap")
+            elif overlap_frac is not None:
+                parts.append(f"overlap={overlap_frac}")
+            lines.append(f"- ✅ Pairing: {', '.join(parts) if parts else 'N/A'}")
         if isinstance(bootstrap, dict):
             reps = bootstrap.get("replicates")
             bseed = bootstrap.get("seed")
             if reps is not None or bseed is not None:
-                lines.append(f"- Bootstrap: replicates={reps}, seed={bseed}")
+                bits: list[str] = []
+                if reps is not None:
+                    try:
+                        bits.append(f"{int(reps)} replicates")
+                    except Exception:
+                        bits.append(f"replicates={reps}")
+                if bseed is not None:
+                    try:
+                        bits.append(f"seed={int(bseed)}")
+                    except Exception:
+                        bits.append(f"seed={bseed}")
+                lines.append(f"- ✅ Bootstrap: {', '.join(bits) if bits else 'N/A'}")
         # Optional: show log-space paired Δ CI next to ratio CI for clarity
-        delta_ci = certificate.get("primary_metric", {}).get("ci") or certificate.get(
-            "ppl", {}
-        ).get("logloss_delta_ci")
+        delta_ci = evaluation_report.get("primary_metric", {}).get(
+            "ci"
+        ) or evaluation_report.get("ppl", {}).get("logloss_delta_ci")
         if (
             isinstance(delta_ci, tuple | list)
             and len(delta_ci) == 2
             and all(isinstance(x, int | float) for x in delta_ci)
         ):
-            lines.append(f"- Log Δ (paired) CI: [{delta_ci[0]:.6f}, {delta_ci[1]:.6f}]")
+            lines.append(
+                f"- ℹ️ Log Δ (paired) CI: [{delta_ci[0]:.6f}, {delta_ci[1]:.6f}]"
+            )
     except Exception:
         pass
 
@@ -654,124 +1187,198 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
 
     lines.append("")
 
+    _append_primary_metric_section(lines, evaluation_report)
+
     # Guard observability snapshots
     lines.append("## Guard Observability")
     lines.append("")
 
-    spectral_info = certificate.get("spectral", {}) or {}
+    spectral_info = evaluation_report.get("spectral", {}) or {}
     if spectral_info:
-        lines.append("### Spectral Guard")
+        lines.append("### Spectral Guard Summary")
         lines.append("")
-        mt_info = spectral_info.get("multiple_testing", {}) or {}
-        if mt_info:
-            lines.append("- **Multiple Testing:**")
-            lines.append("  ```yaml")
-            mt_yaml = (
-                yaml.safe_dump(mt_info, sort_keys=True, width=70).strip().splitlines()
-            )
-            for line in mt_yaml:
-                lines.append(f"  {line}")
-            lines.append("  ```")
-        # Spectral summary (place key knobs together for quick scan)
-        spec_sigma = spectral_info.get("sigma_quantile")
-        spec_deadband = spectral_info.get("deadband")
-        spec_max_caps = spectral_info.get("max_caps")
-        summary_yaml = {
-            "sigma_quantile": float(spec_sigma)
-            if isinstance(spec_sigma, int | float)
-            else None,
-            "deadband": float(spec_deadband)
-            if isinstance(spec_deadband, int | float)
-            else None,
-            "max_caps": int(spec_max_caps)
-            if isinstance(spec_max_caps, int | float)
-            else None,
-        }
-        # Drop Nones from summary
-        summary_yaml = {k: v for k, v in summary_yaml.items() if v is not None}
-        if summary_yaml:
-            lines.append("- **Spectral Summary:**")
-            lines.append("  ```yaml")
-            for line in (
-                yaml.safe_dump(summary_yaml, sort_keys=True, width=70)
-                .strip()
-                .splitlines()
-            ):
-                lines.append(f"  {line}")
-            lines.append("  ```")
+        lines.append("| Metric | Value | Status |")
+        lines.append("|--------|-------|--------|")
+
+        spectral_ok = bool(validation.get("spectral_stable", False))
+        caps_applied = spectral_info.get("caps_applied")
+        max_caps = spectral_info.get("max_caps")
+        caps_val = (
+            f"{caps_applied}/{max_caps}"
+            if caps_applied is not None and max_caps is not None
+            else "-"
+        )
         lines.append(
-            f"- Caps Applied: {spectral_info.get('caps_applied')} / {spectral_info.get('max_caps')}"
+            f"| Caps Applied | {caps_val} | {'✅ OK' if spectral_ok else '❌ FAIL'} |"
         )
+
         summary = spectral_info.get("summary", {}) or {}
-        lines.append(f"- Caps Exceeded: {summary.get('caps_exceeded', False)}")
-        caps_by_family = spectral_info.get("caps_applied_by_family") or {}
+        caps_exceeded = summary.get("caps_exceeded")
+        if caps_exceeded is not None:
+            cap_status = "✅ OK" if not bool(caps_exceeded) else "⚠️ WARN"
+            lines.append(f"| Caps Exceeded | {caps_exceeded} | {cap_status} |")
+
+        top_scores = spectral_info.get("top_z_scores") or {}
+        max_family: str | None = None
+        max_module: str | None = None
+        max_abs_z: float | None = None
+        if isinstance(top_scores, dict):
+            for family, entries in top_scores.items():
+                if not isinstance(entries, list):
+                    continue
+                for entry in entries:
+                    if not isinstance(entry, dict):
+                        continue
+                    z_val = entry.get("z")
+                    if not (
+                        isinstance(z_val, int | float) and math.isfinite(float(z_val))
+                    ):
+                        continue
+                    z_abs = abs(float(z_val))
+                    if max_abs_z is None or z_abs > max_abs_z:
+                        max_abs_z = z_abs
+                        max_family = str(family)
+                        max_module = (
+                            str(entry.get("module")) if entry.get("module") else None
+                        )
+
         family_caps = spectral_info.get("family_caps") or {}
-        if caps_by_family:
-            lines.append("")
-            lines.append("| Family | κ | Violations |")
-            lines.append("|--------|---|------------|")
-            for family, count in caps_by_family.items():
-                kappa = family_caps.get(family, {}).get("kappa")
-                if isinstance(kappa, int | float) and math.isfinite(float(kappa)):
-                    kappa_str = f"{kappa:.3f}"
-                else:
-                    kappa_str = "-"
-                lines.append(f"| {family} | {kappa_str} | {count} |")
-            lines.append("")
+        kappa = None
+        if max_family and isinstance(family_caps, dict):
+            try:
+                kappa = (family_caps.get(max_family, {}) or {}).get("kappa")
+            except Exception:
+                kappa = None
+        kappa_f = (
+            float(kappa)
+            if isinstance(kappa, int | float) and math.isfinite(float(kappa))
+            else None
+        )
+
+        if max_abs_z is not None:
+            max_val = f"{max_abs_z:.3f}"
+            if max_family:
+                max_val += f" ({max_family})"
+            if max_module:
+                max_val += f" – {max_module}"
+            if kappa_f is None:
+                max_status = "ℹ️ No κ"
+            elif max_abs_z <= kappa_f:
+                max_status = f"✅ Within κ={kappa_f:.3f}"
+            else:
+                max_status = f"❌ Exceeds κ={kappa_f:.3f}"
+            lines.append(f"| Max |z| | {max_val} | {max_status} |")
+
+        mt_info = spectral_info.get("multiple_testing", {}) or {}
+        if isinstance(mt_info, dict) and mt_info:
+            mt_method = mt_info.get("method")
+            mt_alpha = mt_info.get("alpha")
+            mt_m = mt_info.get("m")
+            parts: list[str] = []
+            if mt_method:
+                parts.append(f"method={mt_method}")
+            if isinstance(mt_alpha, int | float) and math.isfinite(float(mt_alpha)):
+                parts.append(f"α={float(mt_alpha):.3g}")
+            if isinstance(mt_m, int | float) and math.isfinite(float(mt_m)):
+                parts.append(f"m={int(mt_m)}")
+            lines.append(
+                f"| Multiple Testing | {', '.join(parts) if parts else '—'} | ℹ️ INFO |"
+            )
+
+        lines.append("")
+
+        caps_by_family = spectral_info.get("caps_applied_by_family") or {}
         quantiles = spectral_info.get("family_z_quantiles") or {}
-        if quantiles:
-            lines.append("| Family | q95 | q99 | Max | Samples |")
-            lines.append("|--------|-----|-----|-----|---------|")
-            for family, stats in quantiles.items():
-                q95 = stats.get("q95")
-                q99 = stats.get("q99")
-                max_z = stats.get("max")
-                count = stats.get("count")
+        if any(
+            bool(x)
+            for x in (caps_by_family, quantiles, family_caps, top_scores)
+            if isinstance(x, dict)
+        ):
+            lines.append("<details>")
+            lines.append("<summary>Per-family details</summary>")
+            lines.append("")
+            lines.append("| Family | κ | q95 | Max |z| | Violations |")
+            lines.append("|--------|---|-----|--------|------------|")
+
+            families: set[str] = set()
+            for block in (caps_by_family, quantiles, family_caps, top_scores):
+                if isinstance(block, dict):
+                    families.update(str(k) for k in block.keys())
+
+            for family in sorted(families):
+                kappa = None
+                if isinstance(family_caps, dict):
+                    kappa = (family_caps.get(family, {}) or {}).get("kappa")
+                kappa_str = (
+                    f"{float(kappa):.3f}"
+                    if isinstance(kappa, int | float) and math.isfinite(float(kappa))
+                    else "-"
+                )
+
+                q95 = None
+                max_z = None
+                if isinstance(quantiles, dict):
+                    stats = quantiles.get(family) or {}
+                    if isinstance(stats, dict):
+                        q95 = stats.get("q95")
+                        max_z = stats.get("max")
                 q95_str = f"{q95:.3f}" if isinstance(q95, int | float) else "-"
-                q99_str = f"{q99:.3f}" if isinstance(q99, int | float) else "-"
                 max_str = f"{max_z:.3f}" if isinstance(max_z, int | float) else "-"
-                count_str = str(count) if isinstance(count, int | float) else "-"
+
+                violations = None
+                if isinstance(caps_by_family, dict):
+                    violations = caps_by_family.get(family)
+                v_str = (
+                    str(int(violations)) if isinstance(violations, int | float) else "0"
+                )
+
                 lines.append(
-                    f"| {family} | {q95_str} | {q99_str} | {max_str} | {count_str} |"
+                    f"| {family} | {kappa_str} | {q95_str} | {max_str} | {v_str} |"
                 )
+
+            if isinstance(top_scores, dict) and top_scores:
+                lines.append("")
+                lines.append("Top |z| per family:")
+                for family in sorted(top_scores.keys()):
+                    entries = top_scores[family]
+                    if not isinstance(entries, list) or not entries:
+                        continue
+                    formatted_entries = []
+                    for entry in entries:
+                        if not isinstance(entry, dict):
+                            continue
+                        module_name = entry.get("module", "unknown")
+                        z_val = entry.get("z")
+                        if isinstance(z_val, int | float) and math.isfinite(
+                            float(z_val)
+                        ):
+                            z_str = f"{z_val:.3f}"
+                        else:
+                            z_str = "n/a"
+                        formatted_entries.append(f"{module_name} (|z|={z_str})")
+                    lines.append(f"- {family}: {', '.join(formatted_entries)}")
+
             lines.append("")
-        policy_caps = spectral_info.get("policy", {}).get("family_caps")
-        if policy_caps:
-            lines.append("- **Family κ (policy):**")
-            lines.append("  ```yaml")
-            caps_yaml = (
-                yaml.safe_dump(policy_caps, sort_keys=True, width=70)
-                .strip()
-                .splitlines()
-            )
-            for line in caps_yaml:
-                lines.append(f"  {line}")
-            lines.append("  ```")
-        top_scores = spectral_info.get("top_z_scores") or {}
-        if top_scores:
-            lines.append("Top |z| per family:")
-            for family in sorted(top_scores.keys()):
-                entries = top_scores[family]
-                if not entries:
-                    continue
-                formatted_entries = []
-                for entry in entries:
-                    module_name = entry.get("module", "unknown")
-                    z_val = entry.get("z")
-                    if isinstance(z_val, int | float) and math.isfinite(float(z_val)):
-                        z_str = f"{z_val:.3f}"
-                    else:
-                        z_str = "n/a"
-                    formatted_entries.append(f"{module_name} (|z|={z_str})")
-                lines.append(f"- {family}: {', '.join(formatted_entries)}")
+            lines.append("</details>")
         lines.append("")
 
-    rmt_info = certificate.get("rmt", {}) or {}
+    rmt_info = evaluation_report.get("rmt", {}) or {}
     if rmt_info:
         lines.append("### RMT Guard")
         lines.append("")
         families = rmt_info.get("families") or {}
+        stable = bool(rmt_info.get("stable", True))
+        status = "✅ OK" if stable else "❌ FAIL"
+        delta_total = rmt_info.get("delta_total")
+        if isinstance(delta_total, int):
+            lines.append(f"- Δ total: {delta_total:+d}")
+        lines.append(f"- Status: {status}")
+        lines.append(f"- Families: {len(families)}")
         if families:
+            lines.append("")
+            lines.append("<details>")
+            lines.append("<summary>RMT family details</summary>")
+            lines.append("")
             lines.append("| Family | ε_f | Bare | Guarded | Δ |")
             lines.append("|--------|-----|------|---------|---|")
             for family, data in families.items():
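The new Spectral Guard Summary condenses per-module z-scores into one headline row: the largest finite |z| across all families, judged against that family's κ cap. A sketch of just the scan (helper name hypothetical; the input shape matches the `top_z_scores` structure used above):

```python
import math


def max_abs_z(top_scores: dict) -> tuple[float, str, object] | None:
    # Walk {family: [{"module": ..., "z": ...}, ...]} and keep the largest
    # finite |z|, together with its family and module, as the summary row does.
    best: tuple[float, str, object] | None = None
    for family, entries in (top_scores or {}).items():
        if not isinstance(entries, list):
            continue
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            z = entry.get("z")
            if not (isinstance(z, int | float) and math.isfinite(float(z))):
                continue
            z_abs = abs(float(z))
            if best is None or z_abs > best[0]:
                best = (z_abs, str(family), entry.get("module"))
    return best


scores = {"attn": [{"module": "h.3.attn", "z": -4.2}], "mlp": [{"module": "h.1.mlp", "z": 3.9}]}
assert max_abs_z(scores) == (4.2, "attn", "h.3.attn")
```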
@@ -801,14 +1408,12 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
                     f"| {family} | {epsilon_str} | {bare_str} | {guarded_str} | {delta_str} |"
                 )
             lines.append("")
-        # Delta total and stability flags
-        delta_total = rmt_info.get("delta_total")
-        if isinstance(delta_total, int):
-            lines.append(f"- Δ total: {delta_total:+d}")
-        lines.append(f"- Stable: {rmt_info.get('stable', True)}")
-        lines.append("")
+            lines.append("</details>")
+            lines.append("")
+        else:
+            lines.append("")
 
-    guard_overhead_info = certificate.get("guard_overhead", {}) or {}
+    guard_overhead_info = evaluation_report.get("guard_overhead", {}) or {}
     if guard_overhead_info:
         lines.append("### Guard Overhead")
         lines.append("")
@@ -836,7 +1441,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         overhead_source = guard_overhead_info.get("source")
         if overhead_source:
             lines.append(f"- Source: {overhead_source}")
-        plan_ctx = certificate.get("provenance", {}).get("window_plan", {})
+        plan_ctx = evaluation_report.get("provenance", {}).get("window_plan", {})
         if isinstance(plan_ctx, dict) and plan_ctx:
             plan_preview = (
                 plan_ctx.get("preview_n")
@@ -855,34 +1460,34 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     lines.append("")
 
     compression_diag = (
-        certificate.get("structure", {}).get("compression_diagnostics", {})
-        if isinstance(certificate.get("structure"), dict)
+        evaluation_report.get("structure", {}).get("compression_diagnostics", {})
+        if isinstance(evaluation_report.get("structure"), dict)
         else {}
     )
     inference_flags = compression_diag.get("inferred") or {}
     inference_sources = compression_diag.get("inference_source") or {}
     inference_log = compression_diag.get("inference_log") or []
     if inference_flags or inference_sources or inference_log:
-        lines.append("## Inference")
-        lines.append("")
+        appendix_lines.append("### Inference Diagnostics")
+        appendix_lines.append("")
         if inference_flags:
-            lines.append("- **Fields Inferred:**")
+            appendix_lines.append("- **Fields Inferred:**")
             for field, flag in inference_flags.items():
-                lines.append(f"  - {field}: {'yes' if flag else 'no'}")
+                appendix_lines.append(f"  - {field}: {'yes' if flag else 'no'}")
         if inference_sources:
-            lines.append("- **Sources:**")
+            appendix_lines.append("- **Sources:**")
             for field, source in inference_sources.items():
-                lines.append(f"  - {field}: {source}")
+                appendix_lines.append(f"  - {field}: {source}")
        if inference_log:
-            lines.append("- **Inference Log:**")
+            appendix_lines.append("- **Inference Log:**")
             for entry in inference_log:
-                lines.append(f"  - {entry}")
-        lines.append("")
+                appendix_lines.append(f"  - {entry}")
+        appendix_lines.append("")
 
     # Model and Configuration
     lines.append("## Model Information")
     lines.append("")
-    meta = certificate["meta"]
+    meta = evaluation_report["meta"]
     lines.append(f"- **Model ID:** {meta.get('model_id')}")
     lines.append(f"- **Adapter:** {meta.get('adapter')}")
     lines.append(f"- **Device:** {meta.get('device')}")
@@ -906,34 +1511,54 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
  if invarlock_version:
  lines.append(f"- **InvarLock Version:** {invarlock_version}")
  env_flags = meta.get("env_flags")
- if isinstance(env_flags, dict) and env_flags:
- lines.append("- **Env Flags:**")
- lines.append(" ```yaml")
- for k, v in env_flags.items():
- lines.append(f" {k}: {v}")
- lines.append(" ```")
- # Determinism flags (if present)
  cuda_flags = meta.get("cuda_flags")
+
+ # Compressed determinism/environment summary for readability
+ det_parts: list[str] = []
+ for label, keys in (
+ ("torch_det", ("torch_deterministic_algorithms", "deterministic_algorithms")),
+ ("cudnn_det", ("cudnn_deterministic",)),
+ ("cudnn_bench", ("cudnn_benchmark",)),
+ ("tf32_matmul", ("cuda_matmul_allow_tf32",)),
+ ("tf32_cudnn", ("cudnn_allow_tf32",)),
+ ("cublas_ws", ("CUBLAS_WORKSPACE_CONFIG",)),
+ ):
+ val = None
+ for key in keys:
+ if isinstance(env_flags, dict) and env_flags.get(key) is not None:
+ val = env_flags.get(key)
+ break
+ if isinstance(cuda_flags, dict) and cuda_flags.get(key) is not None:
+ val = cuda_flags.get(key)
+ break
+ if val is not None:
+ det_parts.append(f"{label}={val}")
+ if det_parts:
+ lines.append(f"- **Determinism:** {', '.join(det_parts)}")
+
+ full_flags: dict[str, Any] = {}
+ if isinstance(env_flags, dict) and env_flags:
+ full_flags["env_flags"] = env_flags
  if isinstance(cuda_flags, dict) and cuda_flags:
- parts = []
- for key in (
- "deterministic_algorithms",
- "cudnn_deterministic",
- "cudnn_benchmark",
- "cudnn_allow_tf32",
- "cuda_matmul_allow_tf32",
- "CUBLAS_WORKSPACE_CONFIG",
- ):
- if key in cuda_flags and cuda_flags[key] is not None:
- parts.append(f"{key}={cuda_flags[key]}")
- if parts:
- lines.append(f"- **Determinism Flags:** {', '.join(parts)}")
+ full_flags["cuda_flags"] = cuda_flags
+ if full_flags:
+ lines.append("")
+ lines.append("<details>")
+ lines.append("<summary>Environment flags (full)</summary>")
+ lines.append("")
+ lines.append("```yaml")
+ flags_yaml = yaml.safe_dump(full_flags, sort_keys=True, width=80).strip()
+ for line in flags_yaml.splitlines():
+ lines.append(line)
+ lines.append("```")
+ lines.append("")
+ lines.append("</details>")
  lines.append("")
 
  # Edit Configuration (removed duplicate Edit Information section)
 
  # Auto-tuning Configuration
- auto = certificate["auto"]
+ auto = evaluation_report["auto"]
  if auto["tier"] != "none":
  lines.append("## Auto-Tuning Configuration")
  lines.append("")
@@ -951,275 +1576,18 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
  pass
  lines.append("")
 
- resolved_policy = certificate.get("resolved_policy")
- if resolved_policy:
- lines.append("## Resolved Policy")
- lines.append("")
- lines.append("```yaml")
- resolved_yaml = yaml.safe_dump(
- resolved_policy, sort_keys=True, width=80, default_flow_style=False
- ).strip()
- for line in resolved_yaml.splitlines():
- lines.append(line)
- lines.append("```")
- lines.append("")
-
- policy_provenance = certificate.get("policy_provenance", {})
- if policy_provenance:
- lines.append("## Policy Provenance")
- lines.append("")
- lines.append(f"- **Tier:** {policy_provenance.get('tier')}")
- overrides_list = policy_provenance.get("overrides") or []
- if overrides_list:
- lines.append(f"- **Overrides:** {', '.join(overrides_list)}")
- else:
- lines.append("- **Overrides:** (none)")
- digest_value = policy_provenance.get("policy_digest")
- if digest_value:
- lines.append(f"- **Policy Digest:** `{digest_value}`")
- else:
- lines.append("- **Policy Digest:** (not recorded)")
- if policy_provenance.get("resolved_at"):
- lines.append(f"- **Resolved At:** {policy_provenance.get('resolved_at')}")
- lines.append("")
-
- # Dataset Information
- lines.append("## Dataset Configuration")
- lines.append("")
- dataset = certificate.get("dataset", {}) or {}
- prov = (
- (dataset.get("provider") or "unknown")
- if isinstance(dataset, dict)
- else "unknown"
- )
- lines.append(f"- **Provider:** {prov}")
- try:
- seq_len_val = (
- int(dataset.get("seq_len"))
- if isinstance(dataset.get("seq_len"), int | float)
- else dataset.get("seq_len")
- )
- except Exception: # pragma: no cover - defensive
- seq_len_val = dataset.get("seq_len")
- if seq_len_val is not None:
- lines.append(f"- **Sequence Length:** {seq_len_val}")
- windows_blk = (
- dataset.get("windows", {}) if isinstance(dataset.get("windows"), dict) else {}
- )
- win_prev = windows_blk.get("preview")
- win_final = windows_blk.get("final")
- if win_prev is not None and win_final is not None:
- lines.append(f"- **Windows:** {win_prev} preview + {win_final} final")
- if windows_blk.get("seed") is not None:
- lines.append(f"- **Seed:** {windows_blk.get('seed')}")
- hash_blk = dataset.get("hash", {}) if isinstance(dataset.get("hash"), dict) else {}
- if hash_blk.get("preview_tokens") is not None:
- lines.append(f"- **Preview Tokens:** {hash_blk.get('preview_tokens'):,}")
- if hash_blk.get("final_tokens") is not None:
- lines.append(f"- **Final Tokens:** {hash_blk.get('final_tokens'):,}")
- if hash_blk.get("total_tokens") is not None:
- lines.append(f"- **Total Tokens:** {hash_blk.get('total_tokens'):,}")
- if hash_blk.get("dataset"):
- lines.append(f"- **Dataset Hash:** {hash_blk.get('dataset')}")
- tokenizer = dataset.get("tokenizer", {})
- if tokenizer.get("name") or tokenizer.get("hash"):
- vocab_size = tokenizer.get("vocab_size")
- vocab_suffix = f" (vocab {vocab_size})" if isinstance(vocab_size, int) else ""
- lines.append(
- f"- **Tokenizer:** {tokenizer.get('name', 'unknown')}{vocab_suffix}"
- )
- if tokenizer.get("hash"):
- lines.append(f" - Hash: {tokenizer['hash']}")
- lines.append(
- f" - BOS/EOS: {tokenizer.get('bos_token')} / {tokenizer.get('eos_token')}"
- )
- if tokenizer.get("pad_token") is not None:
- lines.append(f" - PAD: {tokenizer.get('pad_token')}")
- if tokenizer.get("add_prefix_space") is not None:
- lines.append(f" - add_prefix_space: {tokenizer.get('add_prefix_space')}")
- lines.append("")
-
- provenance_info = certificate.get("provenance", {}) or {}
- if provenance_info:
- lines.append("## Run Provenance")
- lines.append("")
- baseline_info = provenance_info.get("baseline", {}) or {}
- if baseline_info:
- lines.append(f"- **Baseline Run ID:** {baseline_info.get('run_id')}")
- if baseline_info.get("report_hash"):
- lines.append(f" - Report Hash: `{baseline_info.get('report_hash')}`")
- if baseline_info.get("report_path"):
- lines.append(f" - Report Path: {baseline_info.get('report_path')}")
- edited_info = provenance_info.get("edited", {}) or {}
- if edited_info:
- lines.append(f"- **Edited Run ID:** {edited_info.get('run_id')}")
- if edited_info.get("report_hash"):
- lines.append(f" - Report Hash: `{edited_info.get('report_hash')}`")
- if edited_info.get("report_path"):
- lines.append(f" - Report Path: {edited_info.get('report_path')}")
- window_plan = provenance_info.get("window_plan")
- if isinstance(window_plan, dict) and window_plan:
- preview_val = window_plan.get(
- "preview_n", window_plan.get("actual_preview")
- )
- final_val = window_plan.get("final_n", window_plan.get("actual_final"))
- lines.append(
- f"- **Window Plan:** profile={window_plan.get('profile')}, preview={preview_val}, final={final_val}"
- )
- provider_digest = provenance_info.get("provider_digest")
- if isinstance(provider_digest, dict) and provider_digest:
- ids_d = provider_digest.get("ids_sha256")
- tok_d = provider_digest.get("tokenizer_sha256")
- mask_d = provider_digest.get("masking_sha256")
-
- lines.append("- **Provider Digest:**")
- if tok_d:
- lines.append(
- f" - tokenizer_sha256: `{_short_digest(tok_d)}` (full in JSON)"
- )
- if ids_d:
- lines.append(f" - ids_sha256: `{_short_digest(ids_d)}` (full in JSON)")
- if mask_d:
- lines.append(
- f" - masking_sha256: `{_short_digest(mask_d)}` (full in JSON)"
- )
- # Surface confidence label prominently
- try:
- conf = certificate.get("confidence", {}) or {}
- if isinstance(conf, dict) and conf.get("label"):
- lines.append(f"- **Confidence:** {conf.get('label')}")
- except Exception:
- pass
- # Surface policy version + thresholds hash (short)
- try:
- pd = certificate.get("policy_digest", {}) or {}
- if isinstance(pd, dict) and pd:
- pv = pd.get("policy_version")
- th = pd.get("thresholds_hash")
- if pv:
- lines.append(f"- **Policy Version:** {pv}")
- if isinstance(th, str) and th:
- short = th if len(th) <= 16 else (th[:8] + "…" + th[-8:])
- lines.append(f"- **Thresholds Digest:** `{short}` (full in JSON)")
- if pd.get("changed"):
- lines.append("- Note: policy changed")
- except Exception:
- pass
- lines.append("")
+ _append_dataset_and_provenance_section(lines, evaluation_report)
 
  # Structural Changes heading is printed with content later; avoid empty header here
 
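Worth noting from the deleted provenance block: long digests are abbreviated for Markdown display (via `_short_digest` for provider digests, and inline as `th[:8] + "…" + th[-8:]` for the thresholds hash), with the full value intentionally left only in the JSON report. A hypothetical equivalent of that truncation rule (`short_digest` here is an illustrative stand-in, not the package's `_short_digest`):

```python
def short_digest(digest: str, keep: int = 8) -> str:
    """Abbreviate a long hex digest as 'head…tail' for Markdown display."""
    if len(digest) <= 2 * keep:  # short enough; show it whole
        return digest
    return digest[:keep] + "…" + digest[-keep:]

sha = "9f2c4a1d" + "0" * 48 + "7b3e6d5c"  # 64-char example value
print(short_digest(sha))  # -> 9f2c4a1d…7b3e6d5c
```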
- # Primary Metric (metric-v1) snapshot, if present
- try:
- pm = certificate.get("primary_metric")
- if isinstance(pm, dict) and pm:
- kind = pm.get("kind", "unknown")
- lines.append(f"## Primary Metric ({kind})")
- lines.append("")
- unit = pm.get("unit", "-")
- paired = pm.get("paired", False)
- reps = None
- # Snapshot only; bootstrap reps live in ppl.stats.bootstrap for ppl metrics
- # Mark estimated metrics (e.g., pseudo accuracy counts) clearly
- estimated_flag = False
- try:
- if bool(pm.get("estimated")):
- estimated_flag = True
- elif str(pm.get("counts_source", "")).lower() == "pseudo_config":
- estimated_flag = True
- except Exception:
- estimated_flag = False
- est_suffix = " (estimated)" if estimated_flag else ""
- lines.append(f"- Kind: {kind} (unit: {unit}){est_suffix}")
- gating_basis = pm.get("gating_basis") or pm.get("basis")
- if gating_basis:
- lines.append(f"- Basis: {gating_basis}")
- if isinstance(paired, bool):
- lines.append(f"- Paired: {paired}")
- reps = pm.get("reps")
- if isinstance(reps, int | float):
- lines.append(f"- Bootstrap Reps: {int(reps)}")
- ci = pm.get("ci") or pm.get("display_ci")
- if (
- isinstance(ci, list | tuple)
- and len(ci) == 2
- and all(isinstance(x, int | float) for x in ci)
- ):
- lines.append(f"- CI: {ci[0]:.3f}–{ci[1]:.3f}")
- prev = pm.get("preview")
- fin = pm.get("final")
- ratio = pm.get("ratio_vs_baseline")
-
- lines.append("")
- if estimated_flag and str(kind).lower() in {"accuracy", "vqa_accuracy"}:
- lines.append(
- "- Note: Accuracy derived from pseudo counts (quick dev preset); use a labeled preset for measured accuracy."
- )
- lines.append("| Field | Value |")
- lines.append("|-------|-------|")
- lines.append(f"| Preview | {_fmt_by_kind(prev, str(kind))} |")
- lines.append(f"| Final | {_fmt_by_kind(fin, str(kind))} |")
- # For accuracy, ratio field is actually a delta (as per helper); clarify inline
- if kind in {"accuracy", "vqa_accuracy"}:
- lines.append(f"| Δ vs Baseline | {_fmt_by_kind(ratio, str(kind))} |")
- # When baseline accuracy is near-zero, clarify display rule
- try:
- base_pt = pm.get("baseline_point")
- if isinstance(base_pt, int | float) and base_pt < 0.05:
- lines.append(
- "- Note: baseline < 5%; ratio suppressed; showing Δpp"
- )
- except Exception:
- pass
- else:
- try:
- lines.append(f"| Ratio vs Baseline | {float(ratio):.3f} |")
- except Exception:
- lines.append("| Ratio vs Baseline | N/A |")
- lines.append("")
- # Secondary metrics (informational)
- try:
- secs = certificate.get("secondary_metrics")
- if isinstance(secs, list) and secs:
- lines.append("## Secondary Metrics (informational)")
- lines.append("")
- lines.append("| Kind | Preview | Final | vs Baseline | CI |")
- lines.append("|------|---------|-------|-------------|----|")
- for m in secs:
- if not isinstance(m, dict):
- continue
- k = m.get("kind", "?")
- pv = _fmt_by_kind(m.get("preview"), str(k))
- fv = _fmt_by_kind(m.get("final"), str(k))
- rb = m.get("ratio_vs_baseline")
- try:
- rb_str = (
- f"{float(rb):.3f}"
- if (str(k).startswith("ppl"))
- else _fmt_by_kind(rb, str(k))
- )
- except Exception:
- rb_str = "N/A"
- ci = m.get("display_ci") or m.get("ci")
- if isinstance(ci, tuple | list) and len(ci) == 2:
- ci_str = f"{float(ci[0]):.3f}-{float(ci[1]):.3f}"
- else:
- ci_str = "–"
- lines.append(f"| {k} | {pv} | {fv} | {rb_str} | {ci_str} |")
- lines.append("")
- except Exception:
- pass
- except Exception:
- pass
-
  # System Overhead section (latency/throughput)
- sys_over = certificate.get("system_overhead", {}) or {}
+ sys_over = evaluation_report.get("system_overhead", {}) or {}
  if isinstance(sys_over, dict) and sys_over:
  _append_system_overhead_section(lines, sys_over)
 
  # Accuracy Subgroups (informational)
  try:
- cls = certificate.get("classification", {})
+ cls = evaluation_report.get("classification", {})
  sub = cls.get("subgroups") if isinstance(cls, dict) else None
  if isinstance(sub, dict) and sub:
  _append_accuracy_subgroups(lines, sub)
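The removed primary-metric table encodes a display rule worth keeping in mind when reading reports: perplexity-like kinds show a ratio vs baseline, accuracy-like kinds show a delta, and the ratio is suppressed outright when baseline accuracy sits below 5%. A sketch of that rule (the helper below is hypothetical; the real code routes formatting through `_fmt_by_kind`):

```python
def baseline_comparison_cell(kind: str, ratio: float | None,
                             baseline_point: float | None) -> str:
    """Illustrative 'vs baseline' cell, mirroring the deleted table logic."""
    if kind in {"accuracy", "vqa_accuracy"}:
        # For accuracy kinds the 'ratio' field actually carries a delta.
        if isinstance(baseline_point, int | float) and baseline_point < 0.05:
            return "Δpp shown; ratio suppressed (baseline < 5%)"
        return f"{ratio:+.3f}" if isinstance(ratio, int | float) else "N/A"
    try:
        return f"{float(ratio):.3f}"
    except (TypeError, ValueError):
        return "N/A"

print(baseline_comparison_cell("ppl", 1.012, None))       # -> 1.012
print(baseline_comparison_cell("accuracy", 0.021, 0.03))  # ratio suppressed
```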
@@ -1227,7 +1595,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
  pass
  # Structural Changes
  try:
- structure = certificate.get("structure", {}) or {}
+ structure = evaluation_report.get("structure", {}) or {}
  params_changed = int(structure.get("params_changed", 0) or 0)
  layers_modified = int(structure.get("layers_modified", 0) or 0)
  bitwidth_changes = 0
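The `int(structure.get(..., 0) or 0)` pattern in the context lines above is deliberate: `.get()` with a default only covers a *missing* key, while reports may carry an explicit `null`. A quick illustration:

```python
structure = {"params_changed": None}  # explicit null, key present

# .get("params_changed", 0) returns None here (the default only applies to
# missing keys), so bare int(...) would raise TypeError; `or 0` normalizes
# both missing and null values to zero first.
params_changed = int(structure.get("params_changed", 0) or 0)
assert params_changed == 0
```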
@@ -1239,7 +1607,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
  has_changes = any(
  v > 0 for v in (params_changed, layers_modified, bitwidth_changes)
  )
- edit_name = str(certificate.get("edit_name", "unknown"))
+ edit_name = str(evaluation_report.get("edit_name", "unknown"))
  if has_changes:
  lines.append("## Structural Changes")
  lines.append("")
@@ -1369,47 +1737,48 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
  lines.append("")
 
  # Variance Guard (Spectral/RMT summaries are already provided above)
- variance = certificate["variance"]
- lines.append("## Variance Guard")
+ variance = evaluation_report["variance"]
+ appendix_lines.append("### Variance Guard")
+ appendix_lines.append("")
 
  # Display whether VE was enabled after A/B test
- lines.append(f"- **Enabled:** {'Yes' if variance['enabled'] else 'No'}")
+ appendix_lines.append(f"- **Enabled:** {'Yes' if variance['enabled'] else 'No'}")
 
  if variance["enabled"]:
  # VE was enabled - show the gain
  gain_value = variance.get("gain", "N/A")
  if isinstance(gain_value, int | float):
- lines.append(f"- **Gain:** {gain_value:.3f}")
+ appendix_lines.append(f"- **Gain:** {gain_value:.3f}")
  else:
- lines.append(f"- **Gain:** {gain_value}")
+ appendix_lines.append(f"- **Gain:** {gain_value}")
  else:
  # VE was not enabled - show succinct reason if available, else a clear disabled message
  ppl_no_ve = variance.get("ppl_no_ve")
  ppl_with_ve = variance.get("ppl_with_ve")
  ratio_ci = variance.get("ratio_ci")
  if ppl_no_ve is not None and ppl_with_ve is not None and ratio_ci:
- lines.append(f"- **Primary metric without VE:** {ppl_no_ve:.3f}")
- lines.append(f"- **Primary metric with VE:** {ppl_with_ve:.3f}")
+ appendix_lines.append(f"- **Primary metric without VE:** {ppl_no_ve:.3f}")
+ appendix_lines.append(f"- **Primary metric with VE:** {ppl_with_ve:.3f}")
  gain_value = variance.get("gain")
  if isinstance(gain_value, int | float):
- lines.append(f"- **Gain (insufficient):** {gain_value:.3f}")
+ appendix_lines.append(f"- **Gain (insufficient):** {gain_value:.3f}")
  else:
- lines.append(
+ appendix_lines.append(
  "- Variance Guard: Disabled (predictive gate not evaluated for this edit)."
  )
  # Add concise rationale aligned with Balanced predictive gate contract
  try:
- ve_policy = certificate.get("policies", {}).get("variance", {})
+ ve_policy = evaluation_report.get("policies", {}).get("variance", {})
  min_effect = ve_policy.get("min_effect_lognll")
  if isinstance(min_effect, int | float):
- lines.append(
+ appendix_lines.append(
  f"- Predictive gate (Balanced): one-sided; enables only if CI excludes 0 and |mean Δ| ≥ {float(min_effect):.4g}."
  )
  else:
- lines.append(
+ appendix_lines.append(
  "- Predictive gate (Balanced): one-sided; enables only if CI excludes 0 and |mean Δ| ≥ min_effect."
  )
- lines.append(
+ appendix_lines.append(
  "- Predictive Gate: evaluated=false (disabled under current policy/edit)."
  )
  except Exception:
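The rationale strings rerouted above state the Balanced predictive-gate contract: variance estimation turns on only when the CI on the mean Δ (log-NLL) excludes 0 and the absolute mean Δ clears `min_effect_lognll`. A self-contained sketch of that decision rule, as described by those strings (the function is illustrative, not the guard's actual implementation):

```python
def predictive_gate_enables(mean_delta: float,
                            ci: tuple[float, float],
                            min_effect: float) -> bool:
    """Illustrative Balanced-gate check: the CI must exclude 0 and the
    effect size must clear the min_effect_lognll threshold."""
    lo, hi = ci
    ci_excludes_zero = lo > 0.0 or hi < 0.0
    return ci_excludes_zero and abs(mean_delta) >= min_effect

# Enabled: CI entirely above zero and effect size large enough.
print(predictive_gate_enables(0.012, (0.004, 0.020), min_effect=0.005))   # True
# Not enabled: CI straddles zero, regardless of the point estimate.
print(predictive_gate_enables(0.012, (-0.001, 0.025), min_effect=0.005))  # False
```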
@@ -1417,19 +1786,26 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
 
  if variance.get("ratio_ci"):
  ratio_lo, ratio_hi = variance["ratio_ci"]
- lines.append(f"- **Ratio CI:** [{ratio_lo:.3f}, {ratio_hi:.3f}]")
+ appendix_lines.append(f"- **Ratio CI:** [{ratio_lo:.3f}, {ratio_hi:.3f}]")
 
  if variance.get("calibration") and variance.get("enabled"):
  calib = variance["calibration"]
  coverage = calib.get("coverage")
  requested = calib.get("requested")
  status = calib.get("status", "unknown")
- lines.append(f"- **Calibration:** {coverage}/{requested} windows ({status})")
+ appendix_lines.append(
+ f"- **Calibration:** {coverage}/{requested} windows ({status})"
+ )
+ appendix_lines.append("")
 
  lines.append("")
 
  # MoE Observability (non-gating)
- moe = certificate.get("moe", {}) if isinstance(certificate.get("moe"), dict) else {}
+ moe = (
+ evaluation_report.get("moe", {})
+ if isinstance(evaluation_report.get("moe"), dict)
+ else {}
+ )
  if moe:
  lines.append("## MoE Observability")
  lines.append("")
@@ -1458,46 +1834,36 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
  lines.append(f"- **{label}:** {float(moe[key]):+.4f}")
  lines.append("")
 
- # Policy Summary
- lines.append("## Applied Policies")
- lines.append("")
- policies = certificate["policies"]
- for guard_name, policy in policies.items():
- lines.append(f"### {guard_name.title()}")
- lines.append("")
- policy_yaml = (
- yaml.safe_dump(policy, sort_keys=True, width=80).strip().splitlines()
- )
- lines.append("```yaml")
- for line in policy_yaml:
- lines.append(line)
- lines.append("```")
- lines.append("")
+ _append_policy_configuration_section(lines, evaluation_report)
 
- # Artifacts
- lines.append("## Artifacts")
- lines.append("")
- artifacts = certificate["artifacts"]
+ appendix_lines.append("### Artifacts")
+ appendix_lines.append("")
+ artifacts = evaluation_report["artifacts"]
  if artifacts.get("events_path"):
- lines.append(f"- **Events Log:** `{artifacts['events_path']}`")
+ appendix_lines.append(f"- **Events Log:** `{artifacts['events_path']}`")
  if artifacts.get("report_path"):
- lines.append(f"- **Full Report:** `{artifacts['report_path']}`")
- lines.append(f"- **Certificate Generated:** {artifacts['generated_at']}")
- lines.append("")
+ appendix_lines.append(f"- **Full Report:** `{artifacts['report_path']}`")
+ appendix_lines.append(f"- **Report Generated:** {artifacts['generated_at']}")
+ appendix_lines.append("")
+
+ if appendix_lines:
+ lines.append("## Appendix")
+ lines.append("")
+ lines.extend(appendix_lines)
 
- # Certificate Hash for Integrity
- cert_hash = _compute_certificate_hash(certificate)
- lines.append("## Certificate Integrity")
+ # Report Hash for Integrity
+ cert_hash = _compute_report_hash(evaluation_report)
+ lines.append("## Evaluation Report Integrity")
  lines.append("")
- lines.append(f"**Certificate Hash:** `{cert_hash}`")
+ lines.append(f"**Report Hash:** `{cert_hash}`")
  lines.append("")
  lines.append("---")
  lines.append("")
  lines.append(
- "*This InvarLock safety certificate provides a comprehensive assessment of model compression safety.*"
+ "*This InvarLock Evaluation Report summarizes baseline‑paired evaluation results for a subject model relative to the provided baseline snapshot under the configured profile/preset.*"
  )
  lines.append(
- "*All metrics are compared against the uncompressed baseline model for safety validation.*"
+ "*It reports regression-risk indicators for the measured signals; it is not a broad AI safety, alignment, or content-safety guarantee.*"
  )
 
  return "\n".join(lines)