invarlock-0.3.6-py3-none-any.whl → invarlock-0.3.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +4 -4
- invarlock/adapters/__init__.py +10 -14
- invarlock/adapters/auto.py +37 -50
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_loading.py +7 -7
- invarlock/adapters/hf_mixin.py +53 -9
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/assurance/__init__.py +15 -23
- invarlock/cli/adapter_auto.py +32 -26
- invarlock/cli/app.py +128 -27
- invarlock/cli/commands/__init__.py +2 -2
- invarlock/cli/commands/calibrate.py +48 -4
- invarlock/cli/commands/doctor.py +8 -10
- invarlock/cli/commands/evaluate.py +986 -0
- invarlock/cli/commands/explain_gates.py +25 -17
- invarlock/cli/commands/export_html.py +11 -9
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +326 -92
- invarlock/cli/commands/run.py +1160 -228
- invarlock/cli/commands/verify.py +157 -97
- invarlock/cli/config.py +1 -1
- invarlock/cli/determinism.py +1 -1
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +4 -4
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/registry.py +9 -11
- invarlock/core/retry.py +14 -14
- invarlock/core/runner.py +112 -26
- invarlock/edits/noop.py +2 -2
- invarlock/edits/quant_rtn.py +67 -39
- invarlock/eval/__init__.py +1 -1
- invarlock/eval/bench.py +14 -10
- invarlock/eval/data.py +68 -23
- invarlock/eval/metrics.py +59 -1
- invarlock/eval/primary_metric.py +1 -1
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/rmt.py +2 -2
- invarlock/guards/spectral.py +1 -1
- invarlock/guards/variance.py +2 -2
- invarlock/model_profile.py +64 -62
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/plugins/hf_bnb_adapter.py +32 -21
- invarlock/reporting/__init__.py +18 -4
- invarlock/reporting/guards_analysis.py +154 -4
- invarlock/reporting/html.py +61 -11
- invarlock/reporting/normalizer.py +9 -2
- invarlock/reporting/policy_utils.py +1 -1
- invarlock/reporting/primary_metric_utils.py +11 -11
- invarlock/reporting/render.py +876 -510
- invarlock/reporting/report.py +72 -30
- invarlock/reporting/{certificate.py → report_builder.py} +252 -99
- invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
- invarlock/reporting/report_types.py +6 -1
- invarlock/reporting/telemetry.py +86 -0
- invarlock-0.3.8.dist-info/METADATA +283 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/RECORD +69 -64
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/WHEEL +1 -1
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- invarlock/cli/commands/certify.py +0 -422
- invarlock-0.3.6.dist-info/METADATA +0 -588
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/top_level.txt +0 -0
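The most visible change in this release is the rename of the "certificate" reporting path to an "evaluation report": `certificate.py` becomes `report_builder.py`, `certificate_schema.py` becomes `report_schema.py`, and the `certify` CLI command appears to be replaced by `evaluate`. A hedged migration sketch follows; the new names are taken from the added lines in the `render.py` diff below, while the 0.3.6 import path is an assumption inferred from the removed lines rather than verified against the old wheel.

```python
# Hedged migration sketch, based only on the renames listed above and the
# render.py diff below; the 0.3.6 import path is an inferred assumption.

# invarlock 0.3.6 (old names, per the removed "certificate" code paths)
# from invarlock.reporting.render import render_certificate_markdown
# report_md = render_certificate_markdown(certificate)

# invarlock 0.3.8 (new names, per the added lines in render.py)
from invarlock.reporting.render import render_report_markdown
from invarlock.reporting.report_schema import validate_report


def render_if_valid(evaluation_report: dict) -> str | None:
    """Render the Markdown report only when the report passes schema validation."""
    if not validate_report(evaluation_report):
        return None
    return render_report_markdown(evaluation_report)
```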
invarlock/reporting/render.py
CHANGED
@@ -9,8 +9,7 @@ from typing import Any
 
 import yaml
 
-
-from . import certificate as C
+from .report_schema import validate_report
 
 # Console Validation Block helpers (allow-list driven)
 _CONSOLE_LABELS_DEFAULT = [
@@ -37,8 +36,10 @@ def _load_console_labels() -> list[str]:
     return list(_CONSOLE_LABELS_DEFAULT)
 
 
-def compute_console_validation_block(
-
+def compute_console_validation_block(
+    evaluation_report: dict[str, Any],
+) -> dict[str, Any]:
+    """Produce a normalized console validation block from an evaluation report.
 
     Returns a dict with keys:
     - labels: the canonical label list
@@ -47,8 +48,8 @@ def compute_console_validation_block(certificate: dict[str, Any]) -> dict[str, A
       counted only when evaluated.
     """
     labels = _load_console_labels()
-    validation =
-    guard_ctx =
+    validation = evaluation_report.get("validation", {}) or {}
+    guard_ctx = evaluation_report.get("guard_overhead", {}) or {}
     guard_evaluated = (
         bool(guard_ctx.get("evaluated")) if isinstance(guard_ctx, dict) else False
     )
@@ -113,6 +114,462 @@ def _short_digest(v: str) -> str:
     return v if len(v) <= 16 else (v[:8] + "…" + v[-8:])
 
 
+def _render_executive_dashboard(cert: dict[str, Any]) -> str:
+    """Render executive summary dashboard table."""
+    lines: list[str] = []
+    _append_safety_dashboard_section(lines, cert)
+    return "\n".join(lines).rstrip()
+
+
+def _append_safety_dashboard_section(
+    lines: list[str], evaluation_report: dict[str, Any]
+) -> None:
+    """Append a concise, first-screen dashboard for the evaluation report."""
+    block = compute_console_validation_block(evaluation_report)
+    overall_pass = bool(block.get("overall_pass"))
+    overall_status = (
+        f"{'✅' if overall_pass else '❌'} {'PASS' if overall_pass else 'FAIL'}"
+    )
+
+    validation = evaluation_report.get("validation", {}) or {}
+    pm = evaluation_report.get("primary_metric", {}) or {}
+    auto = evaluation_report.get("auto", {}) or {}
+    tier = str(auto.get("tier") or "balanced").lower()
+
+    # Primary metric summary
+    pm_kind = str(pm.get("kind", "")).lower()
+    pm_basis = pm.get("gating_basis") or pm.get("basis") or "point"
+    pm_ok: bool | None
+    if isinstance(validation, dict) and "primary_metric_acceptable" in validation:
+        pm_ok = bool(validation.get("primary_metric_acceptable"))
+    else:
+        pm_ok = None
+    pm_value = pm.get("ratio_vs_baseline")
+
+    if pm_kind in {"accuracy", "vqa_accuracy"}:
+        measured = f"{pm_value:+.2f} pp" if isinstance(pm_value, int | float) else "N/A"
+        th_map = {
+            "conservative": -0.5,
+            "balanced": -1.0,
+            "aggressive": -2.0,
+            "none": -1.0,
+        }
+        th = th_map.get(tier, -1.0)
+        threshold = f"≥ {th:+.2f} pp ({pm_basis})"
+    else:
+        measured = f"{pm_value:.3f}×" if isinstance(pm_value, int | float) else "N/A"
+        tier_thresholds = {
+            "conservative": 1.05,
+            "balanced": 1.10,
+            "aggressive": 1.20,
+            "none": 1.10,
+        }
+        ratio_limit = tier_thresholds.get(tier, 1.10)
+        target_ratio = auto.get("target_pm_ratio")
+        if isinstance(target_ratio, int | float) and target_ratio > 0:
+            ratio_limit = min(ratio_limit, float(target_ratio))
+        threshold = f"≤ {ratio_limit:.2f}× ({pm_basis})"
+
+    pm_status = (
+        f"{'✅' if pm_ok else '❌'} {measured}"
+        if isinstance(pm_ok, bool)
+        else f"ℹ️ {measured}"
+    )
+
+    # Drift summary (final/preview ratio) when preview/final are numeric
+    drift_ok: bool | None
+    if isinstance(validation, dict) and "preview_final_drift_acceptable" in validation:
+        drift_ok = bool(validation.get("preview_final_drift_acceptable"))
+    else:
+        drift_ok = None
+    drift_val = "N/A"
+    try:
+        pv = (
+            float(pm.get("preview"))
+            if isinstance(pm.get("preview"), int | float)
+            else float("nan")
+        )
+        fv = (
+            float(pm.get("final"))
+            if isinstance(pm.get("final"), int | float)
+            else float("nan")
+        )
+        drift = (
+            fv / pv
+            if (math.isfinite(pv) and pv > 0 and math.isfinite(fv))
+            else float("nan")
+        )
+        if math.isfinite(drift):
+            drift_val = f"{drift:.3f}×"
+    except Exception:
+        drift_val = "N/A"
+    drift_status = (
+        f"{'✅' if drift_ok else '❌'} {drift_val}"
+        if isinstance(drift_ok, bool)
+        else f"ℹ️ {drift_val}"
+    )
+
+    def _gate_cell(key: str, ok_default: bool | None = None) -> str:
+        ok: bool | None
+        if not isinstance(validation, dict):
+            ok = ok_default
+        elif key not in validation:
+            ok = ok_default
+        else:
+            ok = bool(validation.get(key))
+        if ok is None:
+            return "ℹ️ N/A"
+        return "✅ PASS" if ok else "❌ FAIL"
+
+    overhead_ctx = evaluation_report.get("guard_overhead", {}) or {}
+    overhead_evaluated = (
+        bool(overhead_ctx.get("evaluated")) if isinstance(overhead_ctx, dict) else False
+    )
+    overhead_row: tuple[str, str, str] | None = None
+    if overhead_evaluated:
+        overhead_pct = overhead_ctx.get("overhead_percent")
+        overhead_ratio = overhead_ctx.get("overhead_ratio")
+        if isinstance(overhead_pct, int | float) and math.isfinite(float(overhead_pct)):
+            overhead_measured = f"{float(overhead_pct):+.2f}%"
+        elif isinstance(overhead_ratio, int | float) and math.isfinite(
+            float(overhead_ratio)
+        ):
+            overhead_measured = f"{float(overhead_ratio):.3f}×"
+        else:
+            overhead_measured = "N/A"
+        threshold_pct = overhead_ctx.get("threshold_percent")
+        if isinstance(threshold_pct, int | float) and math.isfinite(
+            float(threshold_pct)
+        ):
+            threshold_str = f"≤ +{float(threshold_pct):.1f}%"
+        else:
+            threshold_str = "≤ +1.0%"
+        overhead_row = (
+            "Overhead",
+            f"{'✅' if bool(validation.get('guard_overhead_acceptable', True)) else '❌'} {overhead_measured}"
+            if isinstance(validation, dict)
+            else f"ℹ️ {overhead_measured}",
+            threshold_str,
+        )
+
+    lines.append("## Evaluation Dashboard")
+    lines.append("")
+    lines.append("| Check | Status | Quick Summary |")
+    lines.append("|-------|--------|---------------|")
+    lines.append(f"| Overall | {overall_status} | Canonical gate outcomes |")
+    lines.append(f"| Primary Metric | {pm_status} | {threshold} |")
+    lines.append(f"| Drift | {drift_status} | 0.95–1.05× band |")
+    lines.append(
+        f"| Invariants | {_gate_cell('invariants_pass')} | Model integrity checks |"
+    )
+    lines.append(
+        f"| Spectral | {_gate_cell('spectral_stable')} | Weight matrix spectral norms |"
+    )
+    lines.append(f"| RMT | {_gate_cell('rmt_stable')} | Random Matrix Theory guard |")
+    if overhead_row:
+        lines.append(f"| {overhead_row[0]} | {overhead_row[1]} | {overhead_row[2]} |")
+    lines.append("")
+
+
+def _append_primary_metric_section(
+    lines: list[str], evaluation_report: dict[str, Any]
+) -> None:
+    """Append the Primary Metric section early for quick triage."""
+    pm = evaluation_report.get("primary_metric")
+    if not isinstance(pm, dict) or not pm:
+        return
+
+    kind = pm.get("kind", "unknown")
+    lines.append("## Primary Metric")
+    lines.append("")
+    unit = pm.get("unit", "-")
+    paired = pm.get("paired", False)
+
+    estimated_flag = False
+    try:
+        if bool(pm.get("estimated")):
+            estimated_flag = True
+        elif str(pm.get("counts_source", "")).lower() == "pseudo_config":
+            estimated_flag = True
+    except Exception:
+        estimated_flag = False
+    est_suffix = " (estimated)" if estimated_flag else ""
+
+    lines.append(f"- Kind: {kind} (unit: {unit}){est_suffix}")
+    gating_basis = pm.get("gating_basis") or pm.get("basis")
+    if gating_basis:
+        lines.append(f"- Basis: {gating_basis}")
+    if isinstance(paired, bool):
+        lines.append(f"- Paired: {paired}")
+    reps = pm.get("reps")
+    if isinstance(reps, int | float):
+        lines.append(f"- Bootstrap Reps: {int(reps)}")
+    ci = pm.get("ci") or pm.get("display_ci")
+    if (
+        isinstance(ci, list | tuple)
+        and len(ci) == 2
+        and all(isinstance(x, int | float) for x in ci)
+    ):
+        lines.append(f"- CI: {ci[0]:.3f}–{ci[1]:.3f}")
+
+    prev = pm.get("preview")
+    fin = pm.get("final")
+    ratio = pm.get("ratio_vs_baseline")
+
+    lines.append("")
+    if estimated_flag and str(kind).lower() in {"accuracy", "vqa_accuracy"}:
+        lines.append(
+            "- Note: Accuracy derived from pseudo counts (quick dev preset); use a labeled preset for measured accuracy."
+        )
+    lines.append("| Field | Value |")
+    lines.append("|-------|-------|")
+    lines.append(f"| Preview | {_fmt_by_kind(prev, str(kind))} |")
+    lines.append(f"| Final | {_fmt_by_kind(fin, str(kind))} |")
+
+    if kind in {"accuracy", "vqa_accuracy"}:
+        lines.append(f"| Δ vs Baseline | {_fmt_by_kind(ratio, str(kind))} |")
+        try:
+            base_pt = pm.get("baseline_point")
+            if isinstance(base_pt, int | float) and base_pt < 0.05:
+                lines.append("- Note: baseline < 5%; ratio suppressed; showing Δpp")
+        except Exception:
+            pass
+    else:
+        try:
+            lines.append(f"| Ratio vs Baseline | {float(ratio):.3f} |")
+        except Exception:
+            lines.append("| Ratio vs Baseline | N/A |")
+    lines.append("")
+
+    # Secondary metrics (informational)
+    try:
+        secs = evaluation_report.get("secondary_metrics")
+        if isinstance(secs, list) and secs:
+            lines.append("## Secondary Metrics (informational)")
+            lines.append("")
+            lines.append("| Kind | Preview | Final | vs Baseline | CI |")
+            lines.append("|------|---------|-------|-------------|----|")
+            for m in secs:
+                if not isinstance(m, dict):
+                    continue
+                k = m.get("kind", "?")
+                pv = _fmt_by_kind(m.get("preview"), str(k))
+                fv = _fmt_by_kind(m.get("final"), str(k))
+                rb = m.get("ratio_vs_baseline")
+                try:
+                    rb_str = (
+                        f"{float(rb):.3f}"
+                        if (str(k).startswith("ppl"))
+                        else _fmt_by_kind(rb, str(k))
+                    )
+                except Exception:
+                    rb_str = "N/A"
+                ci = m.get("display_ci") or m.get("ci")
+                if isinstance(ci, tuple | list) and len(ci) == 2:
+                    ci_str = f"{float(ci[0]):.3f}-{float(ci[1]):.3f}"
+                else:
+                    ci_str = "–"
+                lines.append(f"| {k} | {pv} | {fv} | {rb_str} | {ci_str} |")
+            lines.append("")
+    except Exception:
+        pass
+
+
+def _append_policy_configuration_section(
+    lines: list[str], evaluation_report: dict[str, Any]
+) -> None:
+    resolved_policy = evaluation_report.get("resolved_policy")
+    policy_provenance = evaluation_report.get("policy_provenance", {}) or {}
+    has_prov = isinstance(policy_provenance, dict) and bool(policy_provenance)
+    has_resolved = isinstance(resolved_policy, dict) and bool(resolved_policy)
+    if not (has_prov or has_resolved):
+        return
+
+    lines.append("## Policy Configuration")
+    lines.append("")
+
+    tier = None
+    if has_prov:
+        tier = policy_provenance.get("tier")
+    if not tier:
+        tier = (evaluation_report.get("auto", {}) or {}).get("tier")
+    digest_value = None
+    if has_prov:
+        digest_value = policy_provenance.get("policy_digest")
+    if not digest_value:
+        digest_value = (evaluation_report.get("policy_digest", {}) or {}).get(
+            "thresholds_hash"
+        )
+
+    summary_parts: list[str] = []
+    if tier:
+        summary_parts.append(f"**Tier:** {tier}")
+    if digest_value:
+        summary_parts.append(f"**Digest:** `{_short_digest(str(digest_value))}`")
+    if summary_parts:
+        lines.append(" | ".join(summary_parts))
+
+    if has_prov:
+        overrides_list = policy_provenance.get("overrides") or []
+        if overrides_list:
+            lines.append(f"- **Overrides:** {', '.join(overrides_list)}")
+        else:
+            lines.append("- **Overrides:** (none)")
+        if policy_provenance.get("resolved_at"):
+            lines.append(f"- **Resolved At:** {policy_provenance.get('resolved_at')}")
+
+    if has_resolved:
+        lines.append("")
+        lines.append("<details>")
+        lines.append("<summary>Resolved Policy YAML</summary>")
+        lines.append("")
+        lines.append("```yaml")
+        resolved_yaml = yaml.safe_dump(
+            resolved_policy, sort_keys=True, width=80, default_flow_style=False
+        ).strip()
+        for line in resolved_yaml.splitlines():
+            lines.append(line)
+        lines.append("```")
+        lines.append("")
+        lines.append("</details>")
+
+    lines.append("")
+
+
+def _append_dataset_and_provenance_section(
+    lines: list[str], evaluation_report: dict[str, Any]
+) -> None:
+    dataset = evaluation_report.get("dataset", {}) or {}
+    provenance_info = evaluation_report.get("provenance", {}) or {}
+
+    has_dataset = isinstance(dataset, dict) and bool(dataset)
+    has_provenance = isinstance(provenance_info, dict) and bool(provenance_info)
+    if not (has_dataset or has_provenance):
+        return
+
+    lines.append("## Dataset and Provenance")
+    lines.append("")
+
+    if has_dataset:
+        prov = dataset.get("provider") or "unknown"
+        lines.append(f"- **Provider:** {prov}")
+        try:
+            seq_len_val = (
+                int(dataset.get("seq_len"))
+                if isinstance(dataset.get("seq_len"), int | float)
+                else dataset.get("seq_len")
+            )
+        except Exception:  # pragma: no cover - defensive
+            seq_len_val = dataset.get("seq_len")
+        if seq_len_val is not None:
+            lines.append(f"- **Sequence Length:** {seq_len_val}")
+        windows_blk = (
+            dataset.get("windows", {})
+            if isinstance(dataset.get("windows"), dict)
+            else {}
+        )
+        win_prev = windows_blk.get("preview")
+        win_final = windows_blk.get("final")
+        if win_prev is not None and win_final is not None:
+            lines.append(f"- **Windows:** {win_prev} preview + {win_final} final")
+        if windows_blk.get("seed") is not None:
+            lines.append(f"- **Seed:** {windows_blk.get('seed')}")
+        hash_blk = (
+            dataset.get("hash", {}) if isinstance(dataset.get("hash"), dict) else {}
+        )
+        if hash_blk.get("preview_tokens") is not None:
+            lines.append(f"- **Preview Tokens:** {hash_blk.get('preview_tokens'):,}")
+        if hash_blk.get("final_tokens") is not None:
+            lines.append(f"- **Final Tokens:** {hash_blk.get('final_tokens'):,}")
+        if hash_blk.get("total_tokens") is not None:
+            lines.append(f"- **Total Tokens:** {hash_blk.get('total_tokens'):,}")
+        if hash_blk.get("dataset"):
+            lines.append(f"- **Dataset Hash:** {hash_blk.get('dataset')}")
+        tokenizer = dataset.get("tokenizer", {})
+        if isinstance(tokenizer, dict) and (
+            tokenizer.get("name") or tokenizer.get("hash")
+        ):
+            vocab_size = tokenizer.get("vocab_size")
+            vocab_suffix = (
+                f" (vocab {vocab_size})" if isinstance(vocab_size, int) else ""
+            )
+            lines.append(
+                f"- **Tokenizer:** {tokenizer.get('name', 'unknown')}{vocab_suffix}"
+            )
+            if tokenizer.get("hash"):
+                lines.append(f" - Hash: {tokenizer['hash']}")
+            lines.append(
+                f" - BOS/EOS: {tokenizer.get('bos_token')} / {tokenizer.get('eos_token')}"
+            )
+            if tokenizer.get("pad_token") is not None:
+                lines.append(f" - PAD: {tokenizer.get('pad_token')}")
+            if tokenizer.get("add_prefix_space") is not None:
+                lines.append(
+                    f" - add_prefix_space: {tokenizer.get('add_prefix_space')}"
+                )
+
+    if has_provenance:
+        baseline_info = provenance_info.get("baseline", {}) or {}
+        edited_info = provenance_info.get("edited", {}) or {}
+
+        if baseline_info or edited_info:
+            lines.append("")
+            if baseline_info:
+                lines.append(f"- **Baseline Run ID:** {baseline_info.get('run_id')}")
+                if baseline_info.get("report_hash"):
+                    lines.append(f" - Report Hash: `{baseline_info.get('report_hash')}`")
+                if baseline_info.get("report_path"):
+                    lines.append(f" - Report Path: {baseline_info.get('report_path')}")
+            if edited_info:
+                lines.append(f"- **Edited Run ID:** {edited_info.get('run_id')}")
+                if edited_info.get("report_hash"):
+                    lines.append(f" - Report Hash: `{edited_info.get('report_hash')}`")
+                if edited_info.get("report_path"):
+                    lines.append(f" - Report Path: {edited_info.get('report_path')}")
+
+        provider_digest = provenance_info.get("provider_digest")
+        if isinstance(provider_digest, dict) and provider_digest:
+            ids_d = provider_digest.get("ids_sha256")
+            tok_d = provider_digest.get("tokenizer_sha256")
+            mask_d = provider_digest.get("masking_sha256")
+
+            lines.append("- **Provider Digest:**")
+            if tok_d:
+                lines.append(
+                    f" - tokenizer_sha256: `{_short_digest(tok_d)}` (full in JSON)"
+                )
+            if ids_d:
+                lines.append(f" - ids_sha256: `{_short_digest(ids_d)}` (full in JSON)")
+            if mask_d:
+                lines.append(
+                    f" - masking_sha256: `{_short_digest(mask_d)}` (full in JSON)"
+                )
+
+    try:
+        conf = evaluation_report.get("confidence", {}) or {}
+        if isinstance(conf, dict) and conf.get("label"):
+            lines.append(f"- **Confidence:** {conf.get('label')}")
+    except Exception:
+        pass
+
+    try:
+        pd = evaluation_report.get("policy_digest", {}) or {}
+        if isinstance(pd, dict) and pd:
+            pv = pd.get("policy_version")
+            th = pd.get("thresholds_hash")
+            if pv:
+                lines.append(f"- **Policy Version:** {pv}")
+            if isinstance(th, str) and th:
+                short = th if len(th) <= 16 else (th[:8] + "…" + th[-8:])
+                lines.append(f"- **Thresholds Digest:** `{short}` (full in JSON)")
+            if pd.get("changed"):
+                lines.append("- Note: policy changed")
+    except Exception:
+        pass
+
+    lines.append("")
+
+
 def _fmt_by_kind(x: Any, k: str) -> str:
     try:
         xv = float(x)
@@ -215,13 +672,13 @@ def _append_accuracy_subgroups(lines: list[str], subgroups: dict[str, Any]) -> N
     lines.append("")
 
 
-def
-"""Compute integrity hash for the
+def _compute_report_hash(evaluation_report: dict[str, Any]) -> str:
+    """Compute integrity hash for the evaluation_report.
 
     Hash ignores the `artifacts` section for stability across saves.
     """
     # Create a copy without the artifacts section for stable hashing
-    cert_copy = dict(
+    cert_copy = dict(evaluation_report or {})
     cert_copy.pop("artifacts", None)
 
     # Sort keys for deterministic hashing
@@ -231,8 +688,8 @@ def _compute_certificate_hash(certificate: dict[str, Any]) -> str:
     return _hash.sha256(cert_str.encode()).hexdigest()[:16]
 
 
-def build_console_summary_pack(
-    """Build a small, reusable console summary pack from a
+def build_console_summary_pack(evaluation_report: dict[str, Any]) -> dict[str, Any]:
+    """Build a small, reusable console summary pack from a evaluation_report.
 
     Returns a dict with:
     - overall_pass: bool
@@ -240,7 +697,7 @@ def build_console_summary_pack(certificate: dict[str, Any]) -> dict[str, Any]:
     - gate_lines: list of "<Label>: <Status>" strings for each evaluated gate
     - labels: the canonical label list used
     """
-    block = compute_console_validation_block(
+    block = compute_console_validation_block(evaluation_report)
     overall_pass = bool(block.get("overall_pass"))
     emoji = "✅" if overall_pass else "❌"
     overall_line = f"Overall Status: {emoji} {'PASS' if overall_pass else 'FAIL'}"
@@ -261,38 +718,38 @@ def build_console_summary_pack(certificate: dict[str, Any]) -> dict[str, Any]:
     }
 
 
-def
+def render_report_markdown(evaluation_report: dict[str, Any]) -> str:
     """
-    Render
+    Render an evaluation report as a formatted Markdown report with pretty tables.
 
-    This implementation is moved from
-    To avoid circular import issues, we alias helpers from the certificate
-    module inside the function body.
+    This implementation is moved from report_builder.py to keep that module lean.
     """
-
-
-
-    if not validate_certificate(certificate):
-        raise ValueError("Invalid certificate structure")
+    if not validate_report(evaluation_report):
+        raise ValueError("Invalid evaluation report structure")
 
-    lines = []
-
+    lines: list[str] = []
+    appendix_lines: list[str] = []
+    edit_name = str(evaluation_report.get("edit_name") or "").lower()
 
     # Header
-    lines.append("# InvarLock
+    lines.append("# InvarLock Evaluation Report")
     lines.append("")
     lines.append(
         "> *Basis: “point” gates check the point estimate; “upper” gates check the CI "
         "upper bound; “point & upper” requires both to pass.*"
     )
     lines.append("")
-    lines.append(f"**Schema Version:** {
-    lines.append(f"**Run ID:** `{
-    lines.append(f"**Generated:** {
-    lines.append(f"**Edit Type:** {
+    lines.append(f"**Schema Version:** {evaluation_report['schema_version']}")
+    lines.append(f"**Run ID:** `{evaluation_report['run_id']}`")
+    lines.append(f"**Generated:** {evaluation_report['artifacts']['generated_at']}")
+    lines.append(f"**Edit Type:** {evaluation_report.get('edit_name', 'Unknown')}")
+    lines.append("")
+    lines.append(
+        "> Full evidence: see [`evaluation.report.json`](evaluation.report.json) for complete provenance, digests, and raw measurements."
+    )
     lines.append("")
 
-    plugins =
+    plugins = evaluation_report.get("plugins", {})
     if isinstance(plugins, dict) and plugins:
         lines.append("## Plugin Provenance")
         lines.append("")
@@ -314,12 +771,12 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         ]
         if guard_entries:
             lines.append("- Guards:\n - " + "\n - ".join(guard_entries))
-
+        lines.append("")
 
     # Executive Summary with validation status (canonical, from console block)
     lines.append("## Executive Summary")
     lines.append("")
-    _block = compute_console_validation_block(
+    _block = compute_console_validation_block(evaluation_report)
     overall_pass = bool(_block.get("overall_pass"))
     status_emoji = "✅" if overall_pass else "❌"
     lines.append(
@@ -328,13 +785,13 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     # Window Plan one-liner for quick audit
     try:
         plan_ctx = (
-
-            or
-            or
+            evaluation_report.get("window_plan")
+            or evaluation_report.get("dataset", {}).get("windows", {})
+            or evaluation_report.get("ppl", {}).get("window_plan")
         )
-        seq_len =
-            "
-        ).get("sequence_length")
+        seq_len = evaluation_report.get("dataset", {}).get(
+            "seq_len"
+        ) or evaluation_report.get("dataset", {}).get("sequence_length")
         if isinstance(plan_ctx, dict):
             profile = plan_ctx.get("profile")
             preview_n = (
@@ -354,15 +811,34 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         pass
     lines.append("")
 
+    dashboard = _render_executive_dashboard(evaluation_report)
+    if dashboard:
+        lines.extend(dashboard.splitlines())
+        lines.append("")
+
+    lines.append("## Contents")
+    lines.append("")
+    lines.append("- [Evaluation Dashboard](#evaluation-dashboard)")
+    lines.append("- [Quality Gates](#quality-gates)")
+    lines.append("- [Guard Check Details](#guard-check-details)")
+    lines.append("- [Primary Metric](#primary-metric)")
+    lines.append("- [Guard Observability](#guard-observability)")
+    lines.append("- [Model Information](#model-information)")
+    lines.append("- [Dataset and Provenance](#dataset-and-provenance)")
+    lines.append("- [Policy Configuration](#policy-configuration)")
+    lines.append("- [Appendix](#appendix)")
+    lines.append("- [Evaluation Report Integrity](#evaluation-report-integrity)")
+    lines.append("")
+
     # Validation table with canonical gates (mirrors console allow-list)
     lines.append("## Quality Gates")
     lines.append("")
     lines.append("| Gate | Status | Measured | Threshold | Basis | Description |")
     lines.append("|------|--------|----------|-----------|-------|-------------|")
 
-    pm_block =
+    pm_block = evaluation_report.get("primary_metric", {}) or {}
     has_pm = isinstance(pm_block, dict) and bool(pm_block)
-    auto_info =
+    auto_info = evaluation_report.get("auto", {})
     tier = (auto_info.get("tier") or "balanced").lower()
 
     # Helper to emit Primary Metric Acceptable row
@@ -371,7 +847,9 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         value = pm_block.get("ratio_vs_baseline")
         gating_basis = pm_block.get("gating_basis") or "point"
         ok = bool(
-
+            evaluation_report.get("validation", {}).get(
+                "primary_metric_acceptable", True
+            )
         )
         status = "✅ PASS" if ok else "❌ FAIL"
         if pm_kind in {"accuracy", "vqa_accuracy"}:
@@ -405,11 +883,36 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     # Helper to emit Preview Final Drift Acceptable row
     def _emit_drift_gate_row() -> None:
         ok = bool(
-
+            evaluation_report.get("validation", {}).get(
                 "preview_final_drift_acceptable", True
             )
         )
         status = "✅ PASS" if ok else "❌ FAIL"
+        drift_min = 0.95
+        drift_max = 1.05
+        try:
+            drift_band = (
+                pm_block.get("drift_band") if isinstance(pm_block, dict) else None
+            )
+            if isinstance(drift_band, dict):
+                lo = drift_band.get("min")
+                hi = drift_band.get("max")
+                if isinstance(lo, int | float) and isinstance(hi, int | float):
+                    lo_f = float(lo)
+                    hi_f = float(hi)
+                    if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
+                        drift_min = lo_f
+                        drift_max = hi_f
+            elif isinstance(drift_band, list | tuple) and len(drift_band) == 2:
+                lo_raw, hi_raw = drift_band[0], drift_band[1]
+                if isinstance(lo_raw, int | float) and isinstance(hi_raw, int | float):
+                    lo_f = float(lo_raw)
+                    hi_f = float(hi_raw)
+                    if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
+                        drift_min = lo_f
+                        drift_max = hi_f
+        except Exception:
+            pass
         # Compute drift from PM preview/final when available
         try:
             pv = (
@@ -430,18 +933,21 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         except Exception:
             drift = float("nan")
         measured = f"{drift:.3f}x" if math.isfinite(drift) else "N/A"
+        band_label = f"{drift_min:.2f}–{drift_max:.2f}x"
         lines.append(
-            f"| Preview Final Drift Acceptable | {status} | {measured} |
+            f"| Preview Final Drift Acceptable | {status} | {measured} | {band_label} | point | Final/Preview ratio stability |"
         )
 
     # Helper to emit Guard Overhead Acceptable row (only when evaluated)
     def _emit_overhead_gate_row() -> None:
-        guard_overhead =
+        guard_overhead = evaluation_report.get("guard_overhead", {}) or {}
         evaluated = bool(guard_overhead.get("evaluated"))
         if not evaluated:
             return
         ok = bool(
-
+            evaluation_report.get("validation", {}).get(
+                "guard_overhead_acceptable", True
+            )
         )
         status = "✅ PASS" if ok else "❌ FAIL"
         overhead_pct = guard_overhead.get("overhead_percent")
@@ -469,7 +975,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         )
 
     def _emit_pm_tail_gate_row() -> None:
-        pm_tail =
+        pm_tail = evaluation_report.get("primary_metric_tail", {}) or {}
         if not isinstance(pm_tail, dict) or not pm_tail:
             return
 
@@ -479,7 +985,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         warned = bool(pm_tail.get("warned", False))
 
         if not evaluated:
-            status = "
+            status = "ℹ️ INFO"
         elif passed:
             status = "✅ PASS"
         elif mode == "fail":
@@ -536,17 +1042,17 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     _emit_overhead_gate_row()
 
     # Annotate hysteresis usage if applied
-    if
+    if evaluation_report.get("validation", {}).get("hysteresis_applied"):
         lines.append("- Note: hysteresis applied to gate boundary")
 
     lines.append("")
-    lines.append("##
+    lines.append("## Guard Check Details")
     lines.append("")
-    lines.append("|
+    lines.append("| Guard Check | Status | Measured | Threshold | Description |")
     lines.append("|--------------|--------|----------|-----------|-------------|")
 
-    inv_summary =
-    validation =
+    inv_summary = evaluation_report["invariants"]
+    validation = evaluation_report.get("validation", {})
     inv_status = "✅ PASS" if validation.get("invariants_pass", False) else "❌ FAIL"
     inv_counts = inv_summary.get("summary", {}) or {}
     inv_measure = inv_summary.get("status", "pass").upper()
@@ -578,23 +1084,23 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         lines.append(f"- Non-fatal: {non_fatal_message}")
 
     spec_status = "✅ PASS" if validation.get("spectral_stable", False) else "❌ FAIL"
-    caps_applied =
+    caps_applied = evaluation_report["spectral"]["caps_applied"]
     lines.append(
         f"| Spectral Stability | {spec_status} | {caps_applied} violations | < 5 | Weight matrix spectral norms |"
     )
 
     # Catastrophic spike safety stop row is now driven by primary metric flags
-    if isinstance(
+    if isinstance(evaluation_report.get("primary_metric"), dict):
         pm_ok = bool(validation.get("primary_metric_acceptable", True))
-        pm_ratio =
+        pm_ratio = evaluation_report.get("primary_metric", {}).get("ratio_vs_baseline")
         if isinstance(pm_ratio, int | float):
             lines.append(
-                f"| Catastrophic Spike Gate (
+                f"| Catastrophic Spike Gate (hard stop) | {'✅ PASS' if pm_ok else '❌ FAIL'} | {pm_ratio:.3f}x | ≤ 2.0x | Hard stop @ 2.0× |"
             )
 
     # Include RMT Health row for compatibility and clarity
     rmt_status = "✅ PASS" if validation.get("rmt_stable", False) else "❌ FAIL"
-    rmt_state =
+    rmt_state = evaluation_report.get("rmt", {}).get("status", "unknown").title()
     lines.append(
         f"| RMT Health | {rmt_status} | {rmt_state} | ε-rule | Random Matrix Theory guard status |"
     )
@@ -602,8 +1108,8 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     # Pairing + Bootstrap snapshot (quick audit surface)
     try:
         stats = (
-
-            or
+            evaluation_report.get("dataset", {}).get("windows", {}).get("stats", {})
+            or evaluation_report.get("ppl", {}).get("stats", {})
             or {}
         )
         paired_windows = stats.get("paired_windows")
@@ -616,24 +1122,51 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
             or overlap_frac is not None
         ):
             lines.append("")
-
-
-
+            parts: list[str] = []
+            if paired_windows is not None:
+                try:
+                    parts.append(f"{int(paired_windows)} windows")
+                except Exception:
+                    parts.append(f"windows={paired_windows}")
+            if isinstance(match_frac, int | float) and math.isfinite(float(match_frac)):
+                parts.append(f"{float(match_frac) * 100.0:.1f}% match")
+            elif match_frac is not None:
+                parts.append(f"match={match_frac}")
+            if isinstance(overlap_frac, int | float) and math.isfinite(
+                float(overlap_frac)
+            ):
+                parts.append(f"{float(overlap_frac) * 100.0:.1f}% overlap")
+            elif overlap_frac is not None:
+                parts.append(f"overlap={overlap_frac}")
+            lines.append(f"- ✅ Pairing: {', '.join(parts) if parts else 'N/A'}")
         if isinstance(bootstrap, dict):
             reps = bootstrap.get("replicates")
             bseed = bootstrap.get("seed")
             if reps is not None or bseed is not None:
-
+                bits: list[str] = []
+                if reps is not None:
+                    try:
+                        bits.append(f"{int(reps)} replicates")
+                    except Exception:
+                        bits.append(f"replicates={reps}")
+                if bseed is not None:
+                    try:
+                        bits.append(f"seed={int(bseed)}")
+                    except Exception:
+                        bits.append(f"seed={bseed}")
+                lines.append(f"- ✅ Bootstrap: {', '.join(bits) if bits else 'N/A'}")
         # Optional: show log-space paired Δ CI next to ratio CI for clarity
-        delta_ci =
-            "
-        ).get("logloss_delta_ci")
+        delta_ci = evaluation_report.get("primary_metric", {}).get(
+            "ci"
+        ) or evaluation_report.get("ppl", {}).get("logloss_delta_ci")
         if (
             isinstance(delta_ci, tuple | list)
             and len(delta_ci) == 2
             and all(isinstance(x, int | float) for x in delta_ci)
         ):
-            lines.append(
+            lines.append(
+                f"- ℹ️ Log Δ (paired) CI: [{delta_ci[0]:.6f}, {delta_ci[1]:.6f}]"
+            )
     except Exception:
         pass
 
@@ -654,124 +1187,198 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
 
     lines.append("")
 
+    _append_primary_metric_section(lines, evaluation_report)
+
     # Guard observability snapshots
     lines.append("## Guard Observability")
     lines.append("")
 
-    spectral_info =
+    spectral_info = evaluation_report.get("spectral", {}) or {}
     if spectral_info:
-        lines.append("### Spectral Guard")
+        lines.append("### Spectral Guard Summary")
         lines.append("")
-
-
-
-
-
-
-
-
-
-
-
-        spec_sigma = spectral_info.get("sigma_quantile")
-        spec_deadband = spectral_info.get("deadband")
-        spec_max_caps = spectral_info.get("max_caps")
-        summary_yaml = {
-            "sigma_quantile": float(spec_sigma)
-            if isinstance(spec_sigma, int | float)
-            else None,
-            "deadband": float(spec_deadband)
-            if isinstance(spec_deadband, int | float)
-            else None,
-            "max_caps": int(spec_max_caps)
-            if isinstance(spec_max_caps, int | float)
-            else None,
-        }
-        # Drop Nones from summary
-        summary_yaml = {k: v for k, v in summary_yaml.items() if v is not None}
-        if summary_yaml:
-            lines.append("- **Spectral Summary:**")
-            lines.append(" ```yaml")
-            for line in (
-                yaml.safe_dump(summary_yaml, sort_keys=True, width=70)
-                .strip()
-                .splitlines()
-            ):
-                lines.append(f" {line}")
-            lines.append(" ```")
+        lines.append("| Metric | Value | Status |")
+        lines.append("|--------|-------|--------|")
+
+        spectral_ok = bool(validation.get("spectral_stable", False))
+        caps_applied = spectral_info.get("caps_applied")
+        max_caps = spectral_info.get("max_caps")
+        caps_val = (
+            f"{caps_applied}/{max_caps}"
+            if caps_applied is not None and max_caps is not None
+            else "-"
+        )
         lines.append(
-            f"
+            f"| Caps Applied | {caps_val} | {'✅ OK' if spectral_ok else '❌ FAIL'} |"
         )
+
         summary = spectral_info.get("summary", {}) or {}
-
-
+        caps_exceeded = summary.get("caps_exceeded")
+        if caps_exceeded is not None:
+            cap_status = "✅ OK" if not bool(caps_exceeded) else "⚠️ WARN"
+            lines.append(f"| Caps Exceeded | {caps_exceeded} | {cap_status} |")
+
+        top_scores = spectral_info.get("top_z_scores") or {}
+        max_family: str | None = None
+        max_module: str | None = None
+        max_abs_z: float | None = None
+        if isinstance(top_scores, dict):
+            for family, entries in top_scores.items():
+                if not isinstance(entries, list):
+                    continue
+                for entry in entries:
+                    if not isinstance(entry, dict):
+                        continue
+                    z_val = entry.get("z")
+                    if not (
+                        isinstance(z_val, int | float) and math.isfinite(float(z_val))
+                    ):
+                        continue
+                    z_abs = abs(float(z_val))
+                    if max_abs_z is None or z_abs > max_abs_z:
+                        max_abs_z = z_abs
+                        max_family = str(family)
+                        max_module = (
+                            str(entry.get("module")) if entry.get("module") else None
+                        )
+
         family_caps = spectral_info.get("family_caps") or {}
-
-
-
-
-
-        kappa =
-
-
-
-
-
-
+        kappa = None
+        if max_family and isinstance(family_caps, dict):
+            try:
+                kappa = (family_caps.get(max_family, {}) or {}).get("kappa")
+            except Exception:
+                kappa = None
+        kappa_f = (
+            float(kappa)
+            if isinstance(kappa, int | float) and math.isfinite(float(kappa))
+            else None
+        )
+
+        if max_abs_z is not None:
+            max_val = f"{max_abs_z:.3f}"
+            if max_family:
+                max_val += f" ({max_family})"
+            if max_module:
+                max_val += f" – {max_module}"
+            if kappa_f is None:
+                max_status = "ℹ️ No κ"
+            elif max_abs_z <= kappa_f:
+                max_status = f"✅ Within κ={kappa_f:.3f}"
+            else:
+                max_status = f"❌ Exceeds κ={kappa_f:.3f}"
+            lines.append(f"| Max |z| | {max_val} | {max_status} |")
+
+        mt_info = spectral_info.get("multiple_testing", {}) or {}
+        if isinstance(mt_info, dict) and mt_info:
+            mt_method = mt_info.get("method")
+            mt_alpha = mt_info.get("alpha")
+            mt_m = mt_info.get("m")
+            parts: list[str] = []
+            if mt_method:
+                parts.append(f"method={mt_method}")
+            if isinstance(mt_alpha, int | float) and math.isfinite(float(mt_alpha)):
+                parts.append(f"α={float(mt_alpha):.3g}")
+            if isinstance(mt_m, int | float) and math.isfinite(float(mt_m)):
+                parts.append(f"m={int(mt_m)}")
+            lines.append(
+                f"| Multiple Testing | {', '.join(parts) if parts else '—'} | ℹ️ INFO |"
+            )
+
+        lines.append("")
+
+        caps_by_family = spectral_info.get("caps_applied_by_family") or {}
         quantiles = spectral_info.get("family_z_quantiles") or {}
-        if
-
-
-
-
-
-
-
+        if any(
+            bool(x)
+            for x in (caps_by_family, quantiles, family_caps, top_scores)
+            if isinstance(x, dict)
+        ):
+            lines.append("<details>")
+            lines.append("<summary>Per-family details</summary>")
+            lines.append("")
+            lines.append("| Family | κ | q95 | Max |z| | Violations |")
+            lines.append("|--------|---|-----|--------|------------|")
+
+            families: set[str] = set()
+            for block in (caps_by_family, quantiles, family_caps, top_scores):
+                if isinstance(block, dict):
+                    families.update(str(k) for k in block.keys())
+
+            for family in sorted(families):
+                kappa = None
+                if isinstance(family_caps, dict):
+                    kappa = (family_caps.get(family, {}) or {}).get("kappa")
+                kappa_str = (
+                    f"{float(kappa):.3f}"
+                    if isinstance(kappa, int | float) and math.isfinite(float(kappa))
+                    else "-"
+                )
+
+                q95 = None
+                max_z = None
+                if isinstance(quantiles, dict):
+                    stats = quantiles.get(family) or {}
+                    if isinstance(stats, dict):
+                        q95 = stats.get("q95")
+                        max_z = stats.get("max")
                 q95_str = f"{q95:.3f}" if isinstance(q95, int | float) else "-"
-                q99_str = f"{q99:.3f}" if isinstance(q99, int | float) else "-"
                 max_str = f"{max_z:.3f}" if isinstance(max_z, int | float) else "-"
-
+
+                violations = None
+                if isinstance(caps_by_family, dict):
+                    violations = caps_by_family.get(family)
+                v_str = (
+                    str(int(violations)) if isinstance(violations, int | float) else "0"
+                )
+
                 lines.append(
-                    f"| {family} | {
+                    f"| {family} | {kappa_str} | {q95_str} | {max_str} | {v_str} |"
                 )
+
+            if isinstance(top_scores, dict) and top_scores:
+                lines.append("")
+                lines.append("Top |z| per family:")
+                for family in sorted(top_scores.keys()):
+                    entries = top_scores[family]
+                    if not isinstance(entries, list) or not entries:
+                        continue
+                    formatted_entries = []
+                    for entry in entries:
+                        if not isinstance(entry, dict):
+                            continue
+                        module_name = entry.get("module", "unknown")
+                        z_val = entry.get("z")
+                        if isinstance(z_val, int | float) and math.isfinite(
+                            float(z_val)
+                        ):
+                            z_str = f"{z_val:.3f}"
+                        else:
+                            z_str = "n/a"
+                        formatted_entries.append(f"{module_name} (|z|={z_str})")
+                    lines.append(f"- {family}: {', '.join(formatted_entries)}")
+
             lines.append("")
-
-        if policy_caps:
-            lines.append("- **Family κ (policy):**")
-            lines.append(" ```yaml")
-            caps_yaml = (
-                yaml.safe_dump(policy_caps, sort_keys=True, width=70)
-                .strip()
-                .splitlines()
-            )
-            for line in caps_yaml:
-                lines.append(f" {line}")
-            lines.append(" ```")
-        top_scores = spectral_info.get("top_z_scores") or {}
-        if top_scores:
-            lines.append("Top |z| per family:")
-            for family in sorted(top_scores.keys()):
-                entries = top_scores[family]
-                if not entries:
-                    continue
-                formatted_entries = []
-                for entry in entries:
-                    module_name = entry.get("module", "unknown")
-                    z_val = entry.get("z")
-                    if isinstance(z_val, int | float) and math.isfinite(float(z_val)):
-                        z_str = f"{z_val:.3f}"
-                    else:
-                        z_str = "n/a"
-                    formatted_entries.append(f"{module_name} (|z|={z_str})")
-                lines.append(f"- {family}: {', '.join(formatted_entries)}")
+            lines.append("</details>")
         lines.append("")
 
-    rmt_info =
+    rmt_info = evaluation_report.get("rmt", {}) or {}
     if rmt_info:
         lines.append("### RMT Guard")
         lines.append("")
         families = rmt_info.get("families") or {}
+        stable = bool(rmt_info.get("stable", True))
+        status = "✅ OK" if stable else "❌ FAIL"
+        delta_total = rmt_info.get("delta_total")
+        if isinstance(delta_total, int):
+            lines.append(f"- Δ total: {delta_total:+d}")
+        lines.append(f"- Status: {status}")
+        lines.append(f"- Families: {len(families)}")
         if families:
+            lines.append("")
+            lines.append("<details>")
+            lines.append("<summary>RMT family details</summary>")
+            lines.append("")
             lines.append("| Family | ε_f | Bare | Guarded | Δ |")
             lines.append("|--------|-----|------|---------|---|")
             for family, data in families.items():
@@ -801,14 +1408,12 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
                 f"| {family} | {epsilon_str} | {bare_str} | {guarded_str} | {delta_str} |"
             )
             lines.append("")
-
-
-
-        lines.append(
-        lines.append(f"- Stable: {rmt_info.get('stable', True)}")
-        lines.append("")
+            lines.append("</details>")
+            lines.append("")
+        else:
+            lines.append("")
 
-    guard_overhead_info =
+    guard_overhead_info = evaluation_report.get("guard_overhead", {}) or {}
     if guard_overhead_info:
         lines.append("### Guard Overhead")
         lines.append("")
@@ -836,7 +1441,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         overhead_source = guard_overhead_info.get("source")
         if overhead_source:
             lines.append(f"- Source: {overhead_source}")
-        plan_ctx =
+        plan_ctx = evaluation_report.get("provenance", {}).get("window_plan", {})
         if isinstance(plan_ctx, dict) and plan_ctx:
             plan_preview = (
                 plan_ctx.get("preview_n")
@@ -855,34 +1460,34 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     lines.append("")

     compression_diag = (
-
-        if isinstance(
+        evaluation_report.get("structure", {}).get("compression_diagnostics", {})
+        if isinstance(evaluation_report.get("structure"), dict)
         else {}
     )
     inference_flags = compression_diag.get("inferred") or {}
     inference_sources = compression_diag.get("inference_source") or {}
     inference_log = compression_diag.get("inference_log") or []
     if inference_flags or inference_sources or inference_log:
-
-
+        appendix_lines.append("### Inference Diagnostics")
+        appendix_lines.append("")
         if inference_flags:
-
+            appendix_lines.append("- **Fields Inferred:**")
             for field, flag in inference_flags.items():
-
+                appendix_lines.append(f" - {field}: {'yes' if flag else 'no'}")
         if inference_sources:
-
+            appendix_lines.append("- **Sources:**")
             for field, source in inference_sources.items():
-
+                appendix_lines.append(f" - {field}: {source}")
         if inference_log:
-
+            appendix_lines.append("- **Inference Log:**")
             for entry in inference_log:
-
-
+                appendix_lines.append(f" - {entry}")
+        appendix_lines.append("")

     # Model and Configuration
     lines.append("## Model Information")
     lines.append("")
-    meta =
+    meta = evaluation_report["meta"]
     lines.append(f"- **Model ID:** {meta.get('model_id')}")
     lines.append(f"- **Adapter:** {meta.get('adapter')}")
     lines.append(f"- **Device:** {meta.get('device')}")
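Editor's note: the hunk above starts routing informational diagnostics into an `appendix_lines` buffer instead of the main `lines` list; later hunks flush that buffer under an `## Appendix` heading. A small self-contained sketch of that deferral pattern follows; the section names and content are illustrative only.

    # Sketch of the deferred-appendix pattern used by the new renderer code:
    # informational sections are buffered and emitted at the end of the report.
    def build_report() -> str:
        lines: list[str] = ["# Report", ""]
        appendix_lines: list[str] = []

        # Main body sections go straight into `lines`.
        lines += ["## Summary", "", "- Status: pass", ""]

        # Lower-priority diagnostics are buffered for the appendix.
        appendix_lines += ["### Inference Diagnostics", "", "- fields inferred: none", ""]

        if appendix_lines:
            lines += ["## Appendix", ""]
            lines.extend(appendix_lines)
        return "\n".join(lines)

    print(build_report())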
@@ -906,34 +1511,54 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     if invarlock_version:
         lines.append(f"- **InvarLock Version:** {invarlock_version}")
     env_flags = meta.get("env_flags")
-    if isinstance(env_flags, dict) and env_flags:
-        lines.append("- **Env Flags:**")
-        lines.append(" ```yaml")
-        for k, v in env_flags.items():
-            lines.append(f" {k}: {v}")
-        lines.append(" ```")
-    # Determinism flags (if present)
     cuda_flags = meta.get("cuda_flags")
+
+    # Compressed determinism/environment summary for readability
+    det_parts: list[str] = []
+    for label, keys in (
+        ("torch_det", ("torch_deterministic_algorithms", "deterministic_algorithms")),
+        ("cudnn_det", ("cudnn_deterministic",)),
+        ("cudnn_bench", ("cudnn_benchmark",)),
+        ("tf32_matmul", ("cuda_matmul_allow_tf32",)),
+        ("tf32_cudnn", ("cudnn_allow_tf32",)),
+        ("cublas_ws", ("CUBLAS_WORKSPACE_CONFIG",)),
+    ):
+        val = None
+        for key in keys:
+            if isinstance(env_flags, dict) and env_flags.get(key) is not None:
+                val = env_flags.get(key)
+                break
+            if isinstance(cuda_flags, dict) and cuda_flags.get(key) is not None:
+                val = cuda_flags.get(key)
+                break
+        if val is not None:
+            det_parts.append(f"{label}={val}")
+    if det_parts:
+        lines.append(f"- **Determinism:** {', '.join(det_parts)}")
+
+    full_flags: dict[str, Any] = {}
+    if isinstance(env_flags, dict) and env_flags:
+        full_flags["env_flags"] = env_flags
     if isinstance(cuda_flags, dict) and cuda_flags:
-
-
-
-
-
-
-
-
-    ):
-
-
-
-
+        full_flags["cuda_flags"] = cuda_flags
+    if full_flags:
+        lines.append("")
+        lines.append("<details>")
+        lines.append("<summary>Environment flags (full)</summary>")
+        lines.append("")
+        lines.append("```yaml")
+        flags_yaml = yaml.safe_dump(full_flags, sort_keys=True, width=80).strip()
+        for line in flags_yaml.splitlines():
+            lines.append(line)
+        lines.append("```")
+        lines.append("")
+        lines.append("</details>")
     lines.append("")

     # Edit Configuration (removed duplicate Edit Information section)

     # Auto-tuning Configuration
-    auto =
+    auto = evaluation_report["auto"]
     if auto["tier"] != "none":
         lines.append("## Auto-Tuning Configuration")
         lines.append("")
|
|
|
951
1576
|
pass
|
|
952
1577
|
lines.append("")
|
|
953
1578
|
|
|
954
|
-
|
|
955
|
-
if resolved_policy:
|
|
956
|
-
lines.append("## Resolved Policy")
|
|
957
|
-
lines.append("")
|
|
958
|
-
lines.append("```yaml")
|
|
959
|
-
resolved_yaml = yaml.safe_dump(
|
|
960
|
-
resolved_policy, sort_keys=True, width=80, default_flow_style=False
|
|
961
|
-
).strip()
|
|
962
|
-
for line in resolved_yaml.splitlines():
|
|
963
|
-
lines.append(line)
|
|
964
|
-
lines.append("```")
|
|
965
|
-
lines.append("")
|
|
966
|
-
|
|
967
|
-
policy_provenance = certificate.get("policy_provenance", {})
|
|
968
|
-
if policy_provenance:
|
|
969
|
-
lines.append("## Policy Provenance")
|
|
970
|
-
lines.append("")
|
|
971
|
-
lines.append(f"- **Tier:** {policy_provenance.get('tier')}")
|
|
972
|
-
overrides_list = policy_provenance.get("overrides") or []
|
|
973
|
-
if overrides_list:
|
|
974
|
-
lines.append(f"- **Overrides:** {', '.join(overrides_list)}")
|
|
975
|
-
else:
|
|
976
|
-
lines.append("- **Overrides:** (none)")
|
|
977
|
-
digest_value = policy_provenance.get("policy_digest")
|
|
978
|
-
if digest_value:
|
|
979
|
-
lines.append(f"- **Policy Digest:** `{digest_value}`")
|
|
980
|
-
else:
|
|
981
|
-
lines.append("- **Policy Digest:** (not recorded)")
|
|
982
|
-
if policy_provenance.get("resolved_at"):
|
|
983
|
-
lines.append(f"- **Resolved At:** {policy_provenance.get('resolved_at')}")
|
|
984
|
-
lines.append("")
|
|
985
|
-
|
|
986
|
-
# Dataset Information
|
|
987
|
-
lines.append("## Dataset Configuration")
|
|
988
|
-
lines.append("")
|
|
989
|
-
dataset = certificate.get("dataset", {}) or {}
|
|
990
|
-
prov = (
|
|
991
|
-
(dataset.get("provider") or "unknown")
|
|
992
|
-
if isinstance(dataset, dict)
|
|
993
|
-
else "unknown"
|
|
994
|
-
)
|
|
995
|
-
lines.append(f"- **Provider:** {prov}")
|
|
996
|
-
try:
|
|
997
|
-
seq_len_val = (
|
|
998
|
-
int(dataset.get("seq_len"))
|
|
999
|
-
if isinstance(dataset.get("seq_len"), int | float)
|
|
1000
|
-
else dataset.get("seq_len")
|
|
1001
|
-
)
|
|
1002
|
-
except Exception: # pragma: no cover - defensive
|
|
1003
|
-
seq_len_val = dataset.get("seq_len")
|
|
1004
|
-
if seq_len_val is not None:
|
|
1005
|
-
lines.append(f"- **Sequence Length:** {seq_len_val}")
|
|
1006
|
-
windows_blk = (
|
|
1007
|
-
dataset.get("windows", {}) if isinstance(dataset.get("windows"), dict) else {}
|
|
1008
|
-
)
|
|
1009
|
-
win_prev = windows_blk.get("preview")
|
|
1010
|
-
win_final = windows_blk.get("final")
|
|
1011
|
-
if win_prev is not None and win_final is not None:
|
|
1012
|
-
lines.append(f"- **Windows:** {win_prev} preview + {win_final} final")
|
|
1013
|
-
if windows_blk.get("seed") is not None:
|
|
1014
|
-
lines.append(f"- **Seed:** {windows_blk.get('seed')}")
|
|
1015
|
-
hash_blk = dataset.get("hash", {}) if isinstance(dataset.get("hash"), dict) else {}
|
|
1016
|
-
if hash_blk.get("preview_tokens") is not None:
|
|
1017
|
-
lines.append(f"- **Preview Tokens:** {hash_blk.get('preview_tokens'):,}")
|
|
1018
|
-
if hash_blk.get("final_tokens") is not None:
|
|
1019
|
-
lines.append(f"- **Final Tokens:** {hash_blk.get('final_tokens'):,}")
|
|
1020
|
-
if hash_blk.get("total_tokens") is not None:
|
|
1021
|
-
lines.append(f"- **Total Tokens:** {hash_blk.get('total_tokens'):,}")
|
|
1022
|
-
if hash_blk.get("dataset"):
|
|
1023
|
-
lines.append(f"- **Dataset Hash:** {hash_blk.get('dataset')}")
|
|
1024
|
-
tokenizer = dataset.get("tokenizer", {})
|
|
1025
|
-
if tokenizer.get("name") or tokenizer.get("hash"):
|
|
1026
|
-
vocab_size = tokenizer.get("vocab_size")
|
|
1027
|
-
vocab_suffix = f" (vocab {vocab_size})" if isinstance(vocab_size, int) else ""
|
|
1028
|
-
lines.append(
|
|
1029
|
-
f"- **Tokenizer:** {tokenizer.get('name', 'unknown')}{vocab_suffix}"
|
|
1030
|
-
)
|
|
1031
|
-
if tokenizer.get("hash"):
|
|
1032
|
-
lines.append(f" - Hash: {tokenizer['hash']}")
|
|
1033
|
-
lines.append(
|
|
1034
|
-
f" - BOS/EOS: {tokenizer.get('bos_token')} / {tokenizer.get('eos_token')}"
|
|
1035
|
-
)
|
|
1036
|
-
if tokenizer.get("pad_token") is not None:
|
|
1037
|
-
lines.append(f" - PAD: {tokenizer.get('pad_token')}")
|
|
1038
|
-
if tokenizer.get("add_prefix_space") is not None:
|
|
1039
|
-
lines.append(f" - add_prefix_space: {tokenizer.get('add_prefix_space')}")
|
|
1040
|
-
lines.append("")
|
|
1041
|
-
|
|
1042
|
-
provenance_info = certificate.get("provenance", {}) or {}
|
|
1043
|
-
if provenance_info:
|
|
1044
|
-
lines.append("## Run Provenance")
|
|
1045
|
-
lines.append("")
|
|
1046
|
-
baseline_info = provenance_info.get("baseline", {}) or {}
|
|
1047
|
-
if baseline_info:
|
|
1048
|
-
lines.append(f"- **Baseline Run ID:** {baseline_info.get('run_id')}")
|
|
1049
|
-
if baseline_info.get("report_hash"):
|
|
1050
|
-
lines.append(f" - Report Hash: `{baseline_info.get('report_hash')}`")
|
|
1051
|
-
if baseline_info.get("report_path"):
|
|
1052
|
-
lines.append(f" - Report Path: {baseline_info.get('report_path')}")
|
|
1053
|
-
edited_info = provenance_info.get("edited", {}) or {}
|
|
1054
|
-
if edited_info:
|
|
1055
|
-
lines.append(f"- **Edited Run ID:** {edited_info.get('run_id')}")
|
|
1056
|
-
if edited_info.get("report_hash"):
|
|
1057
|
-
lines.append(f" - Report Hash: `{edited_info.get('report_hash')}`")
|
|
1058
|
-
if edited_info.get("report_path"):
|
|
1059
|
-
lines.append(f" - Report Path: {edited_info.get('report_path')}")
|
|
1060
|
-
window_plan = provenance_info.get("window_plan")
|
|
1061
|
-
if isinstance(window_plan, dict) and window_plan:
|
|
1062
|
-
preview_val = window_plan.get(
|
|
1063
|
-
"preview_n", window_plan.get("actual_preview")
|
|
1064
|
-
)
|
|
1065
|
-
final_val = window_plan.get("final_n", window_plan.get("actual_final"))
|
|
1066
|
-
lines.append(
|
|
1067
|
-
f"- **Window Plan:** profile={window_plan.get('profile')}, preview={preview_val}, final={final_val}"
|
|
1068
|
-
)
|
|
1069
|
-
provider_digest = provenance_info.get("provider_digest")
|
|
1070
|
-
if isinstance(provider_digest, dict) and provider_digest:
|
|
1071
|
-
ids_d = provider_digest.get("ids_sha256")
|
|
1072
|
-
tok_d = provider_digest.get("tokenizer_sha256")
|
|
1073
|
-
mask_d = provider_digest.get("masking_sha256")
|
|
1074
|
-
|
|
1075
|
-
lines.append("- **Provider Digest:**")
|
|
1076
|
-
if tok_d:
|
|
1077
|
-
lines.append(
|
|
1078
|
-
f" - tokenizer_sha256: `{_short_digest(tok_d)}` (full in JSON)"
|
|
1079
|
-
)
|
|
1080
|
-
if ids_d:
|
|
1081
|
-
lines.append(f" - ids_sha256: `{_short_digest(ids_d)}` (full in JSON)")
|
|
1082
|
-
if mask_d:
|
|
1083
|
-
lines.append(
|
|
1084
|
-
f" - masking_sha256: `{_short_digest(mask_d)}` (full in JSON)"
|
|
1085
|
-
)
|
|
1086
|
-
# Surface confidence label prominently
|
|
1087
|
-
try:
|
|
1088
|
-
conf = certificate.get("confidence", {}) or {}
|
|
1089
|
-
if isinstance(conf, dict) and conf.get("label"):
|
|
1090
|
-
lines.append(f"- **Confidence:** {conf.get('label')}")
|
|
1091
|
-
except Exception:
|
|
1092
|
-
pass
|
|
1093
|
-
# Surface policy version + thresholds hash (short)
|
|
1094
|
-
try:
|
|
1095
|
-
pd = certificate.get("policy_digest", {}) or {}
|
|
1096
|
-
if isinstance(pd, dict) and pd:
|
|
1097
|
-
pv = pd.get("policy_version")
|
|
1098
|
-
th = pd.get("thresholds_hash")
|
|
1099
|
-
if pv:
|
|
1100
|
-
lines.append(f"- **Policy Version:** {pv}")
|
|
1101
|
-
if isinstance(th, str) and th:
|
|
1102
|
-
short = th if len(th) <= 16 else (th[:8] + "…" + th[-8:])
|
|
1103
|
-
lines.append(f"- **Thresholds Digest:** `{short}` (full in JSON)")
|
|
1104
|
-
if pd.get("changed"):
|
|
1105
|
-
lines.append("- Note: policy changed")
|
|
1106
|
-
except Exception:
|
|
1107
|
-
pass
|
|
1108
|
-
lines.append("")
|
|
1579
|
+
_append_dataset_and_provenance_section(lines, evaluation_report)
|
|
1109
1580
|
|
|
1110
1581
|
# Structural Changes heading is printed with content later; avoid empty header here
|
|
1111
1582
|
|
|
1112
|
-
# Primary Metric (metric-v1) snapshot, if present
|
|
1113
|
-
try:
|
|
1114
|
-
pm = certificate.get("primary_metric")
|
|
1115
|
-
if isinstance(pm, dict) and pm:
|
|
1116
|
-
kind = pm.get("kind", "unknown")
|
|
1117
|
-
lines.append(f"## Primary Metric ({kind})")
|
|
1118
|
-
lines.append("")
|
|
1119
|
-
unit = pm.get("unit", "-")
|
|
1120
|
-
paired = pm.get("paired", False)
|
|
1121
|
-
reps = None
|
|
1122
|
-
# Snapshot only; bootstrap reps live in ppl.stats.bootstrap for ppl metrics
|
|
1123
|
-
# Mark estimated metrics (e.g., pseudo accuracy counts) clearly
|
|
1124
|
-
estimated_flag = False
|
|
1125
|
-
try:
|
|
1126
|
-
if bool(pm.get("estimated")):
|
|
1127
|
-
estimated_flag = True
|
|
1128
|
-
elif str(pm.get("counts_source", "")).lower() == "pseudo_config":
|
|
1129
|
-
estimated_flag = True
|
|
1130
|
-
except Exception:
|
|
1131
|
-
estimated_flag = False
|
|
1132
|
-
est_suffix = " (estimated)" if estimated_flag else ""
|
|
1133
|
-
lines.append(f"- Kind: {kind} (unit: {unit}){est_suffix}")
|
|
1134
|
-
gating_basis = pm.get("gating_basis") or pm.get("basis")
|
|
1135
|
-
if gating_basis:
|
|
1136
|
-
lines.append(f"- Basis: {gating_basis}")
|
|
1137
|
-
if isinstance(paired, bool):
|
|
1138
|
-
lines.append(f"- Paired: {paired}")
|
|
1139
|
-
reps = pm.get("reps")
|
|
1140
|
-
if isinstance(reps, int | float):
|
|
1141
|
-
lines.append(f"- Bootstrap Reps: {int(reps)}")
|
|
1142
|
-
ci = pm.get("ci") or pm.get("display_ci")
|
|
1143
|
-
if (
|
|
1144
|
-
isinstance(ci, list | tuple)
|
|
1145
|
-
and len(ci) == 2
|
|
1146
|
-
and all(isinstance(x, int | float) for x in ci)
|
|
1147
|
-
):
|
|
1148
|
-
lines.append(f"- CI: {ci[0]:.3f}–{ci[1]:.3f}")
|
|
1149
|
-
prev = pm.get("preview")
|
|
1150
|
-
fin = pm.get("final")
|
|
1151
|
-
ratio = pm.get("ratio_vs_baseline")
|
|
1152
|
-
|
|
1153
|
-
lines.append("")
|
|
1154
|
-
if estimated_flag and str(kind).lower() in {"accuracy", "vqa_accuracy"}:
|
|
1155
|
-
lines.append(
|
|
1156
|
-
"- Note: Accuracy derived from pseudo counts (quick dev preset); use a labeled preset for measured accuracy."
|
|
1157
|
-
)
|
|
1158
|
-
lines.append("| Field | Value |")
|
|
1159
|
-
lines.append("|-------|-------|")
|
|
1160
|
-
lines.append(f"| Preview | {_fmt_by_kind(prev, str(kind))} |")
|
|
1161
|
-
lines.append(f"| Final | {_fmt_by_kind(fin, str(kind))} |")
|
|
1162
|
-
# For accuracy, ratio field is actually a delta (as per helper); clarify inline
|
|
1163
|
-
if kind in {"accuracy", "vqa_accuracy"}:
|
|
1164
|
-
lines.append(f"| Δ vs Baseline | {_fmt_by_kind(ratio, str(kind))} |")
|
|
1165
|
-
# When baseline accuracy is near-zero, clarify display rule
|
|
1166
|
-
try:
|
|
1167
|
-
base_pt = pm.get("baseline_point")
|
|
1168
|
-
if isinstance(base_pt, int | float) and base_pt < 0.05:
|
|
1169
|
-
lines.append(
|
|
1170
|
-
"- Note: baseline < 5%; ratio suppressed; showing Δpp"
|
|
1171
|
-
)
|
|
1172
|
-
except Exception:
|
|
1173
|
-
pass
|
|
1174
|
-
else:
|
|
1175
|
-
try:
|
|
1176
|
-
lines.append(f"| Ratio vs Baseline | {float(ratio):.3f} |")
|
|
1177
|
-
except Exception:
|
|
1178
|
-
lines.append("| Ratio vs Baseline | N/A |")
|
|
1179
|
-
lines.append("")
|
|
1180
|
-
# Secondary metrics (informational)
|
|
1181
|
-
try:
|
|
1182
|
-
secs = certificate.get("secondary_metrics")
|
|
1183
|
-
if isinstance(secs, list) and secs:
|
|
1184
|
-
lines.append("## Secondary Metrics (informational)")
|
|
1185
|
-
lines.append("")
|
|
1186
|
-
lines.append("| Kind | Preview | Final | vs Baseline | CI |")
|
|
1187
|
-
lines.append("|------|---------|-------|-------------|----|")
|
|
1188
|
-
for m in secs:
|
|
1189
|
-
if not isinstance(m, dict):
|
|
1190
|
-
continue
|
|
1191
|
-
k = m.get("kind", "?")
|
|
1192
|
-
pv = _fmt_by_kind(m.get("preview"), str(k))
|
|
1193
|
-
fv = _fmt_by_kind(m.get("final"), str(k))
|
|
1194
|
-
rb = m.get("ratio_vs_baseline")
|
|
1195
|
-
try:
|
|
1196
|
-
rb_str = (
|
|
1197
|
-
f"{float(rb):.3f}"
|
|
1198
|
-
if (str(k).startswith("ppl"))
|
|
1199
|
-
else _fmt_by_kind(rb, str(k))
|
|
1200
|
-
)
|
|
1201
|
-
except Exception:
|
|
1202
|
-
rb_str = "N/A"
|
|
1203
|
-
ci = m.get("display_ci") or m.get("ci")
|
|
1204
|
-
if isinstance(ci, tuple | list) and len(ci) == 2:
|
|
1205
|
-
ci_str = f"{float(ci[0]):.3f}-{float(ci[1]):.3f}"
|
|
1206
|
-
else:
|
|
1207
|
-
ci_str = "–"
|
|
1208
|
-
lines.append(f"| {k} | {pv} | {fv} | {rb_str} | {ci_str} |")
|
|
1209
|
-
lines.append("")
|
|
1210
|
-
except Exception:
|
|
1211
|
-
pass
|
|
1212
|
-
except Exception:
|
|
1213
|
-
pass
|
|
1214
|
-
|
|
1215
1583
|
# System Overhead section (latency/throughput)
|
|
1216
|
-
sys_over =
|
|
1584
|
+
sys_over = evaluation_report.get("system_overhead", {}) or {}
|
|
1217
1585
|
if isinstance(sys_over, dict) and sys_over:
|
|
1218
1586
|
_append_system_overhead_section(lines, sys_over)
|
|
1219
1587
|
|
|
1220
1588
|
# Accuracy Subgroups (informational)
|
|
1221
1589
|
try:
|
|
1222
|
-
cls =
|
|
1590
|
+
cls = evaluation_report.get("classification", {})
|
|
1223
1591
|
sub = cls.get("subgroups") if isinstance(cls, dict) else None
|
|
1224
1592
|
if isinstance(sub, dict) and sub:
|
|
1225
1593
|
_append_accuracy_subgroups(lines, sub)
|
|
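Editor's note: the large hunk above deletes the inline dataset, provenance, and metric rendering in favor of helper calls such as `_append_dataset_and_provenance_section(...)`. The helper's body is not shown in this diff; the sketch below only illustrates the general lines-mutating-helper style it appears to adopt, with hypothetical field handling, and is not the package's actual implementation.

    from typing import Any

    # Illustrative only: a lines-mutating helper in the style the diff switches to.
    # The real _append_dataset_and_provenance_section in invarlock may differ.
    def _append_dataset_section(lines: list[str], report: dict[str, Any]) -> None:
        dataset = report.get("dataset", {}) or {}
        lines.append("## Dataset Configuration")
        lines.append("")
        lines.append(f"- **Provider:** {dataset.get('provider', 'unknown')}")
        if dataset.get("seq_len") is not None:
            lines.append(f"- **Sequence Length:** {dataset['seq_len']}")
        lines.append("")

    lines: list[str] = []
    _append_dataset_section(lines, {"dataset": {"provider": "wikitext", "seq_len": 512}})
    print("\n".join(lines))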
@@ -1227,7 +1595,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         pass
     # Structural Changes
     try:
-        structure =
+        structure = evaluation_report.get("structure", {}) or {}
         params_changed = int(structure.get("params_changed", 0) or 0)
         layers_modified = int(structure.get("layers_modified", 0) or 0)
         bitwidth_changes = 0
@@ -1239,7 +1607,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         has_changes = any(
             v > 0 for v in (params_changed, layers_modified, bitwidth_changes)
         )
-        edit_name = str(
+        edit_name = str(evaluation_report.get("edit_name", "unknown"))
         if has_changes:
             lines.append("## Structural Changes")
             lines.append("")
@@ -1369,47 +1737,48 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     lines.append("")

     # Variance Guard (Spectral/RMT summaries are already provided above)
-    variance =
-
+    variance = evaluation_report["variance"]
+    appendix_lines.append("### Variance Guard")
+    appendix_lines.append("")

     # Display whether VE was enabled after A/B test
-
+    appendix_lines.append(f"- **Enabled:** {'Yes' if variance['enabled'] else 'No'}")

     if variance["enabled"]:
         # VE was enabled - show the gain
         gain_value = variance.get("gain", "N/A")
         if isinstance(gain_value, int | float):
-
+            appendix_lines.append(f"- **Gain:** {gain_value:.3f}")
         else:
-
+            appendix_lines.append(f"- **Gain:** {gain_value}")
     else:
         # VE was not enabled - show succinct reason if available, else a clear disabled message
         ppl_no_ve = variance.get("ppl_no_ve")
         ppl_with_ve = variance.get("ppl_with_ve")
         ratio_ci = variance.get("ratio_ci")
         if ppl_no_ve is not None and ppl_with_ve is not None and ratio_ci:
-
-
+            appendix_lines.append(f"- **Primary metric without VE:** {ppl_no_ve:.3f}")
+            appendix_lines.append(f"- **Primary metric with VE:** {ppl_with_ve:.3f}")
             gain_value = variance.get("gain")
             if isinstance(gain_value, int | float):
-
+                appendix_lines.append(f"- **Gain (insufficient):** {gain_value:.3f}")
         else:
-
+            appendix_lines.append(
                 "- Variance Guard: Disabled (predictive gate not evaluated for this edit)."
             )
             # Add concise rationale aligned with Balanced predictive gate contract
             try:
-                ve_policy =
+                ve_policy = evaluation_report.get("policies", {}).get("variance", {})
                 min_effect = ve_policy.get("min_effect_lognll")
                 if isinstance(min_effect, int | float):
-
+                    appendix_lines.append(
                         f"- Predictive gate (Balanced): one-sided; enables only if CI excludes 0 and |mean Δ| ≥ {float(min_effect):.4g}."
                     )
                 else:
-
+                    appendix_lines.append(
                         "- Predictive gate (Balanced): one-sided; enables only if CI excludes 0 and |mean Δ| ≥ min_effect."
                     )
-
+                appendix_lines.append(
                     "- Predictive Gate: evaluated=false (disabled under current policy/edit)."
                 )
             except Exception:
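Editor's note: the rationale strings above describe the Balanced predictive gate: variance equalization is enabled only when the confidence interval on the mean delta excludes zero and the absolute mean delta clears a minimum effect size. A self-contained sketch of that decision rule follows; the threshold and input values are illustrative, not invarlock defaults.

    # Sketch of the one-sided predictive-gate decision described in the diff:
    # enable only if the CI excludes 0 and |mean delta| >= min_effect.
    # The numbers below are made-up examples.
    def predictive_gate(ci_low: float, ci_high: float, mean_delta: float,
                        min_effect: float = 0.002) -> bool:
        ci_excludes_zero = ci_low > 0.0 or ci_high < 0.0
        return ci_excludes_zero and abs(mean_delta) >= min_effect

    print(predictive_gate(0.0011, 0.0042, 0.0026))   # True: CI excludes 0, effect large enough
    print(predictive_gate(-0.0005, 0.0042, 0.0026))  # False: CI includes 0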
@@ -1417,19 +1786,26 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:

     if variance.get("ratio_ci"):
         ratio_lo, ratio_hi = variance["ratio_ci"]
-
+        appendix_lines.append(f"- **Ratio CI:** [{ratio_lo:.3f}, {ratio_hi:.3f}]")

     if variance.get("calibration") and variance.get("enabled"):
         calib = variance["calibration"]
         coverage = calib.get("coverage")
         requested = calib.get("requested")
         status = calib.get("status", "unknown")
-
+        appendix_lines.append(
+            f"- **Calibration:** {coverage}/{requested} windows ({status})"
+        )
+    appendix_lines.append("")

     lines.append("")

     # MoE Observability (non-gating)
-    moe =
+    moe = (
+        evaluation_report.get("moe", {})
+        if isinstance(evaluation_report.get("moe"), dict)
+        else {}
+    )
     if moe:
         lines.append("## MoE Observability")
         lines.append("")
@@ -1458,46 +1834,36 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
             lines.append(f"- **{label}:** {float(moe[key]):+.4f}")
         lines.append("")

-
-    lines.append("## Applied Policies")
-    lines.append("")
-    policies = certificate["policies"]
-    for guard_name, policy in policies.items():
-        lines.append(f"### {guard_name.title()}")
-        lines.append("")
-        policy_yaml = (
-            yaml.safe_dump(policy, sort_keys=True, width=80).strip().splitlines()
-        )
-        lines.append("```yaml")
-        for line in policy_yaml:
-            lines.append(line)
-        lines.append("```")
-        lines.append("")
+    _append_policy_configuration_section(lines, evaluation_report)

-
-
-
-    artifacts = certificate["artifacts"]
+    appendix_lines.append("### Artifacts")
+    appendix_lines.append("")
+    artifacts = evaluation_report["artifacts"]
     if artifacts.get("events_path"):
-
+        appendix_lines.append(f"- **Events Log:** `{artifacts['events_path']}`")
     if artifacts.get("report_path"):
-
-
-
+        appendix_lines.append(f"- **Full Report:** `{artifacts['report_path']}`")
+    appendix_lines.append(f"- **Report Generated:** {artifacts['generated_at']}")
+    appendix_lines.append("")
+
+    if appendix_lines:
+        lines.append("## Appendix")
+        lines.append("")
+        lines.extend(appendix_lines)

-    #
-    cert_hash =
-    lines.append("##
+    # Report Hash for Integrity
+    cert_hash = _compute_report_hash(evaluation_report)
+    lines.append("## Evaluation Report Integrity")
     lines.append("")
-    lines.append(f"**
+    lines.append(f"**Report Hash:** `{cert_hash}`")
     lines.append("")
     lines.append("---")
     lines.append("")
     lines.append(
-        "*This InvarLock
+        "*This InvarLock Evaluation Report summarizes baseline‑paired evaluation results for a subject model relative to the provided baseline snapshot under the configured profile/preset.*"
     )
     lines.append(
-        "*
+        "*It reports regression-risk indicators for the measured signals; it is not a broad AI safety, alignment, or content-safety guarantee.*"
     )

     return "\n".join(lines)
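Editor's note: the new `_compute_report_hash(evaluation_report)` call replaces the old certificate hash, but its implementation is not part of this hunk. A common way to fingerprint a report like this is to hash a canonical JSON serialization; the sketch below illustrates that approach under that assumption and is not necessarily how invarlock computes the hash.

    import hashlib
    import json
    from typing import Any

    # Assumption: the report hash is a SHA-256 digest over a canonical JSON form.
    # Illustrative sketch only, not invarlock's actual _compute_report_hash.
    def compute_report_hash(report: dict[str, Any]) -> str:
        canonical = json.dumps(report, sort_keys=True, separators=(",", ":"), default=str)
        return hashlib.sha256(canonical.encode("utf-8")).hexdigest()

    print(compute_report_hash({"meta": {"model_id": "demo"}, "auto": {"tier": "none"}}))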