invarlock 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. invarlock/__init__.py +2 -2
  2. invarlock/adapters/__init__.py +10 -14
  3. invarlock/adapters/auto.py +35 -40
  4. invarlock/adapters/capabilities.py +2 -2
  5. invarlock/adapters/hf_causal.py +418 -0
  6. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  7. invarlock/adapters/hf_mixin.py +25 -4
  8. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  9. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  10. invarlock/cli/adapter_auto.py +31 -21
  11. invarlock/cli/app.py +73 -2
  12. invarlock/cli/commands/certify.py +600 -59
  13. invarlock/cli/commands/doctor.py +8 -10
  14. invarlock/cli/commands/plugins.py +13 -9
  15. invarlock/cli/commands/report.py +233 -69
  16. invarlock/cli/commands/run.py +907 -183
  17. invarlock/cli/commands/verify.py +76 -11
  18. invarlock/cli/config.py +1 -1
  19. invarlock/cli/doctor_helpers.py +4 -5
  20. invarlock/cli/output.py +193 -0
  21. invarlock/cli/provenance.py +1 -1
  22. invarlock/core/bootstrap.py +1 -1
  23. invarlock/core/registry.py +9 -11
  24. invarlock/core/runner.py +111 -25
  25. invarlock/edits/quant_rtn.py +65 -37
  26. invarlock/eval/bench.py +3 -3
  27. invarlock/eval/data.py +68 -23
  28. invarlock/eval/metrics.py +59 -1
  29. invarlock/eval/tasks/__init__.py +12 -0
  30. invarlock/eval/tasks/classification.py +48 -0
  31. invarlock/eval/tasks/qa.py +36 -0
  32. invarlock/eval/tasks/text_generation.py +102 -0
  33. invarlock/guards/invariants.py +19 -10
  34. invarlock/guards/rmt.py +2 -2
  35. invarlock/guards/variance.py +2 -2
  36. invarlock/model_profile.py +48 -27
  37. invarlock/observability/health.py +6 -6
  38. invarlock/observability/metrics.py +108 -0
  39. invarlock/reporting/certificate.py +159 -9
  40. invarlock/reporting/certificate_schema.py +1 -1
  41. invarlock/reporting/guards_analysis.py +154 -4
  42. invarlock/reporting/html.py +55 -5
  43. invarlock/reporting/normalizer.py +7 -0
  44. invarlock/reporting/render.py +791 -431
  45. invarlock/reporting/report.py +39 -3
  46. invarlock/reporting/report_types.py +6 -1
  47. invarlock/reporting/telemetry.py +86 -0
  48. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/METADATA +23 -9
  49. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/RECORD +53 -48
  50. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
  51. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
  52. invarlock/adapters/hf_gpt2.py +0 -404
  53. invarlock/adapters/hf_llama.py +0 -487
  54. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
  55. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,9 @@
1
1
  """
2
- InvarLock Safety Certificate Generation
3
- ==================================
2
+ InvarLock Evaluation Certificate Generation
3
+ ==========================================
4
4
 
5
- Generate standardized safety certificates from RunReport and baseline comparison.
5
+ Generate standardized evaluation certificates from RunReport and baseline
6
+ comparison.
6
7
  Certificates are standalone, portable verification artifacts that can be used
7
8
  for CI/CD gates and regulatory compliance.
8
9
  """
@@ -743,7 +744,7 @@ def make_certificate(
743
744
  baseline: RunReport | dict[str, Any],
744
745
  ) -> dict[str, Any]:
745
746
  """
746
- Generate a safety certificate from a RunReport and baseline comparison.
747
+ Generate an evaluation certificate from a RunReport and baseline comparison.
747
748
 
748
749
  The certificate is a standalone, portable artifact that contains all
749
750
  essential metrics and comparisons needed for safety verification.
@@ -764,6 +765,17 @@ def make_certificate(
764
765
  # Normalize baseline input
765
766
  baseline_raw = baseline
766
767
  baseline_normalized = _normalize_baseline(baseline_raw)
768
+ baseline_report: RunReport | None = None
769
+ try:
770
+ if (
771
+ isinstance(baseline_raw, dict)
772
+ and "meta" in baseline_raw
773
+ and "metrics" in baseline_raw
774
+ and "edit" in baseline_raw
775
+ ):
776
+ baseline_report = _normalize_and_validate_report(baseline_raw)
777
+ except Exception: # pragma: no cover - baseline compare is best-effort
778
+ baseline_report = None
767
779
 
768
780
  # Extract core metadata with full seed bundle
769
781
  meta = _extract_certificate_meta(report)
@@ -1440,7 +1452,7 @@ def make_certificate(
1440
1452
  ppl_analysis["window_plan"] = window_plan_ctx
1441
1453
 
1442
1454
  # Extract invariant status
1443
- invariants = _extract_invariants(report)
1455
+ invariants = _extract_invariants(report, baseline=baseline_report)
1444
1456
 
1445
1457
  # Extract spectral analysis
1446
1458
  spectral = _extract_spectral_analysis(report, baseline_normalized)
@@ -1558,7 +1570,13 @@ def make_certificate(
1558
1570
  telemetry: dict[str, Any] = {}
1559
1571
  metrics_section = report.get("metrics", {})
1560
1572
  if isinstance(metrics_section, dict):
1561
- for key in ("latency_ms_per_tok", "memory_mb_peak", "throughput_tok_per_s"):
1573
+ for key in (
1574
+ "latency_ms_per_tok",
1575
+ "memory_mb_peak",
1576
+ "gpu_memory_mb_peak",
1577
+ "gpu_memory_reserved_mb_peak",
1578
+ "throughput_tok_per_s",
1579
+ ):
1562
1580
  value = metrics_section.get(key)
1563
1581
  if isinstance(value, int | float) and math.isfinite(value):
1564
1582
  telemetry[key] = float(value)
@@ -1755,6 +1773,7 @@ def make_certificate(
1755
1773
  capacity_examples = None
1756
1774
 
1757
1775
  pm_acceptance_range = _resolve_pm_acceptance_range_from_report(report)
1776
+ pm_drift_band = _resolve_pm_drift_band_from_report(report)
1758
1777
 
1759
1778
  # Primary metric tail evidence and gate evaluation (ΔlogNLL vs baseline, per-window).
1760
1779
  pm_tail_result: dict[str, Any] = {}
@@ -1881,6 +1900,12 @@ def make_certificate(
1881
1900
  except Exception: # pragma: no cover - defensive against patched functions
1882
1901
  validation_kwargs["pm_acceptance_range"] = pm_acceptance_range
1883
1902
 
1903
+ try:
1904
+ if "pm_drift_band" in inspect.signature(_compute_validation_flags).parameters:
1905
+ validation_kwargs["pm_drift_band"] = pm_drift_band
1906
+ except Exception: # pragma: no cover - defensive against patched functions
1907
+ validation_kwargs["pm_drift_band"] = pm_drift_band
1908
+
1884
1909
  try:
1885
1910
  if "pm_tail" in inspect.signature(_compute_validation_flags).parameters:
1886
1911
  validation_kwargs["pm_tail"] = pm_tail_result
@@ -2176,6 +2201,13 @@ def make_certificate(
2176
2201
  from .primary_metric_utils import attach_primary_metric as _attach_pm
2177
2202
 
2178
2203
  _attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
2204
+ try:
2205
+ if isinstance(pm_drift_band, dict) and pm_drift_band:
2206
+ pm_block = certificate.get("primary_metric")
2207
+ if isinstance(pm_block, dict):
2208
+ pm_block.setdefault("drift_band", dict(pm_drift_band))
2209
+ except Exception: # pragma: no cover
2210
+ pass
2179
2211
  _enforce_display_ci_alignment(
2180
2212
  ratio_ci_source,
2181
2213
  certificate.get("primary_metric"),
@@ -2614,7 +2646,7 @@ def _extract_edit_metadata(
2614
2646
  alg_lower = str(algorithm).strip().lower()
2615
2647
  except Exception: # pragma: no cover
2616
2648
  alg_lower = ""
2617
- allowed_algorithms = {"quant_rtn", "noop"}
2649
+ allowed_algorithms = {"quant_rtn", "noop", "custom"}
2618
2650
  if alg_lower not in allowed_algorithms:
2619
2651
  algorithm = ""
2620
2652
 
@@ -3225,6 +3257,105 @@ def _resolve_pm_acceptance_range_from_report(
3225
3257
  return {"min": float(min_val), "max": float(max_val)}
3226
3258
 
3227
3259
 
3260
+ def _resolve_pm_drift_band_from_report(
3261
+ report: dict[str, Any] | None,
3262
+ ) -> dict[str, float]:
3263
+ """Resolve preview→final drift band from report context/meta/env."""
3264
+
3265
+ base_min = 0.95
3266
+ base_max = 1.05
3267
+
3268
+ def _safe_float(val: Any) -> float | None:
3269
+ try:
3270
+ if val is None:
3271
+ return None
3272
+ out = float(val)
3273
+ except Exception:
3274
+ return None
3275
+ return out if math.isfinite(out) else None
3276
+
3277
+ cfg_min = None
3278
+ cfg_max = None
3279
+
3280
+ ctx = report.get("context") if isinstance(report, dict) else None
3281
+ if isinstance(ctx, dict):
3282
+ pm_ctx = ctx.get("primary_metric")
3283
+ if isinstance(pm_ctx, dict):
3284
+ band = pm_ctx.get("drift_band")
3285
+ if isinstance(band, dict):
3286
+ cfg_min = _safe_float(band.get("min"))
3287
+ cfg_max = _safe_float(band.get("max"))
3288
+ elif isinstance(band, list | tuple) and len(band) == 2:
3289
+ cfg_min = _safe_float(band[0])
3290
+ cfg_max = _safe_float(band[1])
3291
+ if cfg_min is None or cfg_max is None:
3292
+ alt = ctx.get("pm_drift_band")
3293
+ if isinstance(alt, dict):
3294
+ cfg_min = (
3295
+ cfg_min if cfg_min is not None else _safe_float(alt.get("min"))
3296
+ )
3297
+ cfg_max = (
3298
+ cfg_max if cfg_max is not None else _safe_float(alt.get("max"))
3299
+ )
3300
+
3301
+ if (cfg_min is None or cfg_max is None) and isinstance(report, dict):
3302
+ meta = report.get("meta")
3303
+ if isinstance(meta, dict):
3304
+ meta_band = meta.get("pm_drift_band")
3305
+ if isinstance(meta_band, dict):
3306
+ cfg_min = (
3307
+ cfg_min
3308
+ if cfg_min is not None
3309
+ else _safe_float(meta_band.get("min"))
3310
+ )
3311
+ cfg_max = (
3312
+ cfg_max
3313
+ if cfg_max is not None
3314
+ else _safe_float(meta_band.get("max"))
3315
+ )
3316
+
3317
+ def _parse_env(name: str) -> float | None:
3318
+ try:
3319
+ raw = os.environ.get(name, "")
3320
+ if raw is None or str(raw).strip() == "":
3321
+ return None
3322
+ return float(raw)
3323
+ except Exception:
3324
+ return None
3325
+
3326
+ env_min = _parse_env("INVARLOCK_PM_DRIFT_MIN")
3327
+ env_max = _parse_env("INVARLOCK_PM_DRIFT_MAX")
3328
+
3329
+ has_explicit = any(v is not None for v in (cfg_min, cfg_max, env_min, env_max))
3330
+ if not has_explicit:
3331
+ return {}
3332
+
3333
+ min_val = (
3334
+ env_min if env_min is not None else cfg_min if cfg_min is not None else base_min
3335
+ )
3336
+ max_val = (
3337
+ env_max if env_max is not None else cfg_max if cfg_max is not None else base_max
3338
+ )
3339
+
3340
+ try:
3341
+ if min_val is not None and min_val <= 0:
3342
+ min_val = base_min
3343
+ except Exception:
3344
+ min_val = base_min
3345
+ try:
3346
+ if max_val is not None and max_val <= 0:
3347
+ max_val = base_max
3348
+ except Exception:
3349
+ max_val = base_max
3350
+ try:
3351
+ if min_val is not None and max_val is not None and min_val >= max_val:
3352
+ min_val, max_val = base_min, base_max
3353
+ except Exception:
3354
+ min_val, max_val = base_min, base_max
3355
+
3356
+ return {"min": float(min_val), "max": float(max_val)}
3357
+
3358
+
3228
3359
  def _compute_validation_flags(
3229
3360
  ppl: dict[str, Any],
3230
3361
  spectral: dict[str, Any],
@@ -3238,6 +3369,7 @@ def _compute_validation_flags(
3238
3369
  moe: dict[str, Any] | None = None,
3239
3370
  dataset_capacity: dict[str, Any] | None = None,
3240
3371
  pm_acceptance_range: dict[str, float] | None = None,
3372
+ pm_drift_band: dict[str, float] | None = None,
3241
3373
  pm_tail: dict[str, Any] | None = None,
3242
3374
  ) -> dict[str, bool]:
3243
3375
  """Compute validation flags for the certificate including canonical gates."""
@@ -3301,9 +3433,27 @@ def _compute_validation_flags(
3301
3433
  ratio_limit = min(ratio_limit, float(target_ratio))
3302
3434
 
3303
3435
  # Canonical Gates
3304
- # 1. Drift gate: 0.95 ≤ final/preview ≤ 1.05
3436
+ # 1. Drift gate: by default 0.95 ≤ final/preview ≤ 1.05 (configurable)
3305
3437
  drift_ratio = ppl.get("preview_final_ratio", 1.0)
3306
- preview_final_drift_acceptable = 0.95 <= drift_ratio <= 1.05
3438
+ drift_min = 0.95
3439
+ drift_max = 1.05
3440
+ if isinstance(pm_drift_band, dict):
3441
+ try:
3442
+ cand_min = pm_drift_band.get("min")
3443
+ cand_max = pm_drift_band.get("max")
3444
+ if isinstance(cand_min, int | float) and isinstance(cand_max, int | float):
3445
+ cand_min_f = float(cand_min)
3446
+ cand_max_f = float(cand_max)
3447
+ if (
3448
+ math.isfinite(cand_min_f)
3449
+ and math.isfinite(cand_max_f)
3450
+ and 0 < cand_min_f < cand_max_f
3451
+ ):
3452
+ drift_min = cand_min_f
3453
+ drift_max = cand_max_f
3454
+ except Exception: # pragma: no cover
3455
+ pass
3456
+ preview_final_drift_acceptable = drift_min <= drift_ratio <= drift_max
3307
3457
  if _tiny_relax:
3308
3458
  # Treat drift identity as informational in tiny dev demos
3309
3459
  preview_final_drift_acceptable = True
@@ -20,7 +20,7 @@ CERTIFICATE_SCHEMA_VERSION = "v1"
20
20
  # separately in metric-specific logic.
21
21
  CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
22
22
  "$schema": "https://json-schema.org/draft/2020-12/schema",
23
- "title": "InvarLock Safety Certificate",
23
+ "title": "InvarLock Evaluation Certificate",
24
24
  "type": "object",
25
25
  "required": [
26
26
  "schema_version",
@@ -23,7 +23,9 @@ def _measurement_contract_digest(contract: Any) -> str | None:
23
23
 
24
24
 
25
25
  @no_type_check
26
- def _extract_invariants(report: RunReport) -> dict[str, Any]:
26
+ def _extract_invariants(
27
+ report: RunReport, baseline: RunReport | None = None
28
+ ) -> dict[str, Any]:
27
29
  """Extract invariant check results (matches the shape used in tests)."""
28
30
  invariants_data = (report.get("metrics", {}) or {}).get("invariants", {})
29
31
  failures: list[dict[str, Any]] = []
@@ -81,6 +83,108 @@ def _extract_invariants(report: RunReport) -> dict[str, Any]:
81
83
  guard_entry = guard
82
84
  break
83
85
 
86
+ baseline_guard_entry = None
87
+ if baseline is not None:
88
+ for guard in baseline.get("guards", []) or []:
89
+ if str(guard.get("name", "")).lower() == "invariants":
90
+ baseline_guard_entry = guard
91
+ break
92
+
93
+ def _coerce_checks(value: Any) -> dict[str, Any] | None:
94
+ return value if isinstance(value, dict) else None
95
+
96
+ def _extract_guard_checks(
97
+ entry: Any,
98
+ ) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
99
+ if not isinstance(entry, dict):
100
+ return None, None
101
+ details = entry.get("details")
102
+ if not isinstance(details, dict):
103
+ return None, None
104
+ return _coerce_checks(details.get("baseline_checks")), _coerce_checks(
105
+ details.get("current_checks")
106
+ )
107
+
108
+ def _compare_invariants(
109
+ baseline_checks: dict[str, Any],
110
+ current_checks: dict[str, Any],
111
+ ) -> tuple[list[dict[str, Any]], int, int]:
112
+ violations: list[dict[str, Any]] = []
113
+
114
+ # LayerNorm coverage check
115
+ baseline_layer_norms = set(baseline_checks.get("layer_norm_paths", ()))
116
+ current_layer_norms = set(current_checks.get("layer_norm_paths", ()))
117
+ missing_layer_norms = sorted(baseline_layer_norms - current_layer_norms)
118
+ if missing_layer_norms:
119
+ violations.append(
120
+ {
121
+ "type": "layer_norm_missing",
122
+ "missing": missing_layer_norms,
123
+ "message": "Expected LayerNorm modules are missing vs baseline",
124
+ }
125
+ )
126
+
127
+ # Tokenizer / vocab alignment
128
+ baseline_vocab_sizes = baseline_checks.get("embedding_vocab_sizes")
129
+ current_vocab_sizes = current_checks.get("embedding_vocab_sizes")
130
+ if isinstance(baseline_vocab_sizes, dict):
131
+ for module_name, baseline_size in baseline_vocab_sizes.items():
132
+ current_size = None
133
+ if isinstance(current_vocab_sizes, dict):
134
+ current_size = current_vocab_sizes.get(module_name)
135
+ if current_size is None or int(current_size) != int(baseline_size):
136
+ mismatch = {
137
+ "module": module_name,
138
+ "baseline": int(baseline_size),
139
+ "current": None if current_size is None else int(current_size),
140
+ }
141
+ violations.append(
142
+ {
143
+ "type": "tokenizer_mismatch",
144
+ "message": "Embedding vocabulary size changed vs baseline",
145
+ **mismatch,
146
+ }
147
+ )
148
+
149
+ handled_keys = {
150
+ "layer_norm_paths",
151
+ "embedding_vocab_sizes",
152
+ "config_vocab_size",
153
+ }
154
+ for check_name, baseline_value in baseline_checks.items():
155
+ if check_name in handled_keys:
156
+ continue
157
+ current_value = current_checks.get(check_name)
158
+ if current_value != baseline_value:
159
+ violations.append(
160
+ {
161
+ "type": "invariant_violation",
162
+ "check": check_name,
163
+ "baseline": baseline_value,
164
+ "current": current_value,
165
+ "message": (
166
+ f"Invariant {check_name} changed from {baseline_value} to {current_value}"
167
+ ),
168
+ }
169
+ )
170
+
171
+ fatal_violation_types = {"tokenizer_mismatch"}
172
+ fatal_count = 0
173
+ warning_count = 0
174
+ annotated: list[dict[str, Any]] = []
175
+ for violation in violations:
176
+ violation_type = str(violation.get("type") or "")
177
+ severity = "fatal" if violation_type in fatal_violation_types else "warning"
178
+ annotated_violation = dict(violation)
179
+ annotated_violation.setdefault("severity", severity)
180
+ annotated.append(annotated_violation)
181
+ if severity == "fatal":
182
+ fatal_count += 1
183
+ else:
184
+ warning_count += 1
185
+
186
+ return annotated, fatal_count, warning_count
187
+
84
188
  severity_status = "pass"
85
189
  if guard_entry:
86
190
  gm = guard_entry.get("metrics", {}) or {}
@@ -108,9 +212,51 @@ def _extract_invariants(report: RunReport) -> dict[str, Any]:
108
212
  if detail:
109
213
  row["detail"] = detail
110
214
  failures.append(row)
111
- if fatal_count > 0:
215
+ base_fatal = 0
216
+ base_warn = 0
217
+ baseline_failures: list[dict[str, Any]] = []
218
+ if baseline_guard_entry is not None:
219
+ baseline_pre, baseline_post = _extract_guard_checks(baseline_guard_entry)
220
+ current_pre, current_post = _extract_guard_checks(guard_entry)
221
+ baseline_snapshot = baseline_pre or baseline_post
222
+ current_snapshot = current_post or current_pre
223
+ if isinstance(baseline_snapshot, dict) and isinstance(
224
+ current_snapshot, dict
225
+ ):
226
+ baseline_failures, base_fatal, base_warn = _compare_invariants(
227
+ baseline_snapshot, current_snapshot
228
+ )
229
+ for violation in baseline_failures:
230
+ check_name = violation.get("check")
231
+ if not check_name:
232
+ check_name = (
233
+ violation.get("module")
234
+ or violation.get("type")
235
+ or "invariant"
236
+ )
237
+ row = {
238
+ "check": str(check_name),
239
+ "type": str(violation.get("type") or "violation"),
240
+ "severity": str(violation.get("severity") or "warning"),
241
+ }
242
+ detail = {k: v for k, v in violation.items() if k not in row}
243
+ if detail:
244
+ detail.setdefault("source", "baseline_compare")
245
+ row["detail"] = detail
246
+ failures.append(row)
247
+
248
+ fatal_total = fatal_count + base_fatal
249
+ warn_total = warning_count + base_warn
250
+ try:
251
+ summary["fatal_violations"] = fatal_total
252
+ summary["warning_violations"] = warn_total
253
+ summary["violations_found"] = fatal_total + warn_total
254
+ except Exception:
255
+ pass
256
+
257
+ if fatal_total > 0:
112
258
  severity_status = "fail"
113
- elif warning_count > 0 or violations:
259
+ elif warn_total > 0 or violations:
114
260
  severity_status = "warn"
115
261
 
116
262
  # If any error-severity entry exists among failures, escalate to fail
@@ -130,12 +276,16 @@ def _extract_invariants(report: RunReport) -> dict[str, Any]:
130
276
  "warning_violations": len(failures),
131
277
  }
132
278
 
279
+ details_out = invariants_data
280
+ if not details_out and guard_entry and isinstance(guard_entry.get("details"), dict):
281
+ details_out = guard_entry.get("details", {})
282
+
133
283
  return {
134
284
  "pre": "pass",
135
285
  "post": status,
136
286
  "status": status,
137
287
  "summary": summary,
138
- "details": invariants_data,
288
+ "details": details_out,
139
289
  "failures": failures,
140
290
  }
141
291
 
@@ -12,19 +12,69 @@ from typing import Any
12
12
 
13
13
  from .render import render_certificate_markdown
14
14
 
15
+ markdown_module: Any | None = None
16
+ try:
17
+ import markdown as _markdown # type: ignore[import-untyped]
18
+ except Exception: # pragma: no cover - optional dependency
19
+ _markdown = None
20
+ else:
21
+ markdown_module = _markdown
22
+
23
+
24
+ _STATUS_BADGES = {
25
+ "\u2705 PASS": '<span class="badge pass">PASS</span>',
26
+ "\u2705 OK": '<span class="badge pass">OK</span>',
27
+ "\u274c FAIL": '<span class="badge fail">FAIL</span>',
28
+ "\u26a0\ufe0f WARN": '<span class="badge warn">WARN</span>',
29
+ "\u26a0 WARN": '<span class="badge warn">WARN</span>',
30
+ }
31
+
32
+
33
+ def _apply_status_badges(html_body: str) -> str:
34
+ updated = html_body
35
+ for token, replacement in _STATUS_BADGES.items():
36
+ updated = updated.replace(token, replacement)
37
+ return updated
38
+
15
39
 
16
40
  def render_certificate_html(certificate: dict[str, Any]) -> str:
17
41
  """Render a certificate as a simple HTML document.
18
42
 
19
- Uses the Markdown renderer and embeds the content in a <pre> block to ensure
20
- stable parity for snapshot tests without extra dependencies.
43
+ Uses the Markdown renderer and converts to HTML when available, falling back
44
+ to a <pre> block when the markdown dependency is missing.
21
45
  """
22
46
  md = render_certificate_markdown(certificate)
23
- body = f'<pre class="invarlock-md">{escape(md)}</pre>'
47
+ if markdown_module is None:
48
+ body = f'<pre class="invarlock-md">{escape(md)}</pre>'
49
+ else:
50
+ html_body = markdown_module.markdown(md, extensions=["tables", "fenced_code"])
51
+ html_body = _apply_status_badges(html_body)
52
+ body = f'<div class="invarlock-md">{html_body}</div>'
24
53
  return (
25
54
  '<!DOCTYPE html><html><head><meta charset="utf-8">'
26
- "<title>InvarLock Safety Certificate</title>"
27
- "<style>body{font-family:ui-monospace,Menlo,monospace;white-space:pre-wrap}</style>"
55
+ "<title>InvarLock Evaluation Certificate</title>"
56
+ "<style>"
57
+ ":root{--pass:#2da44e;--fail:#cf222e;--warn:#bf8700;--ink:#1f2328;"
58
+ "--muted:#57606a;--panel:#f6f8fa;--border:#d0d7de}"
59
+ "body{font-family:ui-sans-serif,system-ui,-apple-system,Segoe UI,sans-serif;"
60
+ "color:var(--ink);background:linear-gradient(180deg,#fff, #f6f8fa);"
61
+ "margin:0;padding:32px}"
62
+ ".invarlock-md{max-width:960px;margin:0 auto;padding:24px;background:#fff;"
63
+ "border:1px solid var(--border);border-radius:16px;box-shadow:0 10px 30px rgba(0,0,0,0.05)}"
64
+ "h1,h2,h3{margin-top:1.4em}h1{margin-top:0}"
65
+ "table{border-collapse:collapse;width:100%;margin:12px 0}"
66
+ "th,td{border:1px solid var(--border);padding:6px 8px;text-align:left}"
67
+ "code,pre{background:var(--panel);border-radius:8px}"
68
+ "pre{padding:12px;overflow:auto}"
69
+ ".badge{display:inline-block;padding:2px 8px;border-radius:999px;"
70
+ "font-size:0.75rem;font-weight:700;letter-spacing:0.02em;color:#fff}"
71
+ ".badge.pass{background:var(--pass)}"
72
+ ".badge.fail{background:var(--fail)}"
73
+ ".badge.warn{background:var(--warn)}"
74
+ "@media print{body{background:#fff;padding:0}.invarlock-md{box-shadow:none;"
75
+ "border:0}a{color:inherit;text-decoration:none}.badge{color:#000;"
76
+ "border:1px solid #000;background:transparent}}"
77
+ "</style>"
28
78
  "</head><body>" + body + "</body></html>"
29
79
  )
30
80
 
@@ -55,6 +55,8 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
55
55
  }
56
56
  # Preserve additional provenance knobs used by certificate/digests.
57
57
  for key in (
58
+ "pm_acceptance_range",
59
+ "pm_drift_band",
58
60
  "policy_overrides",
59
61
  "overrides",
60
62
  "plugins",
@@ -179,6 +181,11 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
179
181
  "latency_ms_p50",
180
182
  "latency_ms_p95",
181
183
  "memory_mb_peak",
184
+ "gpu_memory_mb_peak",
185
+ "gpu_memory_reserved_mb_peak",
186
+ "timings",
187
+ "guard_timings",
188
+ "memory_snapshots",
182
189
  "throughput_sps",
183
190
  "spectral",
184
191
  "rmt",