invarlock 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +3 -3
- invarlock/adapters/auto.py +2 -10
- invarlock/adapters/hf_loading.py +7 -7
- invarlock/adapters/hf_mixin.py +28 -5
- invarlock/assurance/__init__.py +15 -23
- invarlock/calibration/spectral_null.py +1 -1
- invarlock/cli/adapter_auto.py +1 -5
- invarlock/cli/app.py +57 -27
- invarlock/cli/commands/__init__.py +2 -2
- invarlock/cli/commands/calibrate.py +48 -4
- invarlock/cli/commands/{certify.py → evaluate.py} +69 -46
- invarlock/cli/commands/explain_gates.py +94 -51
- invarlock/cli/commands/export_html.py +11 -9
- invarlock/cli/commands/report.py +121 -47
- invarlock/cli/commands/run.py +274 -66
- invarlock/cli/commands/verify.py +84 -89
- invarlock/cli/determinism.py +1 -1
- invarlock/cli/provenance.py +3 -3
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/retry.py +14 -14
- invarlock/core/runner.py +1 -1
- invarlock/edits/noop.py +2 -2
- invarlock/edits/quant_rtn.py +2 -2
- invarlock/eval/__init__.py +1 -1
- invarlock/eval/bench.py +11 -7
- invarlock/eval/primary_metric.py +1 -1
- invarlock/guards/spectral.py +2 -2
- invarlock/guards_ref/spectral_ref.py +1 -1
- invarlock/model_profile.py +16 -35
- invarlock/observability/health.py +38 -20
- invarlock/plugins/hf_bnb_adapter.py +32 -21
- invarlock/reporting/__init__.py +18 -4
- invarlock/reporting/html.py +7 -7
- invarlock/reporting/normalizer.py +2 -2
- invarlock/reporting/policy_utils.py +1 -1
- invarlock/reporting/primary_metric_utils.py +11 -11
- invarlock/reporting/render.py +126 -120
- invarlock/reporting/report.py +43 -37
- invarlock/reporting/{certificate.py → report_builder.py} +103 -99
- invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
- invarlock-0.3.9.dist-info/METADATA +303 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/RECORD +46 -46
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/WHEEL +1 -1
- invarlock-0.3.7.dist-info/METADATA +0 -602
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/top_level.txt +0 -0
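The most visible change in 0.3.9 is the rename of the "certificate" concept to "evaluation report" across the reporting layer (`certificate.py` → `report_builder.py`, `certificate_schema.py` → `report_schema.py`) and the CLI (`certify` → `evaluate`). For callers that render the Markdown output directly, the symbol mapping in `render.py` looks roughly like the sketch below; the import paths are inferred from the wheel layout and the renamed symbols shown in the diff, and whether they are part of the supported public API is not established here.

```python
# Sketch only: names taken from the render.py diff below; import paths are
# assumed from the package layout and are not guaranteed to be public API.

# invarlock 0.3.7 (old names):
#   from invarlock.reporting.render import render_certificate_markdown
#   markdown = render_certificate_markdown(certificate)

# invarlock 0.3.9 (renamed symbols):
from invarlock.reporting.render import render_report_markdown
from invarlock.reporting.report_schema import validate_report


def render_if_valid(evaluation_report: dict) -> str:
    # render_report_markdown() raises ValueError on an invalid report;
    # validating first lets callers handle the failure themselves.
    if not validate_report(evaluation_report):
        raise ValueError("Invalid evaluation report structure")
    return render_report_markdown(evaluation_report)
```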
invarlock/reporting/render.py
CHANGED
@@ -9,8 +9,7 @@ from typing import Any
 
 import yaml
 
-
-from . import certificate as C
+from .report_schema import validate_report
 
 # Console Validation Block helpers (allow-list driven)
 _CONSOLE_LABELS_DEFAULT = [
@@ -37,8 +36,10 @@ def _load_console_labels() -> list[str]:
     return list(_CONSOLE_LABELS_DEFAULT)
 
 
-def compute_console_validation_block(
-
+def compute_console_validation_block(
+    evaluation_report: dict[str, Any],
+) -> dict[str, Any]:
+    """Produce a normalized console validation block from an evaluation report.
 
     Returns a dict with keys:
     - labels: the canonical label list
@@ -47,8 +48,8 @@ def compute_console_validation_block(certificate: dict[str, Any]) -> dict[str, Any]:
       counted only when evaluated.
     """
     labels = _load_console_labels()
-    validation =
-    guard_ctx =
+    validation = evaluation_report.get("validation", {}) or {}
+    guard_ctx = evaluation_report.get("guard_overhead", {}) or {}
     guard_evaluated = (
         bool(guard_ctx.get("evaluated")) if isinstance(guard_ctx, dict) else False
     )
@@ -121,18 +122,18 @@ def _render_executive_dashboard(cert: dict[str, Any]) -> str:
 
 
 def _append_safety_dashboard_section(
-    lines: list[str],
+    lines: list[str], evaluation_report: dict[str, Any]
 ) -> None:
-    """Append a concise, first-screen dashboard for the
-    block = compute_console_validation_block(
+    """Append a concise, first-screen dashboard for the evaluation report."""
+    block = compute_console_validation_block(evaluation_report)
     overall_pass = bool(block.get("overall_pass"))
     overall_status = (
         f"{'✅' if overall_pass else '❌'} {'PASS' if overall_pass else 'FAIL'}"
     )
 
-    validation =
-    pm =
-    auto =
+    validation = evaluation_report.get("validation", {}) or {}
+    pm = evaluation_report.get("primary_metric", {}) or {}
+    auto = evaluation_report.get("auto", {}) or {}
     tier = str(auto.get("tier") or "balanced").lower()
 
     # Primary metric summary
@@ -172,7 +173,7 @@ def _append_safety_dashboard_section(
     pm_status = (
         f"{'✅' if pm_ok else '❌'} {measured}"
         if isinstance(pm_ok, bool)
-        else f"
+        else f"ℹ️ {measured}"
     )
 
     # Drift summary (final/preview ratio) when preview/final are numeric
@@ -205,7 +206,7 @@ def _append_safety_dashboard_section(
     drift_status = (
         f"{'✅' if drift_ok else '❌'} {drift_val}"
         if isinstance(drift_ok, bool)
-        else f"
+        else f"ℹ️ {drift_val}"
     )
 
     def _gate_cell(key: str, ok_default: bool | None = None) -> str:
@@ -217,10 +218,10 @@ def _append_safety_dashboard_section(
         else:
             ok = bool(validation.get(key))
         if ok is None:
-            return "
+            return "ℹ️ N/A"
         return "✅ PASS" if ok else "❌ FAIL"
 
-    overhead_ctx =
+    overhead_ctx = evaluation_report.get("guard_overhead", {}) or {}
     overhead_evaluated = (
         bool(overhead_ctx.get("evaluated")) if isinstance(overhead_ctx, dict) else False
     )
@@ -247,11 +248,11 @@ def _append_safety_dashboard_section(
         "Overhead",
         f"{'✅' if bool(validation.get('guard_overhead_acceptable', True)) else '❌'} {overhead_measured}"
         if isinstance(validation, dict)
-        else f"
+        else f"ℹ️ {overhead_measured}",
         threshold_str,
     )
 
-    lines.append("##
+    lines.append("## Evaluation Dashboard")
     lines.append("")
     lines.append("| Check | Status | Quick Summary |")
     lines.append("|-------|--------|---------------|")
@@ -271,10 +272,10 @@ def _append_safety_dashboard_section(
 
 
 def _append_primary_metric_section(
-    lines: list[str],
+    lines: list[str], evaluation_report: dict[str, Any]
 ) -> None:
     """Append the Primary Metric section early for quick triage."""
-    pm =
+    pm = evaluation_report.get("primary_metric")
     if not isinstance(pm, dict) or not pm:
         return
 
@@ -342,7 +343,7 @@ def _append_primary_metric_section(
 
     # Secondary metrics (informational)
     try:
-        secs =
+        secs = evaluation_report.get("secondary_metrics")
         if isinstance(secs, list) and secs:
             lines.append("## Secondary Metrics (informational)")
             lines.append("")
@@ -375,10 +376,10 @@ def _append_primary_metric_section(
 
 
 def _append_policy_configuration_section(
-    lines: list[str],
+    lines: list[str], evaluation_report: dict[str, Any]
 ) -> None:
-    resolved_policy =
-    policy_provenance =
+    resolved_policy = evaluation_report.get("resolved_policy")
+    policy_provenance = evaluation_report.get("policy_provenance", {}) or {}
     has_prov = isinstance(policy_provenance, dict) and bool(policy_provenance)
     has_resolved = isinstance(resolved_policy, dict) and bool(resolved_policy)
     if not (has_prov or has_resolved):
@@ -391,12 +392,12 @@ def _append_policy_configuration_section(
     if has_prov:
         tier = policy_provenance.get("tier")
     if not tier:
-        tier = (
+        tier = (evaluation_report.get("auto", {}) or {}).get("tier")
     digest_value = None
     if has_prov:
         digest_value = policy_provenance.get("policy_digest")
     if not digest_value:
-        digest_value = (
+        digest_value = (evaluation_report.get("policy_digest", {}) or {}).get(
             "thresholds_hash"
         )
 
@@ -436,10 +437,10 @@ def _append_policy_configuration_section(
 
 
 def _append_dataset_and_provenance_section(
-    lines: list[str],
+    lines: list[str], evaluation_report: dict[str, Any]
 ) -> None:
-    dataset =
-    provenance_info =
+    dataset = evaluation_report.get("dataset", {}) or {}
+    provenance_info = evaluation_report.get("provenance", {}) or {}
 
     has_dataset = isinstance(dataset, dict) and bool(dataset)
     has_provenance = isinstance(provenance_info, dict) and bool(provenance_info)
@@ -545,14 +546,14 @@ def _append_dataset_and_provenance_section(
     )
 
     try:
-        conf =
+        conf = evaluation_report.get("confidence", {}) or {}
         if isinstance(conf, dict) and conf.get("label"):
             lines.append(f"- **Confidence:** {conf.get('label')}")
     except Exception:
         pass
 
     try:
-        pd =
+        pd = evaluation_report.get("policy_digest", {}) or {}
         if isinstance(pd, dict) and pd:
             pv = pd.get("policy_version")
             th = pd.get("thresholds_hash")
@@ -671,13 +672,13 @@ def _append_accuracy_subgroups(lines: list[str], subgroups: dict[str, Any]) -> None:
     lines.append("")
 
 
-def
-    """Compute integrity hash for the
+def _compute_report_hash(evaluation_report: dict[str, Any]) -> str:
+    """Compute integrity hash for the evaluation_report.
 
     Hash ignores the `artifacts` section for stability across saves.
     """
     # Create a copy without the artifacts section for stable hashing
-    cert_copy = dict(
+    cert_copy = dict(evaluation_report or {})
     cert_copy.pop("artifacts", None)
 
     # Sort keys for deterministic hashing
@@ -687,8 +688,8 @@ def _compute_certificate_hash(certificate: dict[str, Any]) -> str:
     return _hash.sha256(cert_str.encode()).hexdigest()[:16]
 
 
-def build_console_summary_pack(
-    """Build a small, reusable console summary pack from a
+def build_console_summary_pack(evaluation_report: dict[str, Any]) -> dict[str, Any]:
+    """Build a small, reusable console summary pack from a evaluation_report.
 
     Returns a dict with:
     - overall_pass: bool
@@ -696,7 +697,7 @@ def build_console_summary_pack(certificate: dict[str, Any]) -> dict[str, Any]:
     - gate_lines: list of "<Label>: <Status>" strings for each evaluated gate
     - labels: the canonical label list used
     """
-    block = compute_console_validation_block(
+    block = compute_console_validation_block(evaluation_report)
     overall_pass = bool(block.get("overall_pass"))
     emoji = "✅" if overall_pass else "❌"
     overall_line = f"Overall Status: {emoji} {'PASS' if overall_pass else 'FAIL'}"
@@ -717,43 +718,38 @@ def build_console_summary_pack(certificate: dict[str, Any]) -> dict[str, Any]:
     }
 
 
-def
+def render_report_markdown(evaluation_report: dict[str, Any]) -> str:
     """
-    Render
+    Render an evaluation report as a formatted Markdown report with pretty tables.
 
-    This implementation is moved from
-    To avoid circular import issues, we alias helpers from the certificate
-    module inside the function body.
+    This implementation is moved from report_builder.py to keep that module lean.
     """
-
-
-
-    if not validate_certificate(certificate):
-        raise ValueError("Invalid certificate structure")
+    if not validate_report(evaluation_report):
+        raise ValueError("Invalid evaluation report structure")
 
     lines: list[str] = []
     appendix_lines: list[str] = []
-    edit_name = str(
+    edit_name = str(evaluation_report.get("edit_name") or "").lower()
 
     # Header
-    lines.append("# InvarLock Evaluation
+    lines.append("# InvarLock Evaluation Report")
     lines.append("")
     lines.append(
        "> *Basis: “point” gates check the point estimate; “upper” gates check the CI "
        "upper bound; “point & upper” requires both to pass.*"
     )
     lines.append("")
-    lines.append(f"**Schema Version:** {
-    lines.append(f"**Run ID:** `{
-    lines.append(f"**Generated:** {
-    lines.append(f"**Edit Type:** {
+    lines.append(f"**Schema Version:** {evaluation_report['schema_version']}")
+    lines.append(f"**Run ID:** `{evaluation_report['run_id']}`")
+    lines.append(f"**Generated:** {evaluation_report['artifacts']['generated_at']}")
+    lines.append(f"**Edit Type:** {evaluation_report.get('edit_name', 'Unknown')}")
     lines.append("")
     lines.append(
-        "> Full evidence: see [`evaluation.
+        "> Full evidence: see [`evaluation.report.json`](evaluation.report.json) for complete provenance, digests, and raw measurements."
     )
     lines.append("")
 
-    plugins =
+    plugins = evaluation_report.get("plugins", {})
     if isinstance(plugins, dict) and plugins:
         lines.append("## Plugin Provenance")
         lines.append("")
@@ -780,7 +776,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     # Executive Summary with validation status (canonical, from console block)
     lines.append("## Executive Summary")
     lines.append("")
-    _block = compute_console_validation_block(
+    _block = compute_console_validation_block(evaluation_report)
     overall_pass = bool(_block.get("overall_pass"))
     status_emoji = "✅" if overall_pass else "❌"
     lines.append(
@@ -789,13 +785,13 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     # Window Plan one-liner for quick audit
     try:
         plan_ctx = (
-
-            or
-            or
+            evaluation_report.get("window_plan")
+            or evaluation_report.get("dataset", {}).get("windows", {})
+            or evaluation_report.get("ppl", {}).get("window_plan")
         )
-        seq_len =
-            "
-        ).get("sequence_length")
+        seq_len = evaluation_report.get("dataset", {}).get(
+            "seq_len"
+        ) or evaluation_report.get("dataset", {}).get("sequence_length")
         if isinstance(plan_ctx, dict):
             profile = plan_ctx.get("profile")
             preview_n = (
@@ -815,23 +811,23 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         pass
     lines.append("")
 
-    dashboard = _render_executive_dashboard(
+    dashboard = _render_executive_dashboard(evaluation_report)
     if dashboard:
         lines.extend(dashboard.splitlines())
         lines.append("")
 
     lines.append("## Contents")
     lines.append("")
-    lines.append("- [
+    lines.append("- [Evaluation Dashboard](#evaluation-dashboard)")
     lines.append("- [Quality Gates](#quality-gates)")
-    lines.append("- [
+    lines.append("- [Guard Check Details](#guard-check-details)")
     lines.append("- [Primary Metric](#primary-metric)")
     lines.append("- [Guard Observability](#guard-observability)")
     lines.append("- [Model Information](#model-information)")
     lines.append("- [Dataset and Provenance](#dataset-and-provenance)")
     lines.append("- [Policy Configuration](#policy-configuration)")
     lines.append("- [Appendix](#appendix)")
-    lines.append("- [
+    lines.append("- [Evaluation Report Integrity](#evaluation-report-integrity)")
     lines.append("")
 
     # Validation table with canonical gates (mirrors console allow-list)
@@ -840,9 +836,9 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     lines.append("| Gate | Status | Measured | Threshold | Basis | Description |")
     lines.append("|------|--------|----------|-----------|-------|-------------|")
 
-    pm_block =
+    pm_block = evaluation_report.get("primary_metric", {}) or {}
     has_pm = isinstance(pm_block, dict) and bool(pm_block)
-    auto_info =
+    auto_info = evaluation_report.get("auto", {})
     tier = (auto_info.get("tier") or "balanced").lower()
 
     # Helper to emit Primary Metric Acceptable row
@@ -851,7 +847,9 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         value = pm_block.get("ratio_vs_baseline")
         gating_basis = pm_block.get("gating_basis") or "point"
         ok = bool(
-
+            evaluation_report.get("validation", {}).get(
+                "primary_metric_acceptable", True
+            )
         )
         status = "✅ PASS" if ok else "❌ FAIL"
         if pm_kind in {"accuracy", "vqa_accuracy"}:
@@ -885,7 +883,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     # Helper to emit Preview Final Drift Acceptable row
     def _emit_drift_gate_row() -> None:
         ok = bool(
-
+            evaluation_report.get("validation", {}).get(
                 "preview_final_drift_acceptable", True
             )
         )
@@ -942,12 +940,14 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
 
     # Helper to emit Guard Overhead Acceptable row (only when evaluated)
     def _emit_overhead_gate_row() -> None:
-        guard_overhead =
+        guard_overhead = evaluation_report.get("guard_overhead", {}) or {}
         evaluated = bool(guard_overhead.get("evaluated"))
         if not evaluated:
            return
        ok = bool(
-
+            evaluation_report.get("validation", {}).get(
+                "guard_overhead_acceptable", True
+            )
        )
        status = "✅ PASS" if ok else "❌ FAIL"
        overhead_pct = guard_overhead.get("overhead_percent")
@@ -975,7 +975,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     )
 
     def _emit_pm_tail_gate_row() -> None:
-        pm_tail =
+        pm_tail = evaluation_report.get("primary_metric_tail", {}) or {}
         if not isinstance(pm_tail, dict) or not pm_tail:
             return
 
@@ -985,7 +985,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         warned = bool(pm_tail.get("warned", False))
 
         if not evaluated:
-            status = "
+            status = "ℹ️ INFO"
         elif passed:
             status = "✅ PASS"
         elif mode == "fail":
@@ -1042,17 +1042,17 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     _emit_overhead_gate_row()
 
     # Annotate hysteresis usage if applied
-    if
+    if evaluation_report.get("validation", {}).get("hysteresis_applied"):
         lines.append("- Note: hysteresis applied to gate boundary")
 
     lines.append("")
-    lines.append("##
+    lines.append("## Guard Check Details")
     lines.append("")
-    lines.append("|
+    lines.append("| Guard Check | Status | Measured | Threshold | Description |")
     lines.append("|--------------|--------|----------|-----------|-------------|")
 
-    inv_summary =
-    validation =
+    inv_summary = evaluation_report["invariants"]
+    validation = evaluation_report.get("validation", {})
     inv_status = "✅ PASS" if validation.get("invariants_pass", False) else "❌ FAIL"
     inv_counts = inv_summary.get("summary", {}) or {}
     inv_measure = inv_summary.get("status", "pass").upper()
@@ -1084,23 +1084,23 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         lines.append(f"- Non-fatal: {non_fatal_message}")
 
     spec_status = "✅ PASS" if validation.get("spectral_stable", False) else "❌ FAIL"
-    caps_applied =
+    caps_applied = evaluation_report["spectral"]["caps_applied"]
     lines.append(
         f"| Spectral Stability | {spec_status} | {caps_applied} violations | < 5 | Weight matrix spectral norms |"
     )
 
     # Catastrophic spike safety stop row is now driven by primary metric flags
-    if isinstance(
+    if isinstance(evaluation_report.get("primary_metric"), dict):
         pm_ok = bool(validation.get("primary_metric_acceptable", True))
-        pm_ratio =
+        pm_ratio = evaluation_report.get("primary_metric", {}).get("ratio_vs_baseline")
         if isinstance(pm_ratio, int | float):
             lines.append(
-                f"| Catastrophic Spike Gate (
+                f"| Catastrophic Spike Gate (hard stop) | {'✅ PASS' if pm_ok else '❌ FAIL'} | {pm_ratio:.3f}x | ≤ 2.0x | Hard stop @ 2.0× |"
             )
 
     # Include RMT Health row for compatibility and clarity
     rmt_status = "✅ PASS" if validation.get("rmt_stable", False) else "❌ FAIL"
-    rmt_state =
+    rmt_state = evaluation_report.get("rmt", {}).get("status", "unknown").title()
     lines.append(
         f"| RMT Health | {rmt_status} | {rmt_state} | ε-rule | Random Matrix Theory guard status |"
     )
@@ -1108,8 +1108,8 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     # Pairing + Bootstrap snapshot (quick audit surface)
     try:
         stats = (
-
-            or
+            evaluation_report.get("dataset", {}).get("windows", {}).get("stats", {})
+            or evaluation_report.get("ppl", {}).get("stats", {})
             or {}
         )
         paired_windows = stats.get("paired_windows")
@@ -1138,7 +1138,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
                 parts.append(f"{float(overlap_frac) * 100.0:.1f}% overlap")
             elif overlap_frac is not None:
                 parts.append(f"overlap={overlap_frac}")
-            lines.append(f"✅ Pairing: {', '.join(parts) if parts else 'N/A'}")
+            lines.append(f"- ✅ Pairing: {', '.join(parts) if parts else 'N/A'}")
         if isinstance(bootstrap, dict):
             reps = bootstrap.get("replicates")
             bseed = bootstrap.get("seed")
@@ -1154,17 +1154,19 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
                     bits.append(f"seed={int(bseed)}")
                 except Exception:
                     bits.append(f"seed={bseed}")
-            lines.append(f"✅ Bootstrap: {', '.join(bits) if bits else 'N/A'}")
+            lines.append(f"- ✅ Bootstrap: {', '.join(bits) if bits else 'N/A'}")
         # Optional: show log-space paired Δ CI next to ratio CI for clarity
-        delta_ci =
-            "
-        ).get("logloss_delta_ci")
+        delta_ci = evaluation_report.get("primary_metric", {}).get(
+            "ci"
+        ) or evaluation_report.get("ppl", {}).get("logloss_delta_ci")
         if (
             isinstance(delta_ci, tuple | list)
             and len(delta_ci) == 2
             and all(isinstance(x, int | float) for x in delta_ci)
         ):
-            lines.append(
+            lines.append(
+                f"- ℹ️ Log Δ (paired) CI: [{delta_ci[0]:.6f}, {delta_ci[1]:.6f}]"
+            )
     except Exception:
         pass
 
@@ -1185,13 +1187,13 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
 
     lines.append("")
 
-    _append_primary_metric_section(lines,
+    _append_primary_metric_section(lines, evaluation_report)
 
     # Guard observability snapshots
     lines.append("## Guard Observability")
     lines.append("")
 
-    spectral_info =
+    spectral_info = evaluation_report.get("spectral", {}) or {}
     if spectral_info:
         lines.append("### Spectral Guard Summary")
         lines.append("")
@@ -1260,7 +1262,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
             if max_module:
                 max_val += f" – {max_module}"
             if kappa_f is None:
-                max_status = "
+                max_status = "ℹ️ No κ"
             elif max_abs_z <= kappa_f:
                 max_status = f"✅ Within κ={kappa_f:.3f}"
             else:
@@ -1280,7 +1282,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
             if isinstance(mt_m, int | float) and math.isfinite(float(mt_m)):
                 parts.append(f"m={int(mt_m)}")
             lines.append(
-                f"| Multiple Testing | {', '.join(parts) if parts else '—'} |
+                f"| Multiple Testing | {', '.join(parts) if parts else '—'} | ℹ️ INFO |"
             )
 
     lines.append("")
@@ -1360,7 +1362,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         lines.append("</details>")
         lines.append("")
 
-    rmt_info =
+    rmt_info = evaluation_report.get("rmt", {}) or {}
     if rmt_info:
         lines.append("### RMT Guard")
         lines.append("")
@@ -1411,7 +1413,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     else:
         lines.append("")
 
-    guard_overhead_info =
+    guard_overhead_info = evaluation_report.get("guard_overhead", {}) or {}
     if guard_overhead_info:
         lines.append("### Guard Overhead")
         lines.append("")
@@ -1439,7 +1441,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         overhead_source = guard_overhead_info.get("source")
         if overhead_source:
             lines.append(f"- Source: {overhead_source}")
-        plan_ctx =
+        plan_ctx = evaluation_report.get("provenance", {}).get("window_plan", {})
         if isinstance(plan_ctx, dict) and plan_ctx:
             plan_preview = (
                 plan_ctx.get("preview_n")
@@ -1458,8 +1460,8 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         lines.append("")
 
     compression_diag = (
-
-        if isinstance(
+        evaluation_report.get("structure", {}).get("compression_diagnostics", {})
+        if isinstance(evaluation_report.get("structure"), dict)
         else {}
     )
     inference_flags = compression_diag.get("inferred") or {}
@@ -1485,7 +1487,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     # Model and Configuration
     lines.append("## Model Information")
     lines.append("")
-    meta =
+    meta = evaluation_report["meta"]
     lines.append(f"- **Model ID:** {meta.get('model_id')}")
     lines.append(f"- **Adapter:** {meta.get('adapter')}")
     lines.append(f"- **Device:** {meta.get('device')}")
@@ -1556,7 +1558,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     # Edit Configuration (removed duplicate Edit Information section)
 
     # Auto-tuning Configuration
-    auto =
+    auto = evaluation_report["auto"]
     if auto["tier"] != "none":
         lines.append("## Auto-Tuning Configuration")
         lines.append("")
@@ -1574,18 +1576,18 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
             pass
         lines.append("")
 
-    _append_dataset_and_provenance_section(lines,
+    _append_dataset_and_provenance_section(lines, evaluation_report)
 
     # Structural Changes heading is printed with content later; avoid empty header here
 
     # System Overhead section (latency/throughput)
-    sys_over =
+    sys_over = evaluation_report.get("system_overhead", {}) or {}
     if isinstance(sys_over, dict) and sys_over:
         _append_system_overhead_section(lines, sys_over)
 
     # Accuracy Subgroups (informational)
     try:
-        cls =
+        cls = evaluation_report.get("classification", {})
         sub = cls.get("subgroups") if isinstance(cls, dict) else None
         if isinstance(sub, dict) and sub:
             _append_accuracy_subgroups(lines, sub)
@@ -1593,7 +1595,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         pass
     # Structural Changes
     try:
-        structure =
+        structure = evaluation_report.get("structure", {}) or {}
         params_changed = int(structure.get("params_changed", 0) or 0)
         layers_modified = int(structure.get("layers_modified", 0) or 0)
         bitwidth_changes = 0
@@ -1605,7 +1607,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         has_changes = any(
             v > 0 for v in (params_changed, layers_modified, bitwidth_changes)
         )
-        edit_name = str(
+        edit_name = str(evaluation_report.get("edit_name", "unknown"))
         if has_changes:
             lines.append("## Structural Changes")
             lines.append("")
@@ -1735,7 +1737,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     lines.append("")
 
     # Variance Guard (Spectral/RMT summaries are already provided above)
-    variance =
+    variance = evaluation_report["variance"]
     appendix_lines.append("### Variance Guard")
     appendix_lines.append("")
 
@@ -1766,7 +1768,7 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         )
     # Add concise rationale aligned with Balanced predictive gate contract
     try:
-        ve_policy =
+        ve_policy = evaluation_report.get("policies", {}).get("variance", {})
         min_effect = ve_policy.get("min_effect_lognll")
         if isinstance(min_effect, int | float):
             appendix_lines.append(
@@ -1799,7 +1801,11 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
     lines.append("")
 
     # MoE Observability (non-gating)
-    moe =
+    moe = (
+        evaluation_report.get("moe", {})
+        if isinstance(evaluation_report.get("moe"), dict)
+        else {}
+    )
     if moe:
         lines.append("## MoE Observability")
         lines.append("")
@@ -1828,16 +1834,16 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
             lines.append(f"- **{label}:** {float(moe[key]):+.4f}")
         lines.append("")
 
-    _append_policy_configuration_section(lines,
+    _append_policy_configuration_section(lines, evaluation_report)
 
     appendix_lines.append("### Artifacts")
     appendix_lines.append("")
-    artifacts =
+    artifacts = evaluation_report["artifacts"]
     if artifacts.get("events_path"):
         appendix_lines.append(f"- **Events Log:** `{artifacts['events_path']}`")
     if artifacts.get("report_path"):
         appendix_lines.append(f"- **Full Report:** `{artifacts['report_path']}`")
-    appendix_lines.append(f"- **
+    appendix_lines.append(f"- **Report Generated:** {artifacts['generated_at']}")
     appendix_lines.append("")
 
     if appendix_lines:
@@ -1845,19 +1851,19 @@ def render_certificate_markdown(certificate: dict[str, Any]) -> str:
         lines.append("")
         lines.extend(appendix_lines)
 
-    #
-    cert_hash =
-    lines.append("##
+    # Report Hash for Integrity
+    cert_hash = _compute_report_hash(evaluation_report)
+    lines.append("## Evaluation Report Integrity")
     lines.append("")
-    lines.append(f"**
+    lines.append(f"**Report Hash:** `{cert_hash}`")
     lines.append("")
     lines.append("---")
     lines.append("")
     lines.append(
-        "*This InvarLock evaluation
+        "*This InvarLock Evaluation Report summarizes baseline‑paired evaluation results for a subject model relative to the provided baseline snapshot under the configured profile/preset.*"
     )
     lines.append(
-        "*
+        "*It reports regression-risk indicators for the measured signals; it is not a broad AI safety, alignment, or content-safety guarantee.*"
     )
 
     return "\n".join(lines)
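The renamed `_compute_report_hash` keeps the integrity-hash recipe the old certificate hash used: drop the volatile `artifacts` section, serialize with sorted keys, and keep the first 16 hex characters of the SHA-256 digest. Below is a self-contained illustration of that pattern; the exact serialization invarlock performs is not visible in this hunk, so the `json.dumps` step is an assumption.

```python
import hashlib
import json
from typing import Any


def stable_report_hash(evaluation_report: dict[str, Any]) -> str:
    """Illustrative re-creation of the hash recipe shown above.

    Assumption: a sorted-keys JSON dump stands in for whatever
    serialization invarlock actually performs before hashing.
    """
    report_copy = dict(evaluation_report or {})
    # `artifacts` is dropped so re-saving artifacts does not change the hash
    report_copy.pop("artifacts", None)
    payload = json.dumps(report_copy, sort_keys=True, default=str)
    return hashlib.sha256(payload.encode()).hexdigest()[:16]


if __name__ == "__main__":
    report = {"run_id": "demo", "artifacts": {"generated_at": "2024-01-01T00:00:00Z"}}
    print(stable_report_hash(report))  # same digest with or without the artifacts block
```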