invarlock 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +2 -2
- invarlock/_data/runtime/tiers.yaml +57 -30
- invarlock/adapters/__init__.py +11 -15
- invarlock/adapters/auto.py +35 -40
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_mixin.py +25 -4
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/calibration/spectral_null.py +15 -10
- invarlock/calibration/variance_ve.py +0 -2
- invarlock/cli/adapter_auto.py +31 -21
- invarlock/cli/app.py +73 -2
- invarlock/cli/commands/calibrate.py +6 -2
- invarlock/cli/commands/certify.py +651 -91
- invarlock/cli/commands/doctor.py +11 -11
- invarlock/cli/commands/explain_gates.py +57 -8
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +233 -69
- invarlock/cli/commands/run.py +1066 -244
- invarlock/cli/commands/verify.py +154 -15
- invarlock/cli/config.py +22 -6
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +1 -1
- invarlock/core/api.py +45 -5
- invarlock/core/auto_tuning.py +65 -20
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/contracts.py +7 -1
- invarlock/core/registry.py +11 -13
- invarlock/core/runner.py +425 -75
- invarlock/edits/quant_rtn.py +65 -37
- invarlock/eval/bench.py +3 -16
- invarlock/eval/data.py +82 -51
- invarlock/eval/metrics.py +63 -2
- invarlock/eval/primary_metric.py +23 -0
- invarlock/eval/tail_stats.py +230 -0
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/_estimators.py +154 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/policies.py +16 -6
- invarlock/guards/rmt.py +627 -546
- invarlock/guards/spectral.py +348 -110
- invarlock/guards/tier_config.py +32 -30
- invarlock/guards/variance.py +7 -31
- invarlock/guards_ref/rmt_ref.py +23 -23
- invarlock/model_profile.py +90 -42
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/reporting/certificate.py +384 -55
- invarlock/reporting/certificate_schema.py +3 -2
- invarlock/reporting/dataset_hashing.py +15 -2
- invarlock/reporting/guards_analysis.py +350 -277
- invarlock/reporting/html.py +55 -5
- invarlock/reporting/normalizer.py +13 -0
- invarlock/reporting/policy_utils.py +38 -36
- invarlock/reporting/primary_metric_utils.py +71 -17
- invarlock/reporting/render.py +852 -431
- invarlock/reporting/report.py +40 -4
- invarlock/reporting/report_types.py +11 -3
- invarlock/reporting/telemetry.py +86 -0
- invarlock/reporting/validate.py +1 -18
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/METADATA +27 -13
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/RECORD +72 -65
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
invarlock/cli/commands/doctor.py
CHANGED
|
@@ -188,7 +188,9 @@ def doctor_command(
|
|
|
188
188
|
None, "--config", "-c", help="Path to YAML config for preflight lints"
|
|
189
189
|
),
|
|
190
190
|
profile: str | None = typer.Option(
|
|
191
|
-
None,
|
|
191
|
+
None,
|
|
192
|
+
"--profile",
|
|
193
|
+
help="Profile to apply for preflight (e.g. ci, release, ci_cpu; dev is a no-op)",
|
|
192
194
|
),
|
|
193
195
|
baseline: str | None = typer.Option(
|
|
194
196
|
None, "--baseline", help="Optional baseline report to check pairing readiness"
|
|
@@ -1040,8 +1042,7 @@ def doctor_command(
|
|
|
1040
1042
|
module = str(info.get("module") or "")
|
|
1041
1043
|
support = (
|
|
1042
1044
|
"auto"
|
|
1043
|
-
if module.startswith("invarlock.adapters")
|
|
1044
|
-
and n in {"hf_causal_auto", "hf_mlm_auto"}
|
|
1045
|
+
if module.startswith("invarlock.adapters") and n in {"hf_auto"}
|
|
1045
1046
|
else (
|
|
1046
1047
|
"core"
|
|
1047
1048
|
if module.startswith("invarlock.adapters")
|
|
@@ -1056,11 +1057,10 @@ def doctor_command(
|
|
|
1056
1057
|
|
|
1057
1058
|
# Heuristic backend mapping without heavy imports
|
|
1058
1059
|
if n in {
|
|
1059
|
-
"
|
|
1060
|
-
"
|
|
1061
|
-
"
|
|
1062
|
-
"
|
|
1063
|
-
"hf_mlm_auto",
|
|
1060
|
+
"hf_causal",
|
|
1061
|
+
"hf_mlm",
|
|
1062
|
+
"hf_seq2seq",
|
|
1063
|
+
"hf_auto",
|
|
1064
1064
|
}:
|
|
1065
1065
|
# Transformers-based
|
|
1066
1066
|
backend = "transformers"
|
|
@@ -1095,8 +1095,8 @@ def doctor_command(
|
|
|
1095
1095
|
}.get(n)
|
|
1096
1096
|
if hint:
|
|
1097
1097
|
enable = f"pip install '{hint}'"
|
|
1098
|
-
# Special-case:
|
|
1099
|
-
if n == "
|
|
1098
|
+
# Special-case: ONNX causal adapter is core but requires Optimum/ONNXRuntime
|
|
1099
|
+
if n == "hf_causal_onnx":
|
|
1100
1100
|
backend = backend or "onnxruntime"
|
|
1101
1101
|
present = (
|
|
1102
1102
|
importlib.util.find_spec("optimum.onnxruntime") is not None
|
|
@@ -1320,7 +1320,7 @@ def doctor_command(
|
|
|
1320
1320
|
if "optimum" in str(e).lower():
|
|
1321
1321
|
if not json_out:
|
|
1322
1322
|
console.print(
|
|
1323
|
-
" [yellow]⚠️ Optional Optimum/ONNXRuntime missing;
|
|
1323
|
+
" [yellow]⚠️ Optional Optimum/ONNXRuntime missing; hf_causal_onnx will be shown as needs_extra[/yellow]"
|
|
1324
1324
|
)
|
|
1325
1325
|
# Do not mark overall health as failed for optional extras
|
|
1326
1326
|
else:
|
|
@@ -99,10 +99,6 @@ def explain_gates_command(
|
|
|
99
99
|
pm = cert.get("primary_metric", {})
|
|
100
100
|
ratio = pm.get("ratio_vs_baseline")
|
|
101
101
|
ratio_ci = pm.get("display_ci")
|
|
102
|
-
elif isinstance(cert.get("ppl"), dict): # legacy
|
|
103
|
-
ppl = cert.get("ppl", {})
|
|
104
|
-
ratio = ppl.get("ratio_vs_baseline")
|
|
105
|
-
ratio_ci = ppl.get("ratio_ci")
|
|
106
102
|
hysteresis_applied = bool(validation.get("hysteresis_applied"))
|
|
107
103
|
status = "PASS" if bool(validation.get("primary_metric_acceptable")) else "FAIL"
|
|
108
104
|
console.print("[bold]Gate: Primary Metric vs Baseline[/bold]")
|
|
@@ -125,6 +121,63 @@ def explain_gates_command(
|
|
|
125
121
|
f" note: hysteresis applied → effective threshold = {limit_with_hyst:.3f}x"
|
|
126
122
|
)
|
|
127
123
|
|
|
124
|
+
# Tail gate explanation (warn/fail; based on per-window Δlog-loss vs baseline)
|
|
125
|
+
pm_tail = (
|
|
126
|
+
cert.get("primary_metric_tail", {})
|
|
127
|
+
if isinstance(cert.get("primary_metric_tail"), dict)
|
|
128
|
+
else {}
|
|
129
|
+
)
|
|
130
|
+
if pm_tail:
|
|
131
|
+
mode = str(pm_tail.get("mode", "warn") or "warn").strip().lower()
|
|
132
|
+
evaluated = bool(pm_tail.get("evaluated", False))
|
|
133
|
+
passed = bool(pm_tail.get("passed", True))
|
|
134
|
+
policy = (
|
|
135
|
+
pm_tail.get("policy", {}) if isinstance(pm_tail.get("policy"), dict) else {}
|
|
136
|
+
)
|
|
137
|
+
stats = (
|
|
138
|
+
pm_tail.get("stats", {}) if isinstance(pm_tail.get("stats"), dict) else {}
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
q = policy.get("quantile", 0.95)
|
|
142
|
+
try:
|
|
143
|
+
qf = float(q)
|
|
144
|
+
except Exception:
|
|
145
|
+
qf = 0.95
|
|
146
|
+
qf = max(0.0, min(1.0, qf))
|
|
147
|
+
q_key = f"q{int(round(100.0 * qf))}"
|
|
148
|
+
q_name = f"P{int(round(100.0 * qf))}"
|
|
149
|
+
q_val = stats.get(q_key)
|
|
150
|
+
qmax = policy.get("quantile_max")
|
|
151
|
+
eps = policy.get("epsilon", stats.get("epsilon"))
|
|
152
|
+
mass = stats.get("tail_mass")
|
|
153
|
+
mmax = policy.get("mass_max")
|
|
154
|
+
|
|
155
|
+
if not evaluated:
|
|
156
|
+
status_tail = "INFO"
|
|
157
|
+
elif passed:
|
|
158
|
+
status_tail = "PASS"
|
|
159
|
+
elif mode == "fail":
|
|
160
|
+
status_tail = "FAIL"
|
|
161
|
+
else:
|
|
162
|
+
status_tail = "WARN"
|
|
163
|
+
|
|
164
|
+
console.print("\n[bold]Gate: Primary Metric Tail (ΔlogNLL)[/bold]")
|
|
165
|
+
console.print(f" mode: {mode}")
|
|
166
|
+
console.print(f" status: {status_tail}")
|
|
167
|
+
if isinstance(q_val, int | float):
|
|
168
|
+
console.print(f" observed: {q_name}={float(q_val):.4f}")
|
|
169
|
+
if isinstance(mass, int | float):
|
|
170
|
+
console.print(f" tail_mass: Pr[ΔlogNLL > ε]={float(mass):.4f}")
|
|
171
|
+
thr_parts: list[str] = []
|
|
172
|
+
if isinstance(qmax, int | float):
|
|
173
|
+
thr_parts.append(f"{q_name}≤{float(qmax):.4f}")
|
|
174
|
+
if isinstance(mmax, int | float):
|
|
175
|
+
thr_parts.append(f"mass≤{float(mmax):.4f}")
|
|
176
|
+
if isinstance(eps, int | float):
|
|
177
|
+
thr_parts.append(f"ε={float(eps):.1e}")
|
|
178
|
+
if thr_parts:
|
|
179
|
+
console.print(" threshold: " + "; ".join(thr_parts))
|
|
180
|
+
|
|
128
181
|
# Dataset split visibility from report provenance
|
|
129
182
|
try:
|
|
130
183
|
split = (report_data.get("provenance", {}) or {}).get("dataset_split")
|
|
@@ -151,10 +204,6 @@ def explain_gates_command(
|
|
|
151
204
|
drift = float(final) / float(preview)
|
|
152
205
|
except Exception:
|
|
153
206
|
drift = None
|
|
154
|
-
if isinstance(cert.get("ppl"), dict): # legacy
|
|
155
|
-
ppl = cert.get("ppl", {})
|
|
156
|
-
drift = ppl.get("preview_final_ratio", drift)
|
|
157
|
-
drift_ci = ppl.get("drift_ci")
|
|
158
207
|
drift_status = (
|
|
159
208
|
"PASS" if bool(validation.get("preview_final_drift_acceptable")) else "FAIL"
|
|
160
209
|
)
|
|
@@ -201,9 +201,9 @@ def plugins_command(
|
|
|
201
201
|
entry = info.get("entry_point")
|
|
202
202
|
# Classify support level independent of origin
|
|
203
203
|
if module.startswith("invarlock.adapters"):
|
|
204
|
-
if n in {"
|
|
204
|
+
if n in {"hf_auto"}:
|
|
205
205
|
support = "auto"
|
|
206
|
-
elif n in {"
|
|
206
|
+
elif n in {"hf_causal_onnx"}:
|
|
207
207
|
# ONNX relies on optional extras (optimum + onnxruntime)
|
|
208
208
|
support = "optional"
|
|
209
209
|
else:
|
|
@@ -236,7 +236,7 @@ def plugins_command(
|
|
|
236
236
|
if backend_name in {"auto-gptq", "autoawq"} and not is_linux:
|
|
237
237
|
status = "unsupported"
|
|
238
238
|
enable = "Linux-only"
|
|
239
|
-
# Extras completeness for certain adapters (e.g.,
|
|
239
|
+
# Extras completeness for certain adapters (e.g., hf_causal_onnx needs optimum + onnxruntime)
|
|
240
240
|
try:
|
|
241
241
|
extras_status = _check_plugin_extras(n, "adapters")
|
|
242
242
|
except Exception:
|
|
@@ -883,10 +883,14 @@ def _check_plugin_extras(plugin_name: str, plugin_type: str) -> str:
|
|
|
883
883
|
"variance": {"packages": [], "extra": ""},
|
|
884
884
|
"rmt": {"packages": [], "extra": ""},
|
|
885
885
|
# Adapter plugins (baked-in only)
|
|
886
|
-
"
|
|
887
|
-
"
|
|
888
|
-
"
|
|
889
|
-
"
|
|
886
|
+
"hf_causal": {"packages": ["transformers"], "extra": "invarlock[adapters]"},
|
|
887
|
+
"hf_mlm": {"packages": ["transformers"], "extra": "invarlock[adapters]"},
|
|
888
|
+
"hf_seq2seq": {"packages": ["transformers"], "extra": "invarlock[adapters]"},
|
|
889
|
+
"hf_auto": {"packages": ["transformers"], "extra": "invarlock[adapters]"},
|
|
890
|
+
"hf_causal_onnx": {
|
|
891
|
+
"packages": ["optimum", "onnxruntime"],
|
|
892
|
+
"extra": "invarlock[onnx]",
|
|
893
|
+
},
|
|
890
894
|
# Optional adapter plugins
|
|
891
895
|
"hf_gptq": {"packages": ["auto_gptq"], "extra": "invarlock[gptq]"},
|
|
892
896
|
"hf_awq": {"packages": ["autoawq"], "extra": "invarlock[awq]"},
|
|
@@ -971,7 +975,7 @@ def _resolve_uninstall_targets(target: str) -> list[str]:
|
|
|
971
975
|
"bitsandbytes": ["bitsandbytes"],
|
|
972
976
|
# ONNX/Optimum family
|
|
973
977
|
"onnx": ["onnxruntime"],
|
|
974
|
-
"
|
|
978
|
+
"hf_causal_onnx": ["onnxruntime"],
|
|
975
979
|
"optimum": ["optimum"],
|
|
976
980
|
}
|
|
977
981
|
return mapping.get(name, [])
|
|
@@ -1010,7 +1014,7 @@ def _resolve_install_targets(target: str) -> list[str]:
|
|
|
1010
1014
|
"transformers": ["invarlock[adapters]"],
|
|
1011
1015
|
# ONNX/Optimum
|
|
1012
1016
|
"onnx": ["invarlock[onnx]"],
|
|
1013
|
-
"
|
|
1017
|
+
"hf_causal_onnx": ["invarlock[onnx]"],
|
|
1014
1018
|
"optimum": ["invarlock[onnx]"],
|
|
1015
1019
|
# Direct packages passthrough
|
|
1016
1020
|
"bitsandbytes": ["bitsandbytes"],
|
invarlock/cli/commands/report.py
CHANGED
|
@@ -8,16 +8,84 @@ Provides the `invarlock report` group with:
|
|
|
8
8
|
"""
|
|
9
9
|
|
|
10
10
|
import json
|
|
11
|
+
import math
|
|
11
12
|
from pathlib import Path
|
|
13
|
+
from typing import Any
|
|
12
14
|
|
|
13
15
|
import typer
|
|
14
16
|
from rich.console import Console
|
|
15
17
|
|
|
18
|
+
from invarlock.cli.output import print_event, resolve_output_style
|
|
16
19
|
from invarlock.reporting import certificate as certificate_lib
|
|
17
20
|
from invarlock.reporting import report as report_lib
|
|
18
21
|
|
|
19
22
|
console = Console()
|
|
20
23
|
|
|
24
|
+
SECTION_WIDTH = 67
|
|
25
|
+
KV_LABEL_WIDTH = 16
|
|
26
|
+
GATE_LABEL_WIDTH = 32
|
|
27
|
+
ARTIFACT_LABEL_WIDTH = 18
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _print_section_header(console: Console, title: str) -> None:
|
|
31
|
+
bar = "═" * SECTION_WIDTH
|
|
32
|
+
console.print(bar)
|
|
33
|
+
console.print(title)
|
|
34
|
+
console.print(bar)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _format_kv_line(label: str, value: str, *, width: int = KV_LABEL_WIDTH) -> str:
|
|
38
|
+
return f" {label:<{width}}: {value}"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _format_status(ok: bool) -> str:
|
|
42
|
+
return "PASS" if ok else "FAIL"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _fmt_metric_value(value: Any) -> str:
|
|
46
|
+
try:
|
|
47
|
+
val = float(value)
|
|
48
|
+
except (TypeError, ValueError):
|
|
49
|
+
return "N/A"
|
|
50
|
+
if not math.isfinite(val):
|
|
51
|
+
return "N/A"
|
|
52
|
+
return f"{val:.3f}"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _fmt_ci_range(ci: Any) -> str:
|
|
56
|
+
if isinstance(ci, (list, tuple)) and len(ci) == 2:
|
|
57
|
+
try:
|
|
58
|
+
lo = float(ci[0])
|
|
59
|
+
hi = float(ci[1])
|
|
60
|
+
except (TypeError, ValueError):
|
|
61
|
+
return "N/A"
|
|
62
|
+
if math.isfinite(lo) and math.isfinite(hi):
|
|
63
|
+
return f"{lo:.3f}–{hi:.3f}"
|
|
64
|
+
return "N/A"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _artifact_entries(
|
|
68
|
+
saved_files: dict[str, str], output_dir: str
|
|
69
|
+
) -> list[tuple[str, str]]:
|
|
70
|
+
order = [
|
|
71
|
+
("cert", "Certificate (JSON)"),
|
|
72
|
+
("cert_md", "Certificate (MD)"),
|
|
73
|
+
("json", "JSON"),
|
|
74
|
+
("markdown", "Markdown"),
|
|
75
|
+
("html", "HTML"),
|
|
76
|
+
]
|
|
77
|
+
entries: list[tuple[str, str]] = [("Output", output_dir)]
|
|
78
|
+
used: set[str] = set()
|
|
79
|
+
for key, label in order:
|
|
80
|
+
if key in saved_files:
|
|
81
|
+
entries.append((label, str(saved_files[key])))
|
|
82
|
+
used.add(key)
|
|
83
|
+
for key in sorted(saved_files.keys()):
|
|
84
|
+
if key in used:
|
|
85
|
+
continue
|
|
86
|
+
entries.append((key.upper(), str(saved_files[key])))
|
|
87
|
+
return entries
|
|
88
|
+
|
|
21
89
|
|
|
22
90
|
# Group with callback so `invarlock report` still generates reports
|
|
23
91
|
report_app = typer.Typer(
|
|
@@ -33,6 +101,8 @@ def _generate_reports(
|
|
|
33
101
|
compare: str | None = None,
|
|
34
102
|
baseline: str | None = None,
|
|
35
103
|
output: str | None = None,
|
|
104
|
+
style: str = "audit",
|
|
105
|
+
no_color: bool = False,
|
|
36
106
|
) -> None:
|
|
37
107
|
# This callback runs only when invoked without subcommand (default Click behavior)
|
|
38
108
|
try:
|
|
@@ -55,21 +125,34 @@ def _generate_reports(
|
|
|
55
125
|
compare = _coerce_option(compare)
|
|
56
126
|
baseline = _coerce_option(baseline)
|
|
57
127
|
output = _coerce_option(output)
|
|
128
|
+
style = _coerce_option(style, "audit")
|
|
129
|
+
no_color = bool(_coerce_option(no_color, False))
|
|
130
|
+
|
|
131
|
+
output_style = resolve_output_style(
|
|
132
|
+
style=str(style),
|
|
133
|
+
profile="ci",
|
|
134
|
+
progress=False,
|
|
135
|
+
timing=False,
|
|
136
|
+
no_color=no_color,
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
def _event(tag: str, message: str, *, emoji: str | None = None) -> None:
|
|
140
|
+
print_event(console, tag, message, style=output_style, emoji=emoji)
|
|
58
141
|
|
|
59
142
|
# Load primary report
|
|
60
|
-
|
|
143
|
+
_event("DATA", f"Loading run report: {run}", emoji="📊")
|
|
61
144
|
primary_report = _load_run_report(run)
|
|
62
145
|
|
|
63
146
|
# Load comparison report if specified
|
|
64
147
|
compare_report = None
|
|
65
148
|
if compare:
|
|
66
|
-
|
|
149
|
+
_event("DATA", f"Loading comparison report: {compare}", emoji="📊")
|
|
67
150
|
compare_report = _load_run_report(compare)
|
|
68
151
|
|
|
69
152
|
# Load baseline report if specified
|
|
70
153
|
baseline_report = None
|
|
71
154
|
if baseline:
|
|
72
|
-
|
|
155
|
+
_event("DATA", f"Loading baseline report: {baseline}", emoji="📊")
|
|
73
156
|
baseline_report = _load_run_report(baseline)
|
|
74
157
|
|
|
75
158
|
# Determine output directory
|
|
@@ -88,17 +171,20 @@ def _generate_reports(
|
|
|
88
171
|
# Validate certificate requirements
|
|
89
172
|
if "cert" in formats:
|
|
90
173
|
if baseline_report is None:
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
"Use: invarlock report --run <run_dir> --format cert --baseline <baseline_run_dir>"
|
|
174
|
+
_event("FAIL", "Certificate format requires --baseline", emoji="❌")
|
|
175
|
+
_event(
|
|
176
|
+
"INFO",
|
|
177
|
+
"Use: invarlock report --run <run_dir> --format cert --baseline <baseline_run_dir>",
|
|
96
178
|
)
|
|
97
179
|
raise typer.Exit(1)
|
|
98
|
-
|
|
180
|
+
_event(
|
|
181
|
+
"EXEC",
|
|
182
|
+
"Generating evaluation certificate with baseline comparison",
|
|
183
|
+
emoji="📜",
|
|
184
|
+
)
|
|
99
185
|
|
|
100
186
|
# Generate reports
|
|
101
|
-
|
|
187
|
+
_event("EXEC", f"Generating reports in formats: {formats}", emoji="📝")
|
|
102
188
|
saved_files = report_lib.save_report(
|
|
103
189
|
primary_report,
|
|
104
190
|
output_dir,
|
|
@@ -109,40 +195,8 @@ def _generate_reports(
|
|
|
109
195
|
)
|
|
110
196
|
|
|
111
197
|
# Show results
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
for fmt, file_path in saved_files.items():
|
|
116
|
-
if fmt == "cert":
|
|
117
|
-
console.print(f" 📜 CERTIFICATE (JSON): {file_path}")
|
|
118
|
-
elif fmt == "cert_md":
|
|
119
|
-
console.print(f" 📜 CERTIFICATE (MD): {file_path}")
|
|
120
|
-
else:
|
|
121
|
-
console.print(f" 📄 {fmt.upper()}: {file_path}")
|
|
122
|
-
|
|
123
|
-
# Show key metrics (PM-first). Avoid legacy PPL wording.
|
|
124
|
-
console.print("\n📈 Key Metrics:")
|
|
125
|
-
console.print(f" Model: {primary_report['meta']['model_id']}")
|
|
126
|
-
console.print(f" Edit: {primary_report['edit']['name']}")
|
|
127
|
-
pm = (primary_report.get("metrics", {}) or {}).get("primary_metric", {})
|
|
128
|
-
if isinstance(pm, dict) and pm:
|
|
129
|
-
kind = str(pm.get("kind") or "primary")
|
|
130
|
-
console.print(f" Primary Metric: {kind}")
|
|
131
|
-
final = pm.get("final")
|
|
132
|
-
if isinstance(final, int | float):
|
|
133
|
-
console.print(f" point (final): {final:.3f}")
|
|
134
|
-
dci = pm.get("display_ci")
|
|
135
|
-
if isinstance(dci, tuple | list) and len(dci) == 2:
|
|
136
|
-
try:
|
|
137
|
-
lo, hi = float(dci[0]), float(dci[1])
|
|
138
|
-
console.print(f" CI: {lo:.3f}–{hi:.3f}")
|
|
139
|
-
except Exception:
|
|
140
|
-
pass
|
|
141
|
-
ratio = pm.get("ratio_vs_baseline")
|
|
142
|
-
if isinstance(ratio, int | float):
|
|
143
|
-
console.print(f" ratio vs baseline: {ratio:.3f}")
|
|
144
|
-
|
|
145
|
-
# Show certificate validation if generated
|
|
198
|
+
_event("PASS", "Reports generated successfully.", emoji="✅")
|
|
199
|
+
|
|
146
200
|
if "cert" in formats and baseline_report:
|
|
147
201
|
try:
|
|
148
202
|
certificate = certificate_lib.make_certificate(
|
|
@@ -155,36 +209,105 @@ def _generate_reports(
|
|
|
155
209
|
|
|
156
210
|
block = _console_block(certificate)
|
|
157
211
|
overall_pass = bool(block.get("overall_pass"))
|
|
212
|
+
status_text = _format_status(overall_pass)
|
|
158
213
|
|
|
159
|
-
console.print("
|
|
160
|
-
|
|
161
|
-
console.print(
|
|
162
|
-
f" Overall Status: {status_emoji} {'PASS' if overall_pass else 'FAIL'}"
|
|
163
|
-
)
|
|
214
|
+
console.print("")
|
|
215
|
+
_print_section_header(console, "CERTIFICATE SUMMARY")
|
|
216
|
+
console.print(_format_kv_line("Status", status_text))
|
|
164
217
|
|
|
218
|
+
schema_version = certificate.get("schema_version")
|
|
219
|
+
if schema_version:
|
|
220
|
+
console.print(
|
|
221
|
+
_format_kv_line("Schema Version", str(schema_version))
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
run_id = certificate.get("run_id") or (
|
|
225
|
+
(primary_report.get("meta", {}) or {}).get("run_id")
|
|
226
|
+
)
|
|
227
|
+
if run_id:
|
|
228
|
+
console.print(_format_kv_line("Run ID", str(run_id)))
|
|
229
|
+
|
|
230
|
+
model_id = (primary_report.get("meta", {}) or {}).get("model_id")
|
|
231
|
+
edit_name = (primary_report.get("edit", {}) or {}).get("name")
|
|
232
|
+
if model_id:
|
|
233
|
+
console.print(_format_kv_line("Model", str(model_id)))
|
|
234
|
+
if edit_name:
|
|
235
|
+
console.print(_format_kv_line("Edit", str(edit_name)))
|
|
236
|
+
|
|
237
|
+
pm = (primary_report.get("metrics", {}) or {}).get("primary_metric", {})
|
|
238
|
+
console.print(" PRIMARY METRIC")
|
|
239
|
+
pm_entries: list[tuple[str, str]] = []
|
|
240
|
+
if isinstance(pm, dict) and pm:
|
|
241
|
+
kind = str(pm.get("kind") or "primary")
|
|
242
|
+
pm_entries.append(("Kind", kind))
|
|
243
|
+
preview = pm.get("preview")
|
|
244
|
+
if preview is not None:
|
|
245
|
+
pm_entries.append(("Preview", _fmt_metric_value(preview)))
|
|
246
|
+
final = pm.get("final")
|
|
247
|
+
if final is not None:
|
|
248
|
+
pm_entries.append(("Final", _fmt_metric_value(final)))
|
|
249
|
+
ratio = pm.get("ratio_vs_baseline")
|
|
250
|
+
if ratio is not None:
|
|
251
|
+
pm_entries.append(("Ratio", _fmt_metric_value(ratio)))
|
|
252
|
+
dci = pm.get("display_ci")
|
|
253
|
+
if dci is not None:
|
|
254
|
+
pm_entries.append(("CI", _fmt_ci_range(dci)))
|
|
255
|
+
if not pm_entries:
|
|
256
|
+
pm_entries.append(("Status", "Unavailable"))
|
|
257
|
+
for idx, (label, value) in enumerate(pm_entries):
|
|
258
|
+
branch = "└─" if idx == len(pm_entries) - 1 else "├─"
|
|
259
|
+
console.print(f" {branch} {label:<14} {value}")
|
|
260
|
+
|
|
261
|
+
console.print(" VALIDATION GATES")
|
|
165
262
|
rows = block.get("rows", [])
|
|
166
263
|
if isinstance(rows, list) and rows:
|
|
167
|
-
for row in rows:
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
264
|
+
for idx, row in enumerate(rows):
|
|
265
|
+
label = str(row.get("label") or "Unknown")
|
|
266
|
+
ok = bool(row.get("ok"))
|
|
267
|
+
status = _format_status(ok)
|
|
268
|
+
mark = "✓" if ok else "✗"
|
|
269
|
+
branch = "└─" if idx == len(rows) - 1 else "├─"
|
|
270
|
+
console.print(
|
|
271
|
+
f" {branch} {label:<{GATE_LABEL_WIDTH}} {mark} {status}"
|
|
272
|
+
)
|
|
273
|
+
else:
|
|
274
|
+
console.print(f" └─ {'No validation rows':<{GATE_LABEL_WIDTH}} -")
|
|
275
|
+
|
|
276
|
+
console.print(" ARTIFACTS")
|
|
277
|
+
entries = _artifact_entries(saved_files, str(output_dir))
|
|
278
|
+
for idx, (label, value) in enumerate(entries):
|
|
279
|
+
branch = "└─" if idx == len(entries) - 1 else "├─"
|
|
280
|
+
console.print(f" {branch} {label:<{ARTIFACT_LABEL_WIDTH}} {value}")
|
|
281
|
+
console.print("═" * SECTION_WIDTH)
|
|
175
282
|
|
|
176
283
|
# In CLI report flow, do not hard-exit on validation failure; just display status.
|
|
177
284
|
# CI gating should be handled by dedicated verify commands.
|
|
178
285
|
|
|
179
286
|
except Exception as e:
|
|
180
|
-
|
|
181
|
-
f" [yellow]⚠️ Certificate validation error: {e}[/yellow]"
|
|
182
|
-
)
|
|
287
|
+
_event("WARN", f"Certificate validation error: {e}", emoji="⚠️")
|
|
183
288
|
# Exit non-zero on certificate generation error
|
|
184
289
|
raise typer.Exit(1) from e
|
|
290
|
+
else:
|
|
291
|
+
console.print(_format_kv_line("Output", str(output_dir)))
|
|
292
|
+
for label, value in _artifact_entries(saved_files, str(output_dir))[1:]:
|
|
293
|
+
console.print(
|
|
294
|
+
_format_kv_line(label, str(value), width=ARTIFACT_LABEL_WIDTH)
|
|
295
|
+
)
|
|
185
296
|
|
|
186
297
|
except Exception as e:
|
|
187
|
-
|
|
298
|
+
print_event(
|
|
299
|
+
console,
|
|
300
|
+
"FAIL",
|
|
301
|
+
f"Report generation failed: {e}",
|
|
302
|
+
style=resolve_output_style(
|
|
303
|
+
style="audit",
|
|
304
|
+
profile="ci",
|
|
305
|
+
progress=False,
|
|
306
|
+
timing=False,
|
|
307
|
+
no_color=False,
|
|
308
|
+
),
|
|
309
|
+
emoji="❌",
|
|
310
|
+
)
|
|
188
311
|
raise typer.Exit(1) from e
|
|
189
312
|
|
|
190
313
|
|
|
@@ -206,15 +329,37 @@ def report_callback(
|
|
|
206
329
|
help="Path to baseline run for certificate generation (required for cert format)",
|
|
207
330
|
),
|
|
208
331
|
output: str | None = typer.Option(None, "--output", "-o", help="Output directory"),
|
|
332
|
+
style: str = typer.Option("audit", "--style", help="Output style (audit|friendly)"),
|
|
333
|
+
no_color: bool = typer.Option(
|
|
334
|
+
False, "--no-color", help="Disable ANSI colors (respects NO_COLOR=1)"
|
|
335
|
+
),
|
|
209
336
|
):
|
|
210
337
|
"""Generate a report from a run (default callback)."""
|
|
211
338
|
if getattr(ctx, "resilient_parsing", False) or ctx.invoked_subcommand is not None:
|
|
212
339
|
return
|
|
213
340
|
if not run:
|
|
214
|
-
|
|
341
|
+
print_event(
|
|
342
|
+
console,
|
|
343
|
+
"FAIL",
|
|
344
|
+
"--run is required when no subcommand is provided",
|
|
345
|
+
style=resolve_output_style(
|
|
346
|
+
style=str(style),
|
|
347
|
+
profile="ci",
|
|
348
|
+
progress=False,
|
|
349
|
+
timing=False,
|
|
350
|
+
no_color=no_color,
|
|
351
|
+
),
|
|
352
|
+
emoji="❌",
|
|
353
|
+
)
|
|
215
354
|
raise typer.Exit(2)
|
|
216
355
|
return _generate_reports(
|
|
217
|
-
run=run,
|
|
356
|
+
run=run,
|
|
357
|
+
format=format,
|
|
358
|
+
compare=compare,
|
|
359
|
+
baseline=baseline,
|
|
360
|
+
output=output,
|
|
361
|
+
style=style,
|
|
362
|
+
no_color=no_color,
|
|
218
363
|
)
|
|
219
364
|
|
|
220
365
|
|
|
@@ -225,9 +370,17 @@ def report_command(
|
|
|
225
370
|
compare: str | None = None,
|
|
226
371
|
baseline: str | None = None,
|
|
227
372
|
output: str | None = None,
|
|
373
|
+
style: str = "audit",
|
|
374
|
+
no_color: bool = False,
|
|
228
375
|
):
|
|
229
376
|
return _generate_reports(
|
|
230
|
-
run=run,
|
|
377
|
+
run=run,
|
|
378
|
+
format=format,
|
|
379
|
+
compare=compare,
|
|
380
|
+
baseline=baseline,
|
|
381
|
+
output=output,
|
|
382
|
+
style=style,
|
|
383
|
+
no_color=no_color,
|
|
231
384
|
)
|
|
232
385
|
|
|
233
386
|
|
|
@@ -326,11 +479,22 @@ def report_validate(
|
|
|
326
479
|
),
|
|
327
480
|
):
|
|
328
481
|
"""Validate a certificate JSON against the current schema (v1)."""
|
|
482
|
+
output_style = resolve_output_style(
|
|
483
|
+
style="audit",
|
|
484
|
+
profile="ci",
|
|
485
|
+
progress=False,
|
|
486
|
+
timing=False,
|
|
487
|
+
no_color=False,
|
|
488
|
+
)
|
|
489
|
+
|
|
490
|
+
def _event(tag: str, message: str, *, emoji: str | None = None) -> None:
|
|
491
|
+
print_event(console, tag, message, style=output_style, emoji=emoji)
|
|
492
|
+
|
|
329
493
|
p = Path(report)
|
|
330
494
|
try:
|
|
331
495
|
payload = json.loads(p.read_text(encoding="utf-8"))
|
|
332
496
|
except Exception as exc: # noqa: BLE001
|
|
333
|
-
|
|
497
|
+
_event("FAIL", f"Failed to read input JSON: {exc}", emoji="❌")
|
|
334
498
|
raise typer.Exit(1) from exc
|
|
335
499
|
|
|
336
500
|
try:
|
|
@@ -338,16 +502,16 @@ def report_validate(
|
|
|
338
502
|
|
|
339
503
|
ok = validate_certificate(payload)
|
|
340
504
|
if not ok:
|
|
341
|
-
|
|
505
|
+
_event("FAIL", "Certificate schema validation failed", emoji="❌")
|
|
342
506
|
raise typer.Exit(2)
|
|
343
|
-
|
|
507
|
+
_event("PASS", "Certificate schema is valid", emoji="✅")
|
|
344
508
|
except ValueError as exc:
|
|
345
|
-
|
|
509
|
+
_event("FAIL", f"Certificate validation error: {exc}", emoji="❌")
|
|
346
510
|
raise typer.Exit(2) from exc
|
|
347
511
|
except typer.Exit:
|
|
348
512
|
raise
|
|
349
513
|
except Exception as exc: # noqa: BLE001
|
|
350
|
-
|
|
514
|
+
_event("FAIL", f"Validation failed: {exc}", emoji="❌")
|
|
351
515
|
raise typer.Exit(1) from exc
|
|
352
516
|
|
|
353
517
|
|