invarlock-0.3.7-py3-none-any.whl → invarlock-0.3.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +3 -3
- invarlock/adapters/auto.py +2 -10
- invarlock/adapters/hf_loading.py +7 -7
- invarlock/adapters/hf_mixin.py +28 -5
- invarlock/assurance/__init__.py +15 -23
- invarlock/calibration/spectral_null.py +1 -1
- invarlock/cli/adapter_auto.py +1 -5
- invarlock/cli/app.py +57 -27
- invarlock/cli/commands/__init__.py +2 -2
- invarlock/cli/commands/calibrate.py +48 -4
- invarlock/cli/commands/{certify.py → evaluate.py} +69 -46
- invarlock/cli/commands/explain_gates.py +94 -51
- invarlock/cli/commands/export_html.py +11 -9
- invarlock/cli/commands/report.py +121 -47
- invarlock/cli/commands/run.py +274 -66
- invarlock/cli/commands/verify.py +84 -89
- invarlock/cli/determinism.py +1 -1
- invarlock/cli/provenance.py +3 -3
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/retry.py +14 -14
- invarlock/core/runner.py +1 -1
- invarlock/edits/noop.py +2 -2
- invarlock/edits/quant_rtn.py +2 -2
- invarlock/eval/__init__.py +1 -1
- invarlock/eval/bench.py +11 -7
- invarlock/eval/primary_metric.py +1 -1
- invarlock/guards/spectral.py +2 -2
- invarlock/guards_ref/spectral_ref.py +1 -1
- invarlock/model_profile.py +16 -35
- invarlock/observability/health.py +38 -20
- invarlock/plugins/hf_bnb_adapter.py +32 -21
- invarlock/reporting/__init__.py +18 -4
- invarlock/reporting/html.py +7 -7
- invarlock/reporting/normalizer.py +2 -2
- invarlock/reporting/policy_utils.py +1 -1
- invarlock/reporting/primary_metric_utils.py +11 -11
- invarlock/reporting/render.py +126 -120
- invarlock/reporting/report.py +43 -37
- invarlock/reporting/{certificate.py → report_builder.py} +103 -99
- invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
- invarlock-0.3.9.dist-info/METADATA +303 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/RECORD +46 -46
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/WHEEL +1 -1
- invarlock-0.3.7.dist-info/METADATA +0 -602
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/top_level.txt +0 -0
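The headline change in this release is the rename of the hero path from `certify` to `evaluate` (and of the "certificate" artifacts to "evaluation reports"). As a rough orientation before the hunks below, here is a hedged invocation sketch assembled only from the options visible in this diff (`--baseline`/`--subject`, `--report-out`, `--edit-config`); the model IDs and paths are placeholders and the exact flag set should be confirmed with `invarlock evaluate --help`:

```python
# Hypothetical invocation of the renamed hero path (certify -> evaluate).
# Flags mirror options visible in the diff; values are placeholders.
import subprocess

subprocess.run(
    [
        "invarlock", "evaluate",
        "--baseline", "org/base-model",      # baseline checkpoint dir or Hub ID
        "--subject", "org/edited-model",     # subject checkpoint to compare
        "--report-out", "reports/eval",      # default shown in the diff
        # "--edit-config", "configs/quant_rtn.yaml",  # optional built-in demo edit
    ],
    check=True,
)
```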
invarlock/cli/commands/{certify.py → evaluate.py}

@@ -1,15 +1,15 @@
 """
-InvarLock CLI
+InvarLock CLI Evaluate Command
 =========================

-Hero path: Compare &
+Hero path: Compare & Evaluate (BYOE). Provide baseline (`--baseline`) and
 subject (`--subject`) checkpoints and InvarLock will run paired windows and emit a
-
+evaluation report. Optionally, pass `--edit-config` to run the built‑in quant_rtn demo.

 Steps:
 1) Baseline (no-op edit) on baseline model
 2) Subject (no-op or provided edit config) on subject model with --baseline pairing
-3) Emit
+3) Emit evaluation report via `invarlock report --format report`
 """

 from __future__ import annotations
@@ -18,6 +18,7 @@ import inspect
 import io
 import json
 import math
+import os
 from collections.abc import Iterator
 from contextlib import contextmanager
 from pathlib import Path
@@ -60,7 +61,7 @@ def _render_banner_lines(title: str, context: str) -> list[str]:
 def _print_header_banner(
 console: Console, *, version: str, profile: str, tier: str, adapter: str
 ) -> None:
-title = f"INVARLOCK v{version} ·
+title = f"INVARLOCK v{version} · Evaluation Pipeline"
 context = f"Profile: {profile} · Tier: {tier} · Adapter: {adapter}"
 for line in _render_banner_lines(title, context):
 console.print(line)
@@ -128,32 +129,32 @@ def _suppress_child_output(enabled: bool) -> Iterator[io.StringIO | None]:

 def _print_quiet_summary(
 *,
-
+report_out: Path,
 source: str,
 edited: str,
 profile: str,
 ) -> None:
-
-console.print(f"INVARLOCK v{INVARLOCK_VERSION} ·
+report_path = report_out / "evaluation.report.json"
+console.print(f"INVARLOCK v{INVARLOCK_VERSION} · EVALUATE")
 console.print(f"Baseline: {source} -> Subject: {edited} · Profile: {profile}")
-if not
-console.print(f"Output: {
+if not report_path.exists():
+console.print(f"Output: {report_out}")
 return
 try:
-with
-
+with report_path.open("r", encoding="utf-8") as fh:
+evaluation_report = json.load(fh)
 except Exception:
-console.print(f"Output: {
+console.print(f"Output: {report_path}")
 return
-if not isinstance(
-console.print(f"Output: {
+if not isinstance(evaluation_report, dict):
+console.print(f"Output: {report_path}")
 return
 try:
 from invarlock.reporting.render import (
 compute_console_validation_block as _console_block,
 )

-block = _console_block(
+block = _console_block(evaluation_report)
 rows = block.get("rows", [])
 total = len(rows) if isinstance(rows, list) else 0
 passed = (
@@ -165,13 +166,13 @@ def _print_quiet_summary(
 passed = 0
 status = "UNKNOWN"
 pm_ratio = _format_ratio(
-(
+(evaluation_report.get("primary_metric") or {}).get("ratio_vs_baseline")
 )
 gate_summary = f"{passed}/{total} passed" if total else "N/A"
 console.print(f"Status: {status} · Gates: {gate_summary}")
 if pm_ratio != "N/A":
 console.print(f"Primary metric ratio: {pm_ratio}")
-console.print(f"Output: {
+console.print(f"Output: {report_path}")


 def _latest_run_report(run_root: Path) -> Path | None:
@@ -206,6 +207,23 @@ def _dump_yaml(path: Path, data: dict[str, Any]) -> None:
 yaml.safe_dump(data, fh, sort_keys=False)


+def _resolve_evaluate_tmp_dir() -> Path:
+"""Return the on-disk scratch directory for `invarlock evaluate`.
+
+Evaluate generates merged YAML configs for baseline/subject runs so
+downstream `invarlock run` flows remain traceable. We keep these files
+under `./tmp/.evaluate` by default to avoid cluttering the working tree.
+"""
+
+candidate = os.environ.get("INVARLOCK_EVALUATE_TMP_DIR")
+if candidate:
+tmp_dir = Path(candidate).expanduser()
+else:
+tmp_dir = Path("tmp") / ".evaluate"
+tmp_dir.mkdir(parents=True, exist_ok=True)
+return tmp_dir
+
+
 def _normalize_model_id(model_id: str, adapter_name: str) -> str:
 """Normalize model identifiers for adapters.

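The new `_resolve_evaluate_tmp_dir` helper above replaces the hard-coded scratch directory used by the old certify flow (the removal of `.certify_tmp` appears in a later hunk). A minimal standalone sketch of the same resolution order, environment override first, then `tmp/.evaluate`, shown here outside the CLI purely for illustration:

```python
# Standalone sketch mirroring _resolve_evaluate_tmp_dir from the diff above;
# not a drop-in import from the package.
import os
from pathlib import Path


def resolve_scratch_dir(env_var: str = "INVARLOCK_EVALUATE_TMP_DIR") -> Path:
    # Environment override wins; otherwise fall back to ./tmp/.evaluate.
    candidate = os.environ.get(env_var)
    tmp_dir = Path(candidate).expanduser() if candidate else Path("tmp") / ".evaluate"
    tmp_dir.mkdir(parents=True, exist_ok=True)
    return tmp_dir


# Example: with INVARLOCK_EVALUATE_TMP_DIR=/tmp/invarlock-eval set,
# resolve_scratch_dir() returns Path("/tmp/invarlock-eval").
```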
@@ -221,7 +239,7 @@ def _normalize_model_id(model_id: str, adapter_name: str) -> str:
 return mid


-def
+def evaluate_command(
 # Primary names for programmatic/test compatibility
 source: str = typer.Option(
 ..., "--source", "--baseline", help="Baseline model dir or Hub ID"
@@ -258,8 +276,8 @@ def certify_command(
 ),
 ),
 out: str = typer.Option("runs", "--out", help="Base output directory"),
-
-"reports/
+report_out: str = typer.Option(
+"reports/eval", "--report-out", help="Evaluation report output directory"
 ),
 edit_config: str | None = typer.Option(
 None, "--edit-config", help="Edit preset to apply a demo edit (quant_rtn)"
@@ -290,7 +308,7 @@ def certify_command(
 False, "--no-color", help="Disable ANSI colors (respects NO_COLOR=1)"
 ),
 ):
-"""
+"""Evaluate two checkpoints (baseline vs subject) with pinned windows."""
 # Support programmatic calls and Typer-invoked calls uniformly
 try:
 from typer.models import OptionInfo as _TyperOptionInfo
@@ -311,7 +329,7 @@ def certify_command(
 tier = _coerce_option(tier, "balanced")
 preset = _coerce_option(preset)
 out = _coerce_option(out, "runs")
-
+report_out = _coerce_option(report_out, "reports/eval")
 edit_config = _coerce_option(edit_config)
 edit_label = _coerce_option(edit_label)
 quiet = _coerce_option(quiet, False)
@@ -410,7 +428,7 @@ def certify_command(
 "stride": 512,
 "preview_n": 64,
 "final_n": 64,
-"seed":
+"seed": 43,
 }
 }
 else:
@@ -424,7 +442,7 @@ def certify_command(
 )
 raise typer.Exit(1)
 preset_data = _load_yaml(preset_path)
-# Do not hard-code device from presets in auto-generated
+# Do not hard-code device from presets in auto-generated evaluate configs;
 # allow device resolution to pick CUDA/MPS/CPU via 'auto' or CLI overrides.
 model_block = preset_data.get("model")
 if isinstance(model_block, dict) and "device" in model_block:
@@ -575,8 +593,7 @@ def certify_command(
 elif not edit_config:
 subject_label = "custom" if norm_src_id != norm_edt_id else "noop"

-tmp_dir =
-tmp_dir.mkdir(parents=True, exist_ok=True)
+tmp_dir = _resolve_evaluate_tmp_dir()

 baseline_report_path: Path
 if baseline_report:
@@ -635,7 +652,7 @@ def certify_command(
 baseline_report_path = baseline_report_path_candidate
 _debug(f"Baseline report: {baseline_report_path}")

-# Edited run: either no-op (Compare &
+# Edited run: either no-op (Compare & Evaluate) or provided edit_config (demo edit)
 _phase(2, 3, "SUBJECT EVALUATION")
 if edit_config:
 edited_yaml = Path(edit_config)
@@ -704,8 +721,6 @@ def certify_command(
 )

 # Persist a temporary merged config for traceability
-tmp_dir = Path(".certify_tmp")
-tmp_dir.mkdir(parents=True, exist_ok=True)
 edited_merged_yaml = tmp_dir / "edited_merged.yaml"
 _dump_yaml(edited_merged_yaml, merged_edited_cfg)
 _debug(f"Edited config (merged): {edited_merged_yaml}")
@@ -754,7 +769,7 @@ def certify_command(
 )
 edited_yaml = tmp_dir / "edited_noop.yaml"
 _dump_yaml(edited_yaml, edited_cfg)
-_info("Running edited (no-op, Compare &
+_info("Running edited (no-op, Compare & Evaluate)", tag="EXEC", emoji="🧪")
 _debug(f"Edited config: {edited_yaml}")
 from .run import run_command as _run

@@ -799,28 +814,36 @@ def certify_command(
 raise typer.Exit(1)
 _debug(f"Edited report: {edited_report}")

-_phase(3, 3, "
+_phase(3, 3, "EVALUATION REPORT GENERATION")

-def
-_info("Emitting
+def _emit_evaluation_report() -> None:
+_info("Emitting evaluation report", tag="EXEC", emoji="📜")
 with _suppress_child_output(verbosity == VERBOSITY_QUIET) as quiet_buffer:
 try:
 with timed_step(
 console=console,
 style=output_style,
 timings=timings,
-key="
+key="evaluation_report",
 tag="EXEC",
-message="
+message="Evaluation Report",
 emoji="📜",
 ):
+# Use a wall-clock perf counter here (not the output module's
+# test-patched counter) so timing tests remain deterministic.
+from time import perf_counter as _wall_perf_counter
+
+report_start = _wall_perf_counter()
 report_kwargs = {
 "run": str(edited_report),
-"format": "
+"format": "report",
 "baseline": str(baseline_report_path),
-"output":
+"output": report_out,
 "style": output_style.name,
 "no_color": no_color,
+"summary_baseline_seconds": float(timings.get("baseline", 0.0)),
+"summary_subject_seconds": float(timings.get("subject", 0.0)),
+"summary_report_start": float(report_start),
 }
 try:
 sig = inspect.signature(_report)
@@ -890,7 +913,7 @@ def certify_command(
 else None
 ) or "unknown"

-# Enforce only when a primary_metric block is present; allow degraded-but-flagged metrics to emit
+# Enforce only when a primary_metric block is present; allow degraded-but-flagged metrics to emit evaluation reports, but fail the task.
 has_metric_block = isinstance(pm, dict) and bool(pm)
 if has_metric_block:
 degraded = bool(pm.get("invalid") or pm.get("degraded"))
@@ -906,7 +929,7 @@ def certify_command(
 print_event(
 console,
 "WARN",
-"Primary metric degraded or non-finite; emitting
+"Primary metric degraded or non-finite; emitting evaluation report and marking task degraded. Primary metric computation failed.",
 style=output_style,
 emoji="⚠️",
 )
@@ -919,8 +942,8 @@ def certify_command(
 metrics["primary_metric"] = pm
 edited_payload.setdefault("metrics", {}).update(metrics)

-# Emit the
-
+# Emit the evaluation report for inspection, then exit with a CI-visible error.
+_emit_evaluation_report()
 err = MetricsError(
 code="E111",
 message=f"Primary metric degraded or non-finite ({degraded_reason}).",
@@ -933,7 +956,7 @@ def certify_command(
 )
 raise typer.Exit(_resolve_exit_code(err, profile=profile))

-
+_emit_evaluation_report()
 if timing:
 if total_start is not None:
 timings["total"] = max(0.0, float(perf_counter() - total_start))
@@ -941,7 +964,7 @@ def certify_command(
 timings["total"] = (
 float(timings.get("baseline", 0.0))
 + float(timings.get("subject", 0.0))
-+ float(timings.get("
++ float(timings.get("evaluation_report", 0.0))
 )
 print_timing_summary(
 console,
@@ -950,13 +973,13 @@ def certify_command(
 order=[
 ("Baseline", "baseline"),
 ("Subject", "subject"),
-("
+("Evaluation Report", "evaluation_report"),
 ("Total", "total"),
 ],
 )
 if verbosity == VERBOSITY_QUIET:
 _print_quiet_summary(
-
+report_out=Path(report_out),
 source=src_id,
 edited=edt_id,
 profile=profile,
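In the report-emission hunk above, the command builds `report_kwargs` (now including the `summary_*` timing fields) and then calls `inspect.signature(_report)` before forwarding them, so older report entry points that do not accept the new keys keep working. A minimal sketch of that forwarding pattern, with a hypothetical stand-in callee rather than the real report command:

```python
# Sketch of signature-filtered kwargs forwarding; `_report` is a hypothetical
# stand-in for the real report command, not the package's function.
import inspect


def _report(run: str, baseline: str, output: str = "reports/eval") -> None:
    print(f"report for {run} vs {baseline} -> {output}")


report_kwargs = {
    "run": "runs/subject/report.json",
    "baseline": "runs/baseline/report.json",
    "output": "reports/eval",
    "summary_report_start": 0.0,  # silently dropped if the callee lacks the parameter
}

sig = inspect.signature(_report)
accepted = {k: v for k, v in report_kwargs.items() if k in sig.parameters}
_report(**accepted)
```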
invarlock/cli/commands/explain_gates.py

@@ -1,13 +1,17 @@
 from __future__ import annotations

 import json
+import math
 from pathlib import Path

 import typer
 from rich.console import Console

 from invarlock.core.auto_tuning import get_tier_policies
-from invarlock.reporting.
+from invarlock.reporting.report_builder import (
+PM_DRIFT_BAND_DEFAULT,
+make_report,
+)

 console = Console()

@@ -18,9 +22,9 @@ def explain_gates_command(
 ..., "--baseline", help="Path to baseline report.json"
 ),
 ) -> None:
-"""Explain
+"""Explain evaluation report gates for a report vs baseline.

-Loads the reports, builds
+Loads the reports, builds an evaluation report, and prints gate thresholds,
 observed statistics, and pass/fail reasons in a compact, readable form.
 """
 report_path = Path(report)
@@ -36,22 +40,22 @@ def explain_gates_command(
 console.print(f"[red]Failed to load inputs: {exc}[/red]")
 raise typer.Exit(1) from exc

-
+evaluation_report = make_report(report_data, baseline_data)
 validation = (
-
+evaluation_report.get("validation", {})
+if isinstance(evaluation_report.get("validation"), dict)
+else {}
 )

 # Extract tier + metric policy (floors/hysteresis)
-tier = str(
-
-
-
-
-"none": 1.10,
-}
+tier = str(
+(evaluation_report.get("auto", {}) or {}).get("tier", "balanced")
+).lower()
+tier_policies = get_tier_policies()
+tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
 resolved_policy = (
-
-if isinstance(
+evaluation_report.get("resolved_policy", {})
+if isinstance(evaluation_report.get("resolved_policy"), dict)
 else {}
 )
 metrics_policy = (
@@ -60,8 +64,6 @@ def explain_gates_command(
 else {}
 )
 if not metrics_policy:
-tier_policies = get_tier_policies()
-tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
 metrics_policy = (
 tier_defaults.get("metrics", {}) if isinstance(tier_defaults, dict) else {}
 )
@@ -75,15 +77,37 @@ def explain_gates_command(
 hysteresis_ratio = float(pm_policy.get("hysteresis_ratio", 0.0))
 min_tokens = int(pm_policy.get("min_tokens", 0))
 try:
-limit_base = float(
-pm_policy.get("ratio_limit_base", tier_thresholds.get(tier, 1.10))
-or tier_thresholds.get(tier, 1.10)
-)
+limit_base = float(pm_policy.get("ratio_limit_base"))
 except Exception:
-limit_base =
-
+limit_base = None
+if limit_base is None or not isinstance(limit_base, int | float):
+limit_base = None
+elif not float("-inf") < float(limit_base) < float("inf"):
+limit_base = None
+if limit_base is None:
+try:
+fallback = (
+tier_defaults.get("metrics", {})
+if isinstance(tier_defaults, dict)
+else {}
+)
+fallback_pm = (
+fallback.get("pm_ratio", {}) if isinstance(fallback, dict) else {}
+)
+limit_base = float(fallback_pm.get("ratio_limit_base"))
+except Exception:
+limit_base = None
+limit_with_hyst = (
+float(limit_base) + max(0.0, hysteresis_ratio)
+if isinstance(limit_base, int | float)
+else None
+)
 tokens_ok = True
-telem =
+telem = (
+evaluation_report.get("telemetry", {})
+if isinstance(evaluation_report.get("telemetry"), dict)
+else {}
+)
 try:
 total_tokens = int(telem.get("preview_total_tokens", 0)) + int(
 telem.get("final_total_tokens", 0)
@@ -95,8 +119,8 @@ def explain_gates_command(
 # Primary-metric ratio gate explanation (ppl-like kinds shown as ratios)
 ratio = None
 ratio_ci = None
-if isinstance(
-pm =
+if isinstance(evaluation_report.get("primary_metric"), dict):
+pm = evaluation_report.get("primary_metric", {})
 ratio = pm.get("ratio_vs_baseline")
 ratio_ci = pm.get("display_ci")
 hysteresis_applied = bool(validation.get("hysteresis_applied"))
@@ -110,21 +134,26 @@ def explain_gates_command(
 )
 else:
 console.print(f" observed: {ratio:.3f}x")
-
-
-
+if isinstance(limit_base, int | float):
+hyst_suffix = (
+f" (+hysteresis {hysteresis_ratio:.3f})" if hysteresis_ratio else ""
+)
+console.print(f" threshold: ≤ {float(limit_base):.2f}x{hyst_suffix}")
+else:
+console.print(" threshold: unavailable")
 console.print(
 f" tokens: {'ok' if tokens_ok else 'below floor'} (token floors: min_tokens={min_tokens or 0}, total={int(telem.get('preview_total_tokens', 0)) + int(telem.get('final_total_tokens', 0)) if telem else 0})"
 )
 if hysteresis_applied:
-
-
-
+if isinstance(limit_with_hyst, int | float):
+console.print(
+f" note: hysteresis applied → effective threshold = {float(limit_with_hyst):.3f}x"
+)

 # Tail gate explanation (warn/fail; based on per-window Δlog-loss vs baseline)
 pm_tail = (
-
-if isinstance(
+evaluation_report.get("primary_metric_tail", {})
+if isinstance(evaluation_report.get("primary_metric_tail"), dict)
 else {}
 )
 if pm_tail:
@@ -191,11 +220,18 @@ def explain_gates_command(
 except Exception:
 pass

-# Drift gate explanation
+# Drift gate explanation (ppl-like kinds only)
 drift = None
-
-
-
+drift_status = (
+"PASS" if bool(validation.get("preview_final_drift_acceptable")) else "FAIL"
+)
+pm = (
+evaluation_report.get("primary_metric", {})
+if isinstance(evaluation_report.get("primary_metric"), dict)
+else {}
+)
+kind = str(pm.get("kind", "") or "").lower()
+if kind.startswith("ppl"):
 preview = pm.get("preview")
 final = pm.get("final")
 if isinstance(preview, int | float) and isinstance(final, int | float):
@@ -204,24 +240,31 @@ def explain_gates_command(
 drift = float(final) / float(preview)
 except Exception:
 drift = None
-
-"
-
-console.print("\n[bold]Gate: Drift (final/preview)[/bold]")
-if isinstance(drift, int | float):
-if isinstance(drift_ci, tuple | list) and len(drift_ci) == 2:
-console.print(
-f" observed: {drift:.3f} (CI {drift_ci[0]:.3f}-{drift_ci[1]:.3f})"
-)
-else:
+
+console.print("\n[bold]Gate: Drift (final/preview)[/bold]")
+if isinstance(drift, int | float):
 console.print(f" observed: {drift:.3f}")
-
-
+drift_band = (
+pm.get("drift_band") if isinstance(pm.get("drift_band"), dict) else {}
+)
+drift_min = drift_band.get("min")
+drift_max = drift_band.get("max")
+if not (
+isinstance(drift_min, int | float)
+and isinstance(drift_max, int | float)
+and math.isfinite(float(drift_min))
+and math.isfinite(float(drift_max))
+and float(drift_min) > 0.0
+and float(drift_min) < float(drift_max)
+):
+drift_min, drift_max = PM_DRIFT_BAND_DEFAULT
+console.print(f" threshold: {float(drift_min):.3f}-{float(drift_max):.3f}")
+console.print(f" status: {drift_status}")

 # Guard Overhead explanation (if present)
 overhead = (
-
-if isinstance(
+evaluation_report.get("guard_overhead", {})
+if isinstance(evaluation_report.get("guard_overhead"), dict)
 else {}
 )
 if overhead:
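The rewritten drift gate above falls back to `PM_DRIFT_BAND_DEFAULT` whenever the report's `drift_band` is missing or malformed (non-finite values, a non-positive minimum, or min ≥ max). A standalone sketch of that sanity check, using a placeholder default band rather than the library's actual constant:

```python
# Standalone sketch of the drift-band sanity check from the diff above.
# PM_DRIFT_BAND_DEFAULT here is a placeholder value, not the package's constant.
import math

PM_DRIFT_BAND_DEFAULT = (0.9, 1.1)  # assumed (min, max) shape


def resolve_drift_band(band: dict | None) -> tuple[float, float]:
    band = band if isinstance(band, dict) else {}
    lo, hi = band.get("min"), band.get("max")
    ok = (
        isinstance(lo, (int, float))
        and isinstance(hi, (int, float))
        and math.isfinite(float(lo))
        and math.isfinite(float(hi))
        and float(lo) > 0.0
        and float(lo) < float(hi)
    )
    return (float(lo), float(hi)) if ok else PM_DRIFT_BAND_DEFAULT


# resolve_drift_band({"min": 0.95, "max": 1.05}) -> (0.95, 1.05)
# resolve_drift_band({"min": -1, "max": 0})      -> falls back to the default band
```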
invarlock/cli/commands/export_html.py

@@ -2,7 +2,7 @@
 InvarLock HTML Export
 =================

-Thin wrapper over the HTML
+Thin wrapper over the HTML evaluation report renderer to make exporting
 discoverable and scriptable.
 """

@@ -20,7 +20,9 @@ console = Console()


 def export_html_command(
-input: str = typer.Option(
+input: str = typer.Option(
+..., "--input", "-i", help="Path to evaluation report JSON"
+),
 output: str = typer.Option(..., "--output", "-o", help="Path to output HTML file"),
 embed_css: bool = typer.Option(
 True,
@@ -31,12 +33,12 @@ def export_html_command(
 False, "--force", help="Overwrite output file if it already exists"
 ),
 ) -> None:
-"""Render
+"""Render an evaluation report JSON to HTML.

 Exit codes:
 - 0: success
 - 1: generic failure (IO or overwrite refusal)
-- 2: validation failure (invalid
+- 2: validation failure (invalid evaluation report schema)
 """
 # When called programmatically, Typer's Option defaults can be OptionInfo
 try: # pragma: no cover - defensive, matches other commands' pattern
@@ -70,12 +72,12 @@ def export_html_command(
 raise typer.Exit(1) from exc

 try:
-from invarlock.reporting.html import
+from invarlock.reporting.html import render_report_html

-html =
+html = render_report_html(payload)
 except ValueError as exc:
-#
-console.print(f"[red]❌
+# Evaluation report validation failed upstream
+console.print(f"[red]❌ Evaluation report validation failed: {exc}[/red]")
 raise typer.Exit(2) from exc
 except Exception as exc:
 console.print(f"[red]❌ Failed to render HTML: {exc}[/red]")
@@ -94,7 +96,7 @@ def export_html_command(
 console.print(f"[red]❌ Failed to write output file: {exc}[/red]")
 raise typer.Exit(1) from exc

-console.print(f"✅ Exported
+console.print(f"✅ Exported evaluation report HTML → {out_path}")


 __all__ = ["export_html_command"]