invarlock-0.3.7-py3-none-any.whl → invarlock-0.3.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. invarlock/__init__.py +3 -3
  2. invarlock/adapters/auto.py +2 -10
  3. invarlock/adapters/hf_loading.py +7 -7
  4. invarlock/adapters/hf_mixin.py +28 -5
  5. invarlock/assurance/__init__.py +15 -23
  6. invarlock/calibration/spectral_null.py +1 -1
  7. invarlock/cli/adapter_auto.py +1 -5
  8. invarlock/cli/app.py +57 -27
  9. invarlock/cli/commands/__init__.py +2 -2
  10. invarlock/cli/commands/calibrate.py +48 -4
  11. invarlock/cli/commands/{certify.py → evaluate.py} +69 -46
  12. invarlock/cli/commands/explain_gates.py +94 -51
  13. invarlock/cli/commands/export_html.py +11 -9
  14. invarlock/cli/commands/report.py +121 -47
  15. invarlock/cli/commands/run.py +274 -66
  16. invarlock/cli/commands/verify.py +84 -89
  17. invarlock/cli/determinism.py +1 -1
  18. invarlock/cli/provenance.py +3 -3
  19. invarlock/core/bootstrap.py +1 -1
  20. invarlock/core/retry.py +14 -14
  21. invarlock/core/runner.py +1 -1
  22. invarlock/edits/noop.py +2 -2
  23. invarlock/edits/quant_rtn.py +2 -2
  24. invarlock/eval/__init__.py +1 -1
  25. invarlock/eval/bench.py +11 -7
  26. invarlock/eval/primary_metric.py +1 -1
  27. invarlock/guards/spectral.py +2 -2
  28. invarlock/guards_ref/spectral_ref.py +1 -1
  29. invarlock/model_profile.py +16 -35
  30. invarlock/observability/health.py +38 -20
  31. invarlock/plugins/hf_bnb_adapter.py +32 -21
  32. invarlock/reporting/__init__.py +18 -4
  33. invarlock/reporting/html.py +7 -7
  34. invarlock/reporting/normalizer.py +2 -2
  35. invarlock/reporting/policy_utils.py +1 -1
  36. invarlock/reporting/primary_metric_utils.py +11 -11
  37. invarlock/reporting/render.py +126 -120
  38. invarlock/reporting/report.py +43 -37
  39. invarlock/reporting/{certificate.py → report_builder.py} +103 -99
  40. invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
  41. invarlock-0.3.9.dist-info/METADATA +303 -0
  42. {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/RECORD +46 -46
  43. {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/WHEEL +1 -1
  44. invarlock-0.3.7.dist-info/METADATA +0 -602
  45. {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/entry_points.txt +0 -0
  46. {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/licenses/LICENSE +0 -0
  47. {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/top_level.txt +0 -0
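Taken together, the renames above replace the certification workflow with an evaluation workflow: `certify.py` becomes `evaluate.py`, `certificate.py`/`certificate_schema.py` become `report_builder.py`/`report_schema.py`, and the default output directory moves from `reports/cert` to `reports/eval`. As a hedged sketch of the CLI change (flag names are taken from the diff below; paths are placeholders, and the old subcommand spelling is inferred from the renamed module):

    invarlock certify  --baseline <baseline-dir> --subject <subject-dir> --cert-out reports/cert     # 0.3.7
    invarlock evaluate --baseline <baseline-dir> --subject <subject-dir> --report-out reports/eval   # 0.3.9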
invarlock/cli/commands/{certify.py → evaluate.py}

@@ -1,15 +1,15 @@
  """
- InvarLock CLI Certify Command
+ InvarLock CLI Evaluate Command
  =========================

- Hero path: Compare & Certify (BYOE). Provide baseline (`--baseline`) and
+ Hero path: Compare & Evaluate (BYOE). Provide baseline (`--baseline`) and
  subject (`--subject`) checkpoints and InvarLock will run paired windows and emit a
- certificate. Optionally, pass `--edit-config` to run the built‑in quant_rtn demo.
+ evaluation report. Optionally, pass `--edit-config` to run the built‑in quant_rtn demo.

  Steps:
  1) Baseline (no-op edit) on baseline model
  2) Subject (no-op or provided edit config) on subject model with --baseline pairing
- 3) Emit certificate via `invarlock report --format cert`
+ 3) Emit evaluation report via `invarlock report --format report`
  """

  from __future__ import annotations
@@ -18,6 +18,7 @@ import inspect
  import io
  import json
  import math
+ import os
  from collections.abc import Iterator
  from contextlib import contextmanager
  from pathlib import Path
@@ -60,7 +61,7 @@ def _render_banner_lines(title: str, context: str) -> list[str]:
  def _print_header_banner(
  console: Console, *, version: str, profile: str, tier: str, adapter: str
  ) -> None:
- title = f"INVARLOCK v{version} · Certification Pipeline"
+ title = f"INVARLOCK v{version} · Evaluation Pipeline"
  context = f"Profile: {profile} · Tier: {tier} · Adapter: {adapter}"
  for line in _render_banner_lines(title, context):
  console.print(line)
@@ -128,32 +129,32 @@ def _suppress_child_output(enabled: bool) -> Iterator[io.StringIO | None]:

  def _print_quiet_summary(
  *,
- cert_out: Path,
+ report_out: Path,
  source: str,
  edited: str,
  profile: str,
  ) -> None:
- cert_path = cert_out / "evaluation.cert.json"
- console.print(f"INVARLOCK v{INVARLOCK_VERSION} · CERTIFY")
+ report_path = report_out / "evaluation.report.json"
+ console.print(f"INVARLOCK v{INVARLOCK_VERSION} · EVALUATE")
  console.print(f"Baseline: {source} -> Subject: {edited} · Profile: {profile}")
- if not cert_path.exists():
- console.print(f"Output: {cert_out}")
+ if not report_path.exists():
+ console.print(f"Output: {report_out}")
  return
  try:
- with cert_path.open("r", encoding="utf-8") as fh:
- certificate = json.load(fh)
+ with report_path.open("r", encoding="utf-8") as fh:
+ evaluation_report = json.load(fh)
  except Exception:
- console.print(f"Output: {cert_path}")
+ console.print(f"Output: {report_path}")
  return
- if not isinstance(certificate, dict):
- console.print(f"Output: {cert_path}")
+ if not isinstance(evaluation_report, dict):
+ console.print(f"Output: {report_path}")
  return
  try:
  from invarlock.reporting.render import (
  compute_console_validation_block as _console_block,
  )

- block = _console_block(certificate)
+ block = _console_block(evaluation_report)
  rows = block.get("rows", [])
  total = len(rows) if isinstance(rows, list) else 0
  passed = (
@@ -165,13 +166,13 @@ def _print_quiet_summary(
  passed = 0
  status = "UNKNOWN"
  pm_ratio = _format_ratio(
- (certificate.get("primary_metric") or {}).get("ratio_vs_baseline")
+ (evaluation_report.get("primary_metric") or {}).get("ratio_vs_baseline")
  )
  gate_summary = f"{passed}/{total} passed" if total else "N/A"
  console.print(f"Status: {status} · Gates: {gate_summary}")
  if pm_ratio != "N/A":
  console.print(f"Primary metric ratio: {pm_ratio}")
- console.print(f"Output: {cert_path}")
+ console.print(f"Output: {report_path}")


  def _latest_run_report(run_root: Path) -> Path | None:
@@ -206,6 +207,23 @@ def _dump_yaml(path: Path, data: dict[str, Any]) -> None:
  yaml.safe_dump(data, fh, sort_keys=False)


+ def _resolve_evaluate_tmp_dir() -> Path:
+ """Return the on-disk scratch directory for `invarlock evaluate`.
+
+ Evaluate generates merged YAML configs for baseline/subject runs so
+ downstream `invarlock run` flows remain traceable. We keep these files
+ under `./tmp/.evaluate` by default to avoid cluttering the working tree.
+ """
+
+ candidate = os.environ.get("INVARLOCK_EVALUATE_TMP_DIR")
+ if candidate:
+ tmp_dir = Path(candidate).expanduser()
+ else:
+ tmp_dir = Path("tmp") / ".evaluate"
+ tmp_dir.mkdir(parents=True, exist_ok=True)
+ return tmp_dir
+
+
  def _normalize_model_id(model_id: str, adapter_name: str) -> str:
  """Normalize model identifiers for adapters.

@@ -221,7 +239,7 @@ def _normalize_model_id(model_id: str, adapter_name: str) -> str:
  return mid


- def certify_command(
+ def evaluate_command(
  # Primary names for programmatic/test compatibility
  source: str = typer.Option(
  ..., "--source", "--baseline", help="Baseline model dir or Hub ID"
@@ -258,8 +276,8 @@ def certify_command(
  ),
  ),
  out: str = typer.Option("runs", "--out", help="Base output directory"),
- cert_out: str = typer.Option(
- "reports/cert", "--cert-out", help="Certificate output directory"
+ report_out: str = typer.Option(
+ "reports/eval", "--report-out", help="Evaluation report output directory"
  ),
  edit_config: str | None = typer.Option(
  None, "--edit-config", help="Edit preset to apply a demo edit (quant_rtn)"
@@ -290,7 +308,7 @@
  False, "--no-color", help="Disable ANSI colors (respects NO_COLOR=1)"
  ),
  ):
- """Certify two checkpoints (baseline vs subject) with pinned windows."""
+ """Evaluate two checkpoints (baseline vs subject) with pinned windows."""
  # Support programmatic calls and Typer-invoked calls uniformly
  try:
  from typer.models import OptionInfo as _TyperOptionInfo
@@ -311,7 +329,7 @@
  tier = _coerce_option(tier, "balanced")
  preset = _coerce_option(preset)
  out = _coerce_option(out, "runs")
- cert_out = _coerce_option(cert_out, "reports/cert")
+ report_out = _coerce_option(report_out, "reports/eval")
  edit_config = _coerce_option(edit_config)
  edit_label = _coerce_option(edit_label)
  quiet = _coerce_option(quiet, False)
@@ -410,7 +428,7 @@
  "stride": 512,
  "preview_n": 64,
  "final_n": 64,
- "seed": 42,
+ "seed": 43,
  }
  }
  else:
@@ -424,7 +442,7 @@
  )
  raise typer.Exit(1)
  preset_data = _load_yaml(preset_path)
- # Do not hard-code device from presets in auto-generated certify configs;
+ # Do not hard-code device from presets in auto-generated evaluate configs;
  # allow device resolution to pick CUDA/MPS/CPU via 'auto' or CLI overrides.
  model_block = preset_data.get("model")
  if isinstance(model_block, dict) and "device" in model_block:
@@ -575,8 +593,7 @@
  elif not edit_config:
  subject_label = "custom" if norm_src_id != norm_edt_id else "noop"

- tmp_dir = Path(".certify_tmp")
- tmp_dir.mkdir(parents=True, exist_ok=True)
+ tmp_dir = _resolve_evaluate_tmp_dir()

  baseline_report_path: Path
  if baseline_report:
@@ -635,7 +652,7 @@
  baseline_report_path = baseline_report_path_candidate
  _debug(f"Baseline report: {baseline_report_path}")

- # Edited run: either no-op (Compare & Certify) or provided edit_config (demo edit)
+ # Edited run: either no-op (Compare & Evaluate) or provided edit_config (demo edit)
  _phase(2, 3, "SUBJECT EVALUATION")
  if edit_config:
  edited_yaml = Path(edit_config)
@@ -704,8 +721,6 @@
  )

  # Persist a temporary merged config for traceability
- tmp_dir = Path(".certify_tmp")
- tmp_dir.mkdir(parents=True, exist_ok=True)
  edited_merged_yaml = tmp_dir / "edited_merged.yaml"
  _dump_yaml(edited_merged_yaml, merged_edited_cfg)
  _debug(f"Edited config (merged): {edited_merged_yaml}")
@@ -754,7 +769,7 @@
  )
  edited_yaml = tmp_dir / "edited_noop.yaml"
  _dump_yaml(edited_yaml, edited_cfg)
- _info("Running edited (no-op, Compare & Certify)", tag="EXEC", emoji="🧪")
+ _info("Running edited (no-op, Compare & Evaluate)", tag="EXEC", emoji="🧪")
  _debug(f"Edited config: {edited_yaml}")
  from .run import run_command as _run

@@ -799,28 +814,36 @@
  raise typer.Exit(1)
  _debug(f"Edited report: {edited_report}")

- _phase(3, 3, "CERTIFICATE GENERATION")
+ _phase(3, 3, "EVALUATION REPORT GENERATION")

- def _emit_certificate() -> None:
- _info("Emitting certificate", tag="EXEC", emoji="📜")
+ def _emit_evaluation_report() -> None:
+ _info("Emitting evaluation report", tag="EXEC", emoji="📜")
  with _suppress_child_output(verbosity == VERBOSITY_QUIET) as quiet_buffer:
  try:
  with timed_step(
  console=console,
  style=output_style,
  timings=timings,
- key="certificate",
+ key="evaluation_report",
  tag="EXEC",
- message="Certificate",
+ message="Evaluation Report",
  emoji="📜",
  ):
+ # Use a wall-clock perf counter here (not the output module's
+ # test-patched counter) so timing tests remain deterministic.
+ from time import perf_counter as _wall_perf_counter
+
+ report_start = _wall_perf_counter()
  report_kwargs = {
  "run": str(edited_report),
- "format": "cert",
+ "format": "report",
  "baseline": str(baseline_report_path),
- "output": cert_out,
+ "output": report_out,
  "style": output_style.name,
  "no_color": no_color,
+ "summary_baseline_seconds": float(timings.get("baseline", 0.0)),
+ "summary_subject_seconds": float(timings.get("subject", 0.0)),
+ "summary_report_start": float(report_start),
  }
  try:
  sig = inspect.signature(_report)
@@ -890,7 +913,7 @@
  else None
  ) or "unknown"

- # Enforce only when a primary_metric block is present; allow degraded-but-flagged metrics to emit certificates, but fail the task.
+ # Enforce only when a primary_metric block is present; allow degraded-but-flagged metrics to emit evaluation reports, but fail the task.
  has_metric_block = isinstance(pm, dict) and bool(pm)
  if has_metric_block:
  degraded = bool(pm.get("invalid") or pm.get("degraded"))
@@ -906,7 +929,7 @@
  print_event(
  console,
  "WARN",
- "Primary metric degraded or non-finite; emitting certificate and marking task degraded. Primary metric computation failed.",
+ "Primary metric degraded or non-finite; emitting evaluation report and marking task degraded. Primary metric computation failed.",
  style=output_style,
  emoji="⚠️",
  )
@@ -919,8 +942,8 @@
  metrics["primary_metric"] = pm
  edited_payload.setdefault("metrics", {}).update(metrics)

- # Emit the certificate for inspection, then exit with a CI-visible error.
- _emit_certificate()
+ # Emit the evaluation report for inspection, then exit with a CI-visible error.
+ _emit_evaluation_report()
  err = MetricsError(
  code="E111",
  message=f"Primary metric degraded or non-finite ({degraded_reason}).",
@@ -933,7 +956,7 @@
  )
  raise typer.Exit(_resolve_exit_code(err, profile=profile))

- _emit_certificate()
+ _emit_evaluation_report()
  if timing:
  if total_start is not None:
  timings["total"] = max(0.0, float(perf_counter() - total_start))
@@ -941,7 +964,7 @@
  timings["total"] = (
  float(timings.get("baseline", 0.0))
  + float(timings.get("subject", 0.0))
- + float(timings.get("certificate", 0.0))
+ + float(timings.get("evaluation_report", 0.0))
  )
  print_timing_summary(
  console,
@@ -950,13 +973,13 @@
  order=[
  ("Baseline", "baseline"),
  ("Subject", "subject"),
- ("Certificate", "certificate"),
+ ("Evaluation Report", "evaluation_report"),
  ("Total", "total"),
  ],
  )
  if verbosity == VERBOSITY_QUIET:
  _print_quiet_summary(
- cert_out=Path(cert_out),
+ report_out=Path(report_out),
  source=src_id,
  edited=edt_id,
  profile=profile,
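The quiet summary above now reads `evaluation.report.json` (instead of `evaluation.cert.json`) from the report output directory and surfaces `primary_metric.ratio_vs_baseline`. A minimal sketch of inspecting that file directly, assuming the default `--report-out` of `reports/eval`:

    # Minimal sketch, assuming the default --report-out of reports/eval shown above.
    import json
    from pathlib import Path

    report_path = Path("reports/eval") / "evaluation.report.json"
    evaluation_report = json.loads(report_path.read_text(encoding="utf-8"))

    # Same field the quiet summary reads for its "Primary metric ratio" line.
    pm = evaluation_report.get("primary_metric") or {}
    print("ratio_vs_baseline:", pm.get("ratio_vs_baseline"))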
invarlock/cli/commands/explain_gates.py

@@ -1,13 +1,17 @@
  from __future__ import annotations

  import json
+ import math
  from pathlib import Path

  import typer
  from rich.console import Console

  from invarlock.core.auto_tuning import get_tier_policies
- from invarlock.reporting.certificate import make_certificate
+ from invarlock.reporting.report_builder import (
+ PM_DRIFT_BAND_DEFAULT,
+ make_report,
+ )

  console = Console()

@@ -18,9 +22,9 @@ def explain_gates_command(
  ..., "--baseline", help="Path to baseline report.json"
  ),
  ) -> None:
- """Explain certificate gates for a report vs baseline.
+ """Explain evaluation report gates for a report vs baseline.

- Loads the reports, builds a certificate, and prints gate thresholds,
+ Loads the reports, builds an evaluation report, and prints gate thresholds,
  observed statistics, and pass/fail reasons in a compact, readable form.
  """
  report_path = Path(report)
@@ -36,22 +40,22 @@
  console.print(f"[red]Failed to load inputs: {exc}[/red]")
  raise typer.Exit(1) from exc

- cert = make_certificate(report_data, baseline_data)
+ evaluation_report = make_report(report_data, baseline_data)
  validation = (
- cert.get("validation", {}) if isinstance(cert.get("validation"), dict) else {}
+ evaluation_report.get("validation", {})
+ if isinstance(evaluation_report.get("validation"), dict)
+ else {}
  )

  # Extract tier + metric policy (floors/hysteresis)
- tier = str((cert.get("auto", {}) or {}).get("tier", "balanced")).lower()
- tier_thresholds = {
- "conservative": 1.05,
- "balanced": 1.10,
- "aggressive": 1.20,
- "none": 1.10,
- }
+ tier = str(
+ (evaluation_report.get("auto", {}) or {}).get("tier", "balanced")
+ ).lower()
+ tier_policies = get_tier_policies()
+ tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
  resolved_policy = (
- cert.get("resolved_policy", {})
- if isinstance(cert.get("resolved_policy"), dict)
+ evaluation_report.get("resolved_policy", {})
+ if isinstance(evaluation_report.get("resolved_policy"), dict)
  else {}
  )
  metrics_policy = (
@@ -60,8 +64,6 @@
  else {}
  )
  if not metrics_policy:
- tier_policies = get_tier_policies()
- tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
  metrics_policy = (
  tier_defaults.get("metrics", {}) if isinstance(tier_defaults, dict) else {}
  )
@@ -75,15 +77,37 @@
  hysteresis_ratio = float(pm_policy.get("hysteresis_ratio", 0.0))
  min_tokens = int(pm_policy.get("min_tokens", 0))
  try:
- limit_base = float(
- pm_policy.get("ratio_limit_base", tier_thresholds.get(tier, 1.10))
- or tier_thresholds.get(tier, 1.10)
- )
+ limit_base = float(pm_policy.get("ratio_limit_base"))
  except Exception:
- limit_base = tier_thresholds.get(tier, 1.10)
- limit_with_hyst = limit_base + max(0.0, hysteresis_ratio)
+ limit_base = None
+ if limit_base is None or not isinstance(limit_base, int | float):
+ limit_base = None
+ elif not float("-inf") < float(limit_base) < float("inf"):
+ limit_base = None
+ if limit_base is None:
+ try:
+ fallback = (
+ tier_defaults.get("metrics", {})
+ if isinstance(tier_defaults, dict)
+ else {}
+ )
+ fallback_pm = (
+ fallback.get("pm_ratio", {}) if isinstance(fallback, dict) else {}
+ )
+ limit_base = float(fallback_pm.get("ratio_limit_base"))
+ except Exception:
+ limit_base = None
+ limit_with_hyst = (
+ float(limit_base) + max(0.0, hysteresis_ratio)
+ if isinstance(limit_base, int | float)
+ else None
+ )
  tokens_ok = True
- telem = cert.get("telemetry", {}) if isinstance(cert.get("telemetry"), dict) else {}
+ telem = (
+ evaluation_report.get("telemetry", {})
+ if isinstance(evaluation_report.get("telemetry"), dict)
+ else {}
+ )
  try:
  total_tokens = int(telem.get("preview_total_tokens", 0)) + int(
  telem.get("final_total_tokens", 0)
@@ -95,8 +119,8 @@
  # Primary-metric ratio gate explanation (ppl-like kinds shown as ratios)
  ratio = None
  ratio_ci = None
- if isinstance(cert.get("primary_metric"), dict):
- pm = cert.get("primary_metric", {})
+ if isinstance(evaluation_report.get("primary_metric"), dict):
+ pm = evaluation_report.get("primary_metric", {})
  ratio = pm.get("ratio_vs_baseline")
  ratio_ci = pm.get("display_ci")
  hysteresis_applied = bool(validation.get("hysteresis_applied"))
@@ -110,21 +134,26 @@
  )
  else:
  console.print(f" observed: {ratio:.3f}x")
- console.print(
- f" threshold: {limit_base:.2f}x{(f' (+hysteresis {hysteresis_ratio:.3f})' if hysteresis_ratio else '')}"
- )
+ if isinstance(limit_base, int | float):
+ hyst_suffix = (
+ f" (+hysteresis {hysteresis_ratio:.3f})" if hysteresis_ratio else ""
+ )
+ console.print(f" threshold: ≤ {float(limit_base):.2f}x{hyst_suffix}")
+ else:
+ console.print(" threshold: unavailable")
  console.print(
  f" tokens: {'ok' if tokens_ok else 'below floor'} (token floors: min_tokens={min_tokens or 0}, total={int(telem.get('preview_total_tokens', 0)) + int(telem.get('final_total_tokens', 0)) if telem else 0})"
  )
  if hysteresis_applied:
- console.print(
- f" note: hysteresis applied → effective threshold = {limit_with_hyst:.3f}x"
- )
+ if isinstance(limit_with_hyst, int | float):
+ console.print(
+ f" note: hysteresis applied → effective threshold = {float(limit_with_hyst):.3f}x"
+ )

  # Tail gate explanation (warn/fail; based on per-window Δlog-loss vs baseline)
  pm_tail = (
- cert.get("primary_metric_tail", {})
- if isinstance(cert.get("primary_metric_tail"), dict)
+ evaluation_report.get("primary_metric_tail", {})
+ if isinstance(evaluation_report.get("primary_metric_tail"), dict)
  else {}
  )
  if pm_tail:
@@ -191,11 +220,18 @@
  except Exception:
  pass

- # Drift gate explanation
+ # Drift gate explanation (ppl-like kinds only)
  drift = None
- drift_ci = None
- if isinstance(cert.get("primary_metric"), dict):
- pm = cert.get("primary_metric", {})
+ drift_status = (
+ "PASS" if bool(validation.get("preview_final_drift_acceptable")) else "FAIL"
+ )
+ pm = (
+ evaluation_report.get("primary_metric", {})
+ if isinstance(evaluation_report.get("primary_metric"), dict)
+ else {}
+ )
+ kind = str(pm.get("kind", "") or "").lower()
+ if kind.startswith("ppl"):
  preview = pm.get("preview")
  final = pm.get("final")
  if isinstance(preview, int | float) and isinstance(final, int | float):
@@ -204,24 +240,31 @@
  drift = float(final) / float(preview)
  except Exception:
  drift = None
- drift_status = (
- "PASS" if bool(validation.get("preview_final_drift_acceptable")) else "FAIL"
- )
- console.print("\n[bold]Gate: Drift (final/preview)[/bold]")
- if isinstance(drift, int | float):
- if isinstance(drift_ci, tuple | list) and len(drift_ci) == 2:
- console.print(
- f" observed: {drift:.3f} (CI {drift_ci[0]:.3f}-{drift_ci[1]:.3f})"
- )
- else:
+
+ console.print("\n[bold]Gate: Drift (final/preview)[/bold]")
+ if isinstance(drift, int | float):
  console.print(f" observed: {drift:.3f}")
- console.print(" threshold: 0.95-1.05")
- console.print(f" status: {drift_status}")
+ drift_band = (
+ pm.get("drift_band") if isinstance(pm.get("drift_band"), dict) else {}
+ )
+ drift_min = drift_band.get("min")
+ drift_max = drift_band.get("max")
+ if not (
+ isinstance(drift_min, int | float)
+ and isinstance(drift_max, int | float)
+ and math.isfinite(float(drift_min))
+ and math.isfinite(float(drift_max))
+ and float(drift_min) > 0.0
+ and float(drift_min) < float(drift_max)
+ ):
+ drift_min, drift_max = PM_DRIFT_BAND_DEFAULT
+ console.print(f" threshold: {float(drift_min):.3f}-{float(drift_max):.3f}")
+ console.print(f" status: {drift_status}")

  # Guard Overhead explanation (if present)
  overhead = (
- cert.get("guard_overhead", {})
- if isinstance(cert.get("guard_overhead"), dict)
+ evaluation_report.get("guard_overhead", {})
+ if isinstance(evaluation_report.get("guard_overhead"), dict)
  else {}
  )
  if overhead:
invarlock/cli/commands/export_html.py

@@ -2,7 +2,7 @@
  InvarLock HTML Export
  =================

- Thin wrapper over the HTML certificate renderer to make exporting
+ Thin wrapper over the HTML evaluation report renderer to make exporting
  discoverable and scriptable.
  """

@@ -20,7 +20,9 @@ console = Console()


  def export_html_command(
- input: str = typer.Option(..., "--input", "-i", help="Path to certificate JSON"),
+ input: str = typer.Option(
+ ..., "--input", "-i", help="Path to evaluation report JSON"
+ ),
  output: str = typer.Option(..., "--output", "-o", help="Path to output HTML file"),
  embed_css: bool = typer.Option(
  True,
@@ -31,12 +33,12 @@
  False, "--force", help="Overwrite output file if it already exists"
  ),
  ) -> None:
- """Render a certificate JSON to HTML.
+ """Render an evaluation report JSON to HTML.

  Exit codes:
  - 0: success
  - 1: generic failure (IO or overwrite refusal)
- - 2: validation failure (invalid certificate schema)
+ - 2: validation failure (invalid evaluation report schema)
  """
  # When called programmatically, Typer's Option defaults can be OptionInfo
  try: # pragma: no cover - defensive, matches other commands' pattern
@@ -70,12 +72,12 @@
  raise typer.Exit(1) from exc

  try:
- from invarlock.reporting.html import render_certificate_html
+ from invarlock.reporting.html import render_report_html

- html = render_certificate_html(payload)
+ html = render_report_html(payload)
  except ValueError as exc:
- # Certificate validation failed upstream
- console.print(f"[red]❌ Certificate validation failed: {exc}[/red]")
+ # Evaluation report validation failed upstream
+ console.print(f"[red]❌ Evaluation report validation failed: {exc}[/red]")
  raise typer.Exit(2) from exc
  except Exception as exc:
  console.print(f"[red]❌ Failed to render HTML: {exc}[/red]")
@@ -94,7 +96,7 @@
  console.print(f"[red]❌ Failed to write output file: {exc}[/red]")
  raise typer.Exit(1) from exc

- console.print(f"✅ Exported certificate HTML → {out_path}")
+ console.print(f"✅ Exported evaluation report HTML → {out_path}")


  __all__ = ["export_html_command"]
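On the rendering side, `render_certificate_html` is now `render_report_html`, and it still raises `ValueError` when the payload fails schema validation (exit code 2 in the command above). A minimal programmatic sketch, assuming an evaluation report at the default location; the output filename here is hypothetical:

    # Hedged sketch of the 0.3.9 API as it appears in this diff.
    import json
    from pathlib import Path

    from invarlock.reporting.html import render_report_html

    payload = json.loads(Path("reports/eval/evaluation.report.json").read_text(encoding="utf-8"))
    try:
        html = render_report_html(payload)
    except ValueError as exc:  # raised when the evaluation report schema is invalid
        raise SystemExit(f"invalid evaluation report: {exc}")
    # Hypothetical output path; the CLI command writes wherever --output points.
    Path("reports/eval/evaluation.report.html").write_text(html, encoding="utf-8")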