invarlock 0.3.6-py3-none-any.whl → 0.3.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. invarlock/__init__.py +4 -4
  2. invarlock/adapters/__init__.py +10 -14
  3. invarlock/adapters/auto.py +37 -50
  4. invarlock/adapters/capabilities.py +2 -2
  5. invarlock/adapters/hf_causal.py +418 -0
  6. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  7. invarlock/adapters/hf_loading.py +7 -7
  8. invarlock/adapters/hf_mixin.py +53 -9
  9. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  10. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  11. invarlock/assurance/__init__.py +15 -23
  12. invarlock/cli/adapter_auto.py +32 -26
  13. invarlock/cli/app.py +128 -27
  14. invarlock/cli/commands/__init__.py +2 -2
  15. invarlock/cli/commands/calibrate.py +48 -4
  16. invarlock/cli/commands/doctor.py +8 -10
  17. invarlock/cli/commands/evaluate.py +986 -0
  18. invarlock/cli/commands/explain_gates.py +25 -17
  19. invarlock/cli/commands/export_html.py +11 -9
  20. invarlock/cli/commands/plugins.py +13 -9
  21. invarlock/cli/commands/report.py +326 -92
  22. invarlock/cli/commands/run.py +1160 -228
  23. invarlock/cli/commands/verify.py +157 -97
  24. invarlock/cli/config.py +1 -1
  25. invarlock/cli/determinism.py +1 -1
  26. invarlock/cli/doctor_helpers.py +4 -5
  27. invarlock/cli/output.py +193 -0
  28. invarlock/cli/provenance.py +4 -4
  29. invarlock/core/bootstrap.py +1 -1
  30. invarlock/core/registry.py +9 -11
  31. invarlock/core/retry.py +14 -14
  32. invarlock/core/runner.py +112 -26
  33. invarlock/edits/noop.py +2 -2
  34. invarlock/edits/quant_rtn.py +67 -39
  35. invarlock/eval/__init__.py +1 -1
  36. invarlock/eval/bench.py +14 -10
  37. invarlock/eval/data.py +68 -23
  38. invarlock/eval/metrics.py +59 -1
  39. invarlock/eval/primary_metric.py +1 -1
  40. invarlock/eval/tasks/__init__.py +12 -0
  41. invarlock/eval/tasks/classification.py +48 -0
  42. invarlock/eval/tasks/qa.py +36 -0
  43. invarlock/eval/tasks/text_generation.py +102 -0
  44. invarlock/guards/invariants.py +19 -10
  45. invarlock/guards/rmt.py +2 -2
  46. invarlock/guards/spectral.py +1 -1
  47. invarlock/guards/variance.py +2 -2
  48. invarlock/model_profile.py +64 -62
  49. invarlock/observability/health.py +6 -6
  50. invarlock/observability/metrics.py +108 -0
  51. invarlock/plugins/hf_bnb_adapter.py +32 -21
  52. invarlock/reporting/__init__.py +18 -4
  53. invarlock/reporting/guards_analysis.py +154 -4
  54. invarlock/reporting/html.py +61 -11
  55. invarlock/reporting/normalizer.py +9 -2
  56. invarlock/reporting/policy_utils.py +1 -1
  57. invarlock/reporting/primary_metric_utils.py +11 -11
  58. invarlock/reporting/render.py +876 -510
  59. invarlock/reporting/report.py +72 -30
  60. invarlock/reporting/{certificate.py → report_builder.py} +252 -99
  61. invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
  62. invarlock/reporting/report_types.py +6 -1
  63. invarlock/reporting/telemetry.py +86 -0
  64. invarlock-0.3.8.dist-info/METADATA +283 -0
  65. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/RECORD +69 -64
  66. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/WHEEL +1 -1
  67. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/entry_points.txt +5 -3
  68. invarlock/adapters/hf_gpt2.py +0 -404
  69. invarlock/adapters/hf_llama.py +0 -487
  70. invarlock/cli/commands/certify.py +0 -422
  71. invarlock-0.3.6.dist-info/METADATA +0 -588
  72. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/licenses/LICENSE +0 -0
  73. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/top_level.txt +0 -0
invarlock/cli/commands/evaluate.py (new file)
@@ -0,0 +1,986 @@
+ """
+ InvarLock CLI Evaluate Command
+ =========================
+
+ Hero path: Compare & Evaluate (BYOE). Provide baseline (`--baseline`) and
+ subject (`--subject`) checkpoints and InvarLock will run paired windows and emit an
+ evaluation report. Optionally, pass `--edit-config` to run the built‑in quant_rtn demo.
+
+ Steps:
+ 1) Baseline (no-op edit) on baseline model
+ 2) Subject (no-op or provided edit config) on subject model with --baseline pairing
+ 3) Emit evaluation report via `invarlock report --format report`
+ """
+
+ from __future__ import annotations
+
+ import inspect
+ import io
+ import json
+ import math
+ import os
+ from collections.abc import Iterator
+ from contextlib import contextmanager
+ from pathlib import Path
+ from typing import Any, NoReturn
+
+ import typer
+ from rich.console import Console
+
+ from invarlock import __version__ as INVARLOCK_VERSION
+
+ from ...core.exceptions import MetricsError
+ from ..adapter_auto import resolve_auto_adapter
+ from ..config import _deep_merge as _merge # reuse helper
+
+ # Use the report group's programmatic entry for report generation
+ from .report import report_command as _report
+ from .run import _resolve_exit_code as _resolve_exit_code
+
+ _LAZY_RUN_IMPORT = True
+
+ PHASE_BAR_WIDTH = 67
+ VERBOSITY_QUIET = 0
+ VERBOSITY_DEFAULT = 1
+ VERBOSITY_VERBOSE = 2
+
+ console = Console()
+
+
+ def _render_banner_lines(title: str, context: str) -> list[str]:
+ width = max(len(title), len(context))
+ border = "─" * (width + 2)
+ return [
+ f"┌{border}┐",
+ f"│ {title.ljust(width)} │",
+ f"│ {context.ljust(width)} │",
+ f"└{border}┘",
+ ]
+
+
+ def _print_header_banner(
+ console: Console, *, version: str, profile: str, tier: str, adapter: str
+ ) -> None:
+ title = f"INVARLOCK v{version} · Evaluation Pipeline"
+ context = f"Profile: {profile} · Tier: {tier} · Adapter: {adapter}"
+ for line in _render_banner_lines(title, context):
+ console.print(line)
+
+
+ def _phase_title(index: int, total: int, title: str) -> str:
+ return f"PHASE {index}/{total} · {title}"
+
+
+ def _print_phase_header(console: Console, title: str) -> None:
+ bar_width = max(PHASE_BAR_WIDTH, len(title))
+ bar = "═" * bar_width
+ console.print(bar)
+ console.print(title)
+ console.print(bar)
+
+
+ def _format_ratio(value: Any) -> str:
+ try:
+ val = float(value)
+ except (TypeError, ValueError):
+ return "N/A"
+ if not math.isfinite(val):
+ return "N/A"
+ return f"{val:.3f}"
+
+
+ def _resolve_verbosity(quiet: bool, verbose: bool) -> int:
+ if quiet and verbose:
+ console.print("--quiet and --verbose are mutually exclusive")
+ raise typer.Exit(2)
+ if quiet:
+ return VERBOSITY_QUIET
+ if verbose:
+ return VERBOSITY_VERBOSE
+ return VERBOSITY_DEFAULT
+
+
+ @contextmanager
+ def _override_console(module: Any, new_console: Console) -> Iterator[None]:
+ original_console = getattr(module, "console", None)
+ module.console = new_console
+ try:
+ yield
+ finally:
+ module.console = original_console
+
+
+ @contextmanager
+ def _suppress_child_output(enabled: bool) -> Iterator[io.StringIO | None]:
+ if not enabled:
+ yield None
+ return
+ from . import report as report_mod
+ from . import run as run_mod
+
+ buffer = io.StringIO()
+ quiet_console = Console(file=buffer, force_terminal=False, color_system=None)
+ with (
+ _override_console(run_mod, quiet_console),
+ _override_console(report_mod, quiet_console),
+ ):
+ yield buffer
+
+
+ def _print_quiet_summary(
+ *,
+ report_out: Path,
+ source: str,
+ edited: str,
+ profile: str,
+ ) -> None:
+ report_path = report_out / "evaluation.report.json"
+ console.print(f"INVARLOCK v{INVARLOCK_VERSION} · EVALUATE")
+ console.print(f"Baseline: {source} -> Subject: {edited} · Profile: {profile}")
+ if not report_path.exists():
+ console.print(f"Output: {report_out}")
+ return
+ try:
+ with report_path.open("r", encoding="utf-8") as fh:
+ evaluation_report = json.load(fh)
+ except Exception:
+ console.print(f"Output: {report_path}")
+ return
+ if not isinstance(evaluation_report, dict):
+ console.print(f"Output: {report_path}")
+ return
+ try:
+ from invarlock.reporting.render import (
+ compute_console_validation_block as _console_block,
+ )
+
+ block = _console_block(evaluation_report)
+ rows = block.get("rows", [])
+ total = len(rows) if isinstance(rows, list) else 0
+ passed = (
+ sum(1 for row in rows if row.get("ok")) if isinstance(rows, list) else 0
+ )
+ status = "PASS" if block.get("overall_pass") else "FAIL"
+ except Exception:
+ total = 0
+ passed = 0
+ status = "UNKNOWN"
+ pm_ratio = _format_ratio(
+ (evaluation_report.get("primary_metric") or {}).get("ratio_vs_baseline")
+ )
+ gate_summary = f"{passed}/{total} passed" if total else "N/A"
+ console.print(f"Status: {status} · Gates: {gate_summary}")
+ if pm_ratio != "N/A":
+ console.print(f"Primary metric ratio: {pm_ratio}")
+ console.print(f"Output: {report_path}")
+
+
+ def _latest_run_report(run_root: Path) -> Path | None:
+ if not run_root.exists():
+ return None
+ candidates = sorted([p for p in run_root.iterdir() if p.is_dir()])
+ if not candidates:
+ return None
+ latest = candidates[-1]
+ for f in [latest / "report.json", latest / f"{latest.name}.json"]:
+ if f.exists():
+ return f
+ # Fallback: first JSON in the directory
+ jsons = list(latest.glob("*.json"))
+ return jsons[0] if jsons else None
+
+
+ def _load_yaml(path: Path) -> dict[str, Any]:
+ import yaml
+
+ with path.open("r", encoding="utf-8") as fh:
+ data = yaml.safe_load(fh) or {}
+ if not isinstance(data, dict):
+ raise ValueError("Preset must be a mapping")
+ return data
+
+
+ def _dump_yaml(path: Path, data: dict[str, Any]) -> None:
+ import yaml
+
+ with path.open("w", encoding="utf-8") as fh:
+ yaml.safe_dump(data, fh, sort_keys=False)
+
+
+ def _resolve_evaluate_tmp_dir() -> Path:
+ """Return the on-disk scratch directory for `invarlock evaluate`.
+
+ Evaluate generates merged YAML configs for baseline/subject runs so
+ downstream `invarlock run` flows remain traceable. We keep these files
+ under `./tmp/.evaluate` by default to avoid cluttering the working tree.
+ """
+
+ candidate = os.environ.get("INVARLOCK_EVALUATE_TMP_DIR")
+ if candidate:
+ tmp_dir = Path(candidate).expanduser()
+ else:
+ tmp_dir = Path("tmp") / ".evaluate"
+ tmp_dir.mkdir(parents=True, exist_ok=True)
+ return tmp_dir
+
+
+ def _normalize_model_id(model_id: str, adapter_name: str) -> str:
+ """Normalize model identifiers for adapters.
+
+ - Accepts optional "hf:" prefix for Hugging Face repo IDs and strips it
+ before passing to transformers APIs.
+ """
+ mid = str(model_id or "").strip()
+ try:
+ if str(adapter_name).startswith("hf_") and mid.startswith("hf:"):
+ return mid.split(":", 1)[1]
+ except Exception:
+ pass
+ return mid
+
+
+ def evaluate_command(
+ # Primary names for programmatic/test compatibility
+ source: str = typer.Option(
+ ..., "--source", "--baseline", help="Baseline model dir or Hub ID"
+ ),
+ edited: str = typer.Option(
+ ..., "--edited", "--subject", help="Subject model dir or Hub ID"
+ ),
+ baseline_report: str | None = typer.Option(
+ None,
+ "--baseline-report",
+ help=(
+ "Reuse an existing baseline run report.json (skips baseline evaluation). "
+ "Must include stored evaluation windows (e.g., set INVARLOCK_STORE_EVAL_WINDOWS=1)."
+ ),
+ ),
+ adapter: str = typer.Option(
+ "auto", "--adapter", help="Adapter name or 'auto' to resolve"
+ ),
+ device: str | None = typer.Option(
+ None,
+ "--device",
+ help="Device override for runs (auto|cuda|mps|cpu)",
+ ),
+ profile: str = typer.Option(
+ "ci", "--profile", help="Profile (ci|release|ci_cpu|dev)"
+ ),
+ tier: str = typer.Option("balanced", "--tier", help="Tier label for context"),
+ preset: str | None = typer.Option(
+ None,
+ "--preset",
+ help=(
+ "Universal preset path to use (defaults to causal or masked preset"
+ " based on adapter)"
+ ),
+ ),
+ out: str = typer.Option("runs", "--out", help="Base output directory"),
+ report_out: str = typer.Option(
+ "reports/eval", "--report-out", help="Evaluation report output directory"
+ ),
+ edit_config: str | None = typer.Option(
+ None, "--edit-config", help="Edit preset to apply a demo edit (quant_rtn)"
+ ),
+ edit_label: str | None = typer.Option(
+ None,
+ "--edit-label",
+ help=(
+ "Edit algorithm label for BYOE models. Use 'noop' for baseline, "
+ "'quant_rtn' etc. for built-in edits, 'custom' for pre-edited models."
+ ),
+ ),
+ quiet: bool = typer.Option(
+ False, "--quiet", "-q", help="Minimal output (suppress run/report detail)"
+ ),
+ verbose: bool = typer.Option(
+ False, "--verbose", "-v", help="Verbose output (include debug details)"
+ ),
+ banner: bool = typer.Option(
+ True, "--banner/--no-banner", help="Show header banner"
+ ),
+ style: str = typer.Option("audit", "--style", help="Output style (audit|friendly)"),
+ timing: bool = typer.Option(False, "--timing", help="Show timing summary"),
+ progress: bool = typer.Option(
+ True, "--progress/--no-progress", help="Show progress done messages"
+ ),
+ no_color: bool = typer.Option(
+ False, "--no-color", help="Disable ANSI colors (respects NO_COLOR=1)"
+ ),
+ ):
+ """Evaluate two checkpoints (baseline vs subject) with pinned windows."""
+ # Support programmatic calls and Typer-invoked calls uniformly
+ try:
+ from typer.models import OptionInfo as _TyperOptionInfo
+ except Exception: # pragma: no cover - typer internals may change
+ _TyperOptionInfo = () # type: ignore[assignment]
+
+ def _coerce_option(value, fallback=None):
+ if isinstance(value, _TyperOptionInfo):
+ return getattr(value, "default", fallback)
+ return value if value is not None else fallback
+
+ source = _coerce_option(source)
+ edited = _coerce_option(edited)
+ baseline_report = _coerce_option(baseline_report)
+ adapter = _coerce_option(adapter, "auto")
+ device = _coerce_option(device)
+ profile = _coerce_option(profile, "ci")
+ tier = _coerce_option(tier, "balanced")
+ preset = _coerce_option(preset)
+ out = _coerce_option(out, "runs")
+ report_out = _coerce_option(report_out, "reports/eval")
+ edit_config = _coerce_option(edit_config)
+ edit_label = _coerce_option(edit_label)
+ quiet = _coerce_option(quiet, False)
+ verbose = _coerce_option(verbose, False)
+ banner = _coerce_option(banner, True)
+ style = _coerce_option(style, "audit")
+ timing = bool(_coerce_option(timing, False))
+ progress = bool(_coerce_option(progress, True))
+ no_color = bool(_coerce_option(no_color, False))
+
+ verbosity = _resolve_verbosity(bool(quiet), bool(verbose))
+
+ if verbosity == VERBOSITY_QUIET:
+ progress = False
+ timing = False
+
+ from invarlock.cli.output import (
+ make_console,
+ perf_counter,
+ print_event,
+ print_timing_summary,
+ resolve_output_style,
+ timed_step,
+ )
+
+ output_style = resolve_output_style(
+ style=str(style),
+ profile=str(profile),
+ progress=bool(progress),
+ timing=bool(timing),
+ no_color=bool(no_color),
+ )
+ console = make_console(no_color=not output_style.color)
+ timings: dict[str, float] = {}
+ total_start: float | None = perf_counter() if output_style.timing else None
+
+ def _info(message: str, *, tag: str = "INFO", emoji: str | None = None) -> None:
+ if verbosity >= VERBOSITY_DEFAULT:
+ print_event(console, tag, message, style=output_style, emoji=emoji)
+
+ def _debug(msg: str) -> None:
+ if verbosity >= VERBOSITY_VERBOSE:
+ console.print(msg, markup=False)
+
+ def _fail(message: str, *, exit_code: int = 2) -> NoReturn:
+ print_event(console, "FAIL", message, style=output_style, emoji="❌")
+ raise typer.Exit(exit_code)
+
+ def _phase(index: int, total: int, title: str) -> None:
+ if verbosity >= VERBOSITY_DEFAULT:
+ console.print("")
+ _print_phase_header(console, _phase_title(index, total, title))
+
+ src_id = str(source)
+ edt_id = str(edited)
+
+ # Resolve adapter when requested
+ eff_adapter = adapter
+ adapter_auto = False
+ if str(adapter).strip().lower() in {"auto", "auto_hf"}:
+ eff_adapter = resolve_auto_adapter(src_id)
+ adapter_auto = True
+
+ show_banner = bool(banner) and verbosity >= VERBOSITY_DEFAULT
+ if show_banner:
+ _print_header_banner(
+ console,
+ version=INVARLOCK_VERSION,
+ profile=profile,
+ tier=tier,
+ adapter=str(eff_adapter),
+ )
+ console.print("")
+
+ if adapter_auto:
+ _debug(f"Adapter:auto -> {eff_adapter}")
+
+ # Choose preset. If none provided and repo preset is missing (pip install
+ # scenario), fall back to a minimal built-in universal preset so the
+ # flag-only quick start works without cloning the repo.
+ default_universal = (
+ Path("configs/presets/masked_lm/wikitext2_128.yaml")
+ if eff_adapter == "hf_mlm"
+ else Path("configs/presets/causal_lm/wikitext2_512.yaml")
+ )
+ preset_path = Path(preset) if preset is not None else default_universal
+
+ preset_data: dict[str, Any]
+ if preset is None and not preset_path.exists():
+ # Inline minimal preset (wikitext2 universal) for pip installs
+ preset_data = {
+ "dataset": {
+ "provider": "wikitext2",
+ "split": "validation",
+ "seq_len": 512,
+ "stride": 512,
+ "preview_n": 64,
+ "final_n": 64,
+ "seed": 43,
+ }
+ }
+ else:
+ if not preset_path.exists():
+ print_event(
+ console,
+ "FAIL",
+ f"Preset not found: {preset_path}",
+ style=output_style,
+ emoji="❌",
+ )
+ raise typer.Exit(1)
+ preset_data = _load_yaml(preset_path)
+ # Do not hard-code device from presets in auto-generated evaluate configs;
+ # allow device resolution to pick CUDA/MPS/CPU via 'auto' or CLI overrides.
+ model_block = preset_data.get("model")
+ if isinstance(model_block, dict) and "device" in model_block:
+ model_block = dict(model_block)
+ model_block.pop("device", None)
+ preset_data["model"] = model_block
+
+ default_guards_order = ["invariants", "spectral", "rmt", "variance", "invariants"]
+ guards_order = None
+ preset_guards = preset_data.get("guards")
+ if isinstance(preset_guards, dict):
+ preset_order = preset_guards.get("order")
+ if (
+ isinstance(preset_order, list)
+ and preset_order
+ and all(isinstance(item, str) for item in preset_order)
+ ):
+ guards_order = list(preset_order)
+ if guards_order is None:
+ guards_order = list(default_guards_order)
+
+ def _load_and_validate_baseline_report(
+ report_path: Path,
+ *,
+ expected_profile: str,
+ expected_tier: str,
+ expected_adapter: str,
+ ) -> Path:
+ candidate = Path(report_path).expanduser()
+ if not candidate.exists():
+ _fail(f"Baseline report not found: {candidate}")
+ resolved_report: Path | None = None
+ if candidate.is_dir():
+ direct = candidate / "report.json"
+ if direct.is_file():
+ resolved_report = direct
+ else:
+ resolved_report = _latest_run_report(candidate)
+ elif candidate.is_file():
+ resolved_report = candidate
+ if resolved_report is None or not resolved_report.is_file():
+ _fail(f"Baseline report not found: {candidate}")
+ resolved_report = resolved_report.resolve()
+ try:
+ with resolved_report.open("r", encoding="utf-8") as fh:
+ payload = json.load(fh)
+ except Exception as exc: # noqa: BLE001
+ _fail(f"Baseline report is not valid JSON: {resolved_report} ({exc})")
+ if not isinstance(payload, dict):
+ _fail(f"Baseline report must be a JSON object: {resolved_report}")
+
+ edit_block = payload.get("edit")
+ edit_name = edit_block.get("name") if isinstance(edit_block, dict) else None
+ if edit_name != "noop":
+ _fail(
+ "Baseline report must be a no-op run (edit.name == 'noop'). "
+ f"Got edit.name={edit_name!r} in {resolved_report}"
+ )
+
+ meta = payload.get("meta")
+ if isinstance(meta, dict):
+ baseline_adapter = meta.get("adapter")
+ if (
+ isinstance(baseline_adapter, str)
+ and baseline_adapter != expected_adapter
+ ):
+ _fail(
+ "Baseline report adapter mismatch. "
+ f"Expected {expected_adapter!r}, got {baseline_adapter!r} in {resolved_report}"
+ )
+
+ context = payload.get("context")
+ if isinstance(context, dict):
+ baseline_profile = context.get("profile")
+ if (
+ isinstance(baseline_profile, str)
+ and baseline_profile.strip().lower() != expected_profile.strip().lower()
+ ):
+ _fail(
+ "Baseline report profile mismatch. "
+ f"Expected {expected_profile!r}, got {baseline_profile!r} in {resolved_report}"
+ )
+ auto_ctx = context.get("auto")
+ if isinstance(auto_ctx, dict):
+ baseline_tier = auto_ctx.get("tier")
+ if isinstance(baseline_tier, str) and baseline_tier != expected_tier:
+ _fail(
+ "Baseline report tier mismatch. "
+ f"Expected {expected_tier!r}, got {baseline_tier!r} in {resolved_report}"
+ )
+
+ eval_windows = payload.get("evaluation_windows")
+ if not isinstance(eval_windows, dict):
+ _fail(
+ "Baseline report missing evaluation window payloads. "
+ "Re-run baseline with INVARLOCK_STORE_EVAL_WINDOWS=1."
+ )
+
+ for phase_name in ("preview", "final"):
+ phase = eval_windows.get(phase_name)
+ if not isinstance(phase, dict):
+ _fail(
+ f"Baseline report missing evaluation_windows.{phase_name} payloads. "
+ "Re-run baseline with INVARLOCK_STORE_EVAL_WINDOWS=1."
+ )
+ window_ids = phase.get("window_ids")
+ input_ids = phase.get("input_ids")
+ if not isinstance(window_ids, list) or not window_ids:
+ _fail(
+ f"Baseline report missing evaluation_windows.{phase_name}.window_ids."
+ )
+ if not isinstance(input_ids, list) or not input_ids:
+ _fail(
+ f"Baseline report missing evaluation_windows.{phase_name}.input_ids."
+ )
+ if len(input_ids) != len(window_ids):
+ _fail(
+ "Baseline report has inconsistent evaluation window payloads "
+ f"for {phase_name}: input_ids={len(input_ids)} window_ids={len(window_ids)}."
+ )
+
+ return resolved_report
+
+ # Create temp baseline config (no-op edit)
+ # Normalize possible "hf:" prefixes for HF adapters
+ norm_src_id = _normalize_model_id(src_id, eff_adapter)
+ norm_edt_id = _normalize_model_id(edt_id, eff_adapter)
+
+ baseline_cfg = _merge(
+ preset_data,
+ {
+ "model": {
+ "id": norm_src_id,
+ "adapter": eff_adapter,
+ },
+ "edit": {"name": "noop", "plan": {}},
+ "eval": {},
+ "guards": {"order": guards_order},
+ "output": {"dir": str(Path(out) / "source")},
+ "context": {"profile": profile, "tier": tier},
+ },
+ )
+
+ baseline_label = "noop"
+ subject_label: str | None = None
+ if edit_label:
+ subject_label = edit_label
+ elif not edit_config:
+ subject_label = "custom" if norm_src_id != norm_edt_id else "noop"
+
+ tmp_dir = _resolve_evaluate_tmp_dir()
+
+ baseline_report_path: Path
+ if baseline_report:
+ _info(
+ "Using provided baseline report (skipping baseline evaluation)",
+ tag="EXEC",
+ emoji="♻️",
+ )
+ baseline_report_path = _load_and_validate_baseline_report(
+ Path(baseline_report),
+ expected_profile=profile,
+ expected_tier=tier,
+ expected_adapter=str(eff_adapter),
+ )
+ _debug(f"Baseline report: {baseline_report_path}")
+ else:
+ baseline_yaml = tmp_dir / "baseline_noop.yaml"
+ _dump_yaml(baseline_yaml, baseline_cfg)
+
+ _phase(1, 3, "BASELINE EVALUATION")
+ _info("Running baseline (no-op edit)", tag="EXEC", emoji="🏁")
+ _debug(f"Baseline config: {baseline_yaml}")
+ from .run import run_command as _run
+
+ with _suppress_child_output(verbosity == VERBOSITY_QUIET) as quiet_buffer:
+ try:
+ with timed_step(
+ console=console,
+ style=output_style,
+ timings=timings,
+ key="baseline",
+ tag="EXEC",
+ message="Baseline",
+ emoji="🏁",
+ ):
+ _run(
+ config=str(baseline_yaml),
+ profile=profile,
+ out=str(Path(out) / "source"),
+ tier=tier,
+ device=device,
+ edit_label=baseline_label,
+ style=output_style.name,
+ progress=progress,
+ timing=False,
+ no_color=no_color,
+ )
+ except Exception:
+ if quiet_buffer is not None:
+ console.print(quiet_buffer.getvalue(), markup=False)
+ raise
+
+ baseline_report_path_candidate = _latest_run_report(Path(out) / "source")
+ if not baseline_report_path_candidate:
+ _fail("Could not locate baseline report after run", exit_code=1)
+ baseline_report_path = baseline_report_path_candidate
+ _debug(f"Baseline report: {baseline_report_path}")
+
+ # Edited run: either no-op (Compare & Evaluate) or provided edit_config (demo edit)
+ _phase(2, 3, "SUBJECT EVALUATION")
+ if edit_config:
+ edited_yaml = Path(edit_config)
+ if not edited_yaml.exists():
+ print_event(
+ console,
+ "FAIL",
+ f"Edit config not found: {edited_yaml}",
+ style=output_style,
+ emoji="❌",
+ )
+ raise typer.Exit(1)
+ _info("Running edited (demo edit via --edit-config)", tag="EXEC", emoji="✂️")
+ # Overlay subject model id/adapter and output/context onto the provided edit config
+ try:
+ cfg_loaded: dict[str, Any] = _load_yaml(edited_yaml)
+ except Exception as exc: # noqa: BLE001
+ print_event(
+ console,
+ "FAIL",
+ f"Failed to load edit config: {exc}",
+ style=output_style,
+ emoji="❌",
+ )
+ raise typer.Exit(1) from exc
+
+ # Ensure model.id/adapter point to the requested subject
+ model_block = dict(cfg_loaded.get("model") or {})
+ # Replace placeholder IDs like "<MODEL_ID>" or "<set-your-model-id>"
+ if not isinstance(model_block.get("id"), str) or model_block.get(
+ "id", ""
+ ).startswith("<"):
+ model_block["id"] = norm_edt_id
+ else:
+ # Always normalize when adapter is HF family
+ model_block["id"] = _normalize_model_id(str(model_block["id"]), eff_adapter)
+ # Respect explicit device from edit config; only set adapter if missing
+ if not isinstance(model_block.get("adapter"), str) or not model_block.get(
+ "adapter"
+ ):
+ model_block["adapter"] = eff_adapter
+ cfg_loaded["model"] = model_block
+
+ # Apply the same preset to the edited run to avoid duplicating dataset/task
+ # settings in edit configs; then overlay the edit, output, and context.
+ merged_edited_cfg = _merge(
+ _merge(preset_data, cfg_loaded),
+ {
+ "output": {"dir": str(Path(out) / "edited")},
+ "context": {"profile": profile, "tier": tier},
+ },
+ )
+ # Ensure the edited run always has a guard chain. Presets/edit configs
+ # often omit it, but `invarlock run` expects guards.order.
+ guards_block = merged_edited_cfg.get("guards")
+ guards_order_cfg = (
+ guards_block.get("order") if isinstance(guards_block, dict) else None
+ )
+ if not (
+ isinstance(guards_order_cfg, list)
+ and guards_order_cfg
+ and all(isinstance(item, str) for item in guards_order_cfg)
+ ):
+ merged_edited_cfg = _merge(
+ merged_edited_cfg, {"guards": {"order": guards_order}}
+ )
+
+ # Persist a temporary merged config for traceability
+ edited_merged_yaml = tmp_dir / "edited_merged.yaml"
+ _dump_yaml(edited_merged_yaml, merged_edited_cfg)
+ _debug(f"Edited config (merged): {edited_merged_yaml}")
+
+ from .run import run_command as _run
+
+ with _suppress_child_output(verbosity == VERBOSITY_QUIET) as quiet_buffer:
+ try:
+ with timed_step(
+ console=console,
+ style=output_style,
+ timings=timings,
+ key="subject",
+ tag="EXEC",
+ message="Subject",
+ emoji="✂️",
+ ):
+ _run(
+ config=str(edited_merged_yaml),
+ profile=profile,
+ out=str(Path(out) / "edited"),
+ tier=tier,
+ baseline=str(baseline_report_path),
+ device=device,
+ edit_label=subject_label if edit_label else None,
+ style=output_style.name,
+ progress=progress,
+ timing=False,
+ no_color=no_color,
+ )
+ except Exception:
+ if quiet_buffer is not None:
+ console.print(quiet_buffer.getvalue(), markup=False)
+ raise
+ else:
+ edited_cfg = _merge(
+ preset_data,
+ {
+ "model": {"id": norm_edt_id, "adapter": eff_adapter},
+ "edit": {"name": "noop", "plan": {}},
+ "eval": {},
+ "guards": {"order": guards_order},
+ "output": {"dir": str(Path(out) / "edited")},
+ "context": {"profile": profile, "tier": tier},
+ },
+ )
+ edited_yaml = tmp_dir / "edited_noop.yaml"
+ _dump_yaml(edited_yaml, edited_cfg)
+ _info("Running edited (no-op, Compare & Evaluate)", tag="EXEC", emoji="🧪")
+ _debug(f"Edited config: {edited_yaml}")
+ from .run import run_command as _run
+
+ with _suppress_child_output(verbosity == VERBOSITY_QUIET) as quiet_buffer:
+ try:
+ with timed_step(
+ console=console,
+ style=output_style,
+ timings=timings,
+ key="subject",
+ tag="EXEC",
+ message="Subject",
+ emoji="🧪",
+ ):
+ _run(
+ config=str(edited_yaml),
+ profile=profile,
+ out=str(Path(out) / "edited"),
+ tier=tier,
+ baseline=str(baseline_report_path),
+ device=device,
+ edit_label=subject_label,
+ style=output_style.name,
+ progress=progress,
+ timing=False,
+ no_color=no_color,
+ )
+ except Exception:
+ if quiet_buffer is not None:
+ console.print(quiet_buffer.getvalue(), markup=False)
+ raise
+
+ edited_report = _latest_run_report(Path(out) / "edited")
+ if not edited_report:
+ print_event(
+ console,
+ "FAIL",
+ "Could not locate edited report after run",
+ style=output_style,
+ emoji="❌",
+ )
+ raise typer.Exit(1)
+ _debug(f"Edited report: {edited_report}")
+
+ _phase(3, 3, "EVALUATION REPORT GENERATION")
+
+ def _emit_evaluation_report() -> None:
+ _info("Emitting evaluation report", tag="EXEC", emoji="📜")
+ with _suppress_child_output(verbosity == VERBOSITY_QUIET) as quiet_buffer:
+ try:
+ with timed_step(
+ console=console,
+ style=output_style,
+ timings=timings,
+ key="evaluation_report",
+ tag="EXEC",
+ message="Evaluation Report",
+ emoji="📜",
+ ):
+ # Use a wall-clock perf counter here (not the output module's
+ # test-patched counter) so timing tests remain deterministic.
+ from time import perf_counter as _wall_perf_counter
+
+ report_start = _wall_perf_counter()
+ report_kwargs = {
+ "run": str(edited_report),
+ "format": "report",
+ "baseline": str(baseline_report_path),
+ "output": report_out,
+ "style": output_style.name,
+ "no_color": no_color,
+ "summary_baseline_seconds": float(timings.get("baseline", 0.0)),
+ "summary_subject_seconds": float(timings.get("subject", 0.0)),
+ "summary_report_start": float(report_start),
+ }
+ try:
+ sig = inspect.signature(_report)
+ except (TypeError, ValueError):
+ _report(**report_kwargs)
+ else:
+ if any(
+ param.kind == inspect.Parameter.VAR_KEYWORD
+ for param in sig.parameters.values()
+ ):
+ _report(**report_kwargs)
+ else:
+ _report(
+ **{
+ key: value
+ for key, value in report_kwargs.items()
+ if key in sig.parameters
+ }
+ )
+ except Exception:
+ if quiet_buffer is not None:
+ console.print(quiet_buffer.getvalue(), markup=False)
+ raise
+
+ # CI/Release hard‑abort: fail fast when primary metric is not computable.
+ try:
+ prof = str(profile or "").strip().lower()
+ except Exception:
+ prof = ""
+ if prof in {"ci", "ci_cpu", "release"}:
+ try:
+ with Path(edited_report).open("r", encoding="utf-8") as fh:
+ edited_payload = json.load(fh)
+ except Exception as exc: # noqa: BLE001
+ print_event(
+ console,
+ "FAIL",
+ f"Failed to read edited report: {exc}",
+ style=output_style,
+ emoji="❌",
+ )
+ raise typer.Exit(1) from exc
+
+ def _finite(x: Any) -> bool:
+ try:
+ return isinstance(x, (int | float)) and math.isfinite(float(x))
+ except Exception:
+ return False
+
+ meta = (
+ edited_payload.get("meta", {}) if isinstance(edited_payload, dict) else {}
+ )
+ metrics = (
+ edited_payload.get("metrics", {})
+ if isinstance(edited_payload, dict)
+ else {}
+ )
+ pm = metrics.get("primary_metric", {}) if isinstance(metrics, dict) else {}
+ pm_prev = pm.get("preview") if isinstance(pm, dict) else None
+ pm_final = pm.get("final") if isinstance(pm, dict) else None
+ pm_ratio = pm.get("ratio_vs_baseline")
+ device = meta.get("device") or "unknown"
+ adapter_name = meta.get("adapter") or "unknown"
+ edit_name = (
+ (edited_payload.get("edit", {}) or {}).get("name")
+ if isinstance(edited_payload, dict)
+ else None
+ ) or "unknown"
+
+ # Enforce only when a primary_metric block is present; allow degraded-but-flagged metrics to emit evaluation reports, but fail the task.
+ has_metric_block = isinstance(pm, dict) and bool(pm)
+ if has_metric_block:
+ degraded = bool(pm.get("invalid") or pm.get("degraded"))
+ if degraded or not _finite(pm_final):
+ fallback = pm_prev if _finite(pm_prev) else pm_final
+ if not _finite(fallback) or fallback <= 0:
+ fallback = 1.0
+ degraded_reason = pm.get("degraded_reason") or (
+ "non_finite_pm"
+ if (not _finite(pm_prev) or not _finite(pm_final))
+ else "primary_metric_degraded"
+ )
+ print_event(
+ console,
+ "WARN",
+ "Primary metric degraded or non-finite; emitting evaluation report and marking task degraded. Primary metric computation failed.",
+ style=output_style,
+ emoji="⚠️",
+ )
+ pm["degraded"] = True
+ pm["invalid"] = pm.get("invalid") or True
+ pm["preview"] = pm_prev if _finite(pm_prev) else fallback
+ pm["final"] = pm_final if _finite(pm_final) else fallback
+ pm["ratio_vs_baseline"] = pm_ratio if _finite(pm_ratio) else 1.0
+ pm["degraded_reason"] = degraded_reason
+ metrics["primary_metric"] = pm
+ edited_payload.setdefault("metrics", {}).update(metrics)
+
+ # Emit the evaluation report for inspection, then exit with a CI-visible error.
+ _emit_evaluation_report()
+ err = MetricsError(
+ code="E111",
+ message=f"Primary metric degraded or non-finite ({degraded_reason}).",
+ details={
+ "reason": degraded_reason,
+ "adapter": adapter_name,
+ "device": device,
+ "edit": edit_name,
+ },
+ )
+ raise typer.Exit(_resolve_exit_code(err, profile=profile))
+
+ _emit_evaluation_report()
+ if timing:
+ if total_start is not None:
+ timings["total"] = max(0.0, float(perf_counter() - total_start))
+ else:
+ timings["total"] = (
+ float(timings.get("baseline", 0.0))
+ + float(timings.get("subject", 0.0))
+ + float(timings.get("evaluation_report", 0.0))
+ )
+ print_timing_summary(
+ console,
+ timings,
+ style=output_style,
+ order=[
+ ("Baseline", "baseline"),
+ ("Subject", "subject"),
+ ("Evaluation Report", "evaluation_report"),
+ ("Total", "total"),
+ ],
+ )
+ if verbosity == VERBOSITY_QUIET:
+ _print_quiet_summary(
+ report_out=Path(report_out),
+ source=src_id,
+ edited=edt_id,
+ profile=profile,
+ )
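
For orientation, a minimal sketch of driving the new command programmatically; the model ID and subject path below are placeholders and are not taken from this diff. Per the option-coercion logic above, keyword arguments that are left unset fall back to their declared Typer defaults, so a plain function call mirrors `invarlock evaluate --baseline ... --subject ...`:

    # Hypothetical usage sketch; "org/baseline-model" and "./checkpoints/subject"
    # are placeholders. Unset options are coerced to their defaults by
    # _coerce_option inside evaluate_command.
    from invarlock.cli.commands.evaluate import evaluate_command

    evaluate_command(
        source="org/baseline-model",     # --baseline: baseline model dir or Hub ID
        edited="./checkpoints/subject",  # --subject: subject model dir or Hub ID (BYOE)
        adapter="auto",                  # resolved via resolve_auto_adapter
        profile="ci",                    # ci|release|ci_cpu|dev
        report_out="reports/eval",       # --report-out: evaluation report directory
        quiet=True,                      # condensed summary via _print_quiet_summary
    )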