invarlock 0.3.5-py3-none-any.whl → 0.3.7-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry, and is provided for informational purposes only.
Files changed (74)
  1. invarlock/__init__.py +2 -2
  2. invarlock/_data/runtime/tiers.yaml +57 -30
  3. invarlock/adapters/__init__.py +11 -15
  4. invarlock/adapters/auto.py +35 -40
  5. invarlock/adapters/capabilities.py +2 -2
  6. invarlock/adapters/hf_causal.py +418 -0
  7. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  8. invarlock/adapters/hf_mixin.py +25 -4
  9. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  10. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  11. invarlock/calibration/spectral_null.py +15 -10
  12. invarlock/calibration/variance_ve.py +0 -2
  13. invarlock/cli/adapter_auto.py +31 -21
  14. invarlock/cli/app.py +73 -2
  15. invarlock/cli/commands/calibrate.py +6 -2
  16. invarlock/cli/commands/certify.py +651 -91
  17. invarlock/cli/commands/doctor.py +11 -11
  18. invarlock/cli/commands/explain_gates.py +57 -8
  19. invarlock/cli/commands/plugins.py +13 -9
  20. invarlock/cli/commands/report.py +233 -69
  21. invarlock/cli/commands/run.py +1066 -244
  22. invarlock/cli/commands/verify.py +154 -15
  23. invarlock/cli/config.py +22 -6
  24. invarlock/cli/doctor_helpers.py +4 -5
  25. invarlock/cli/output.py +193 -0
  26. invarlock/cli/provenance.py +1 -1
  27. invarlock/core/api.py +45 -5
  28. invarlock/core/auto_tuning.py +65 -20
  29. invarlock/core/bootstrap.py +1 -1
  30. invarlock/core/contracts.py +7 -1
  31. invarlock/core/registry.py +11 -13
  32. invarlock/core/runner.py +425 -75
  33. invarlock/edits/quant_rtn.py +65 -37
  34. invarlock/eval/bench.py +3 -16
  35. invarlock/eval/data.py +82 -51
  36. invarlock/eval/metrics.py +63 -2
  37. invarlock/eval/primary_metric.py +23 -0
  38. invarlock/eval/tail_stats.py +230 -0
  39. invarlock/eval/tasks/__init__.py +12 -0
  40. invarlock/eval/tasks/classification.py +48 -0
  41. invarlock/eval/tasks/qa.py +36 -0
  42. invarlock/eval/tasks/text_generation.py +102 -0
  43. invarlock/guards/_estimators.py +154 -0
  44. invarlock/guards/invariants.py +19 -10
  45. invarlock/guards/policies.py +16 -6
  46. invarlock/guards/rmt.py +627 -546
  47. invarlock/guards/spectral.py +348 -110
  48. invarlock/guards/tier_config.py +32 -30
  49. invarlock/guards/variance.py +7 -31
  50. invarlock/guards_ref/rmt_ref.py +23 -23
  51. invarlock/model_profile.py +90 -42
  52. invarlock/observability/health.py +6 -6
  53. invarlock/observability/metrics.py +108 -0
  54. invarlock/reporting/certificate.py +384 -55
  55. invarlock/reporting/certificate_schema.py +3 -2
  56. invarlock/reporting/dataset_hashing.py +15 -2
  57. invarlock/reporting/guards_analysis.py +350 -277
  58. invarlock/reporting/html.py +55 -5
  59. invarlock/reporting/normalizer.py +13 -0
  60. invarlock/reporting/policy_utils.py +38 -36
  61. invarlock/reporting/primary_metric_utils.py +71 -17
  62. invarlock/reporting/render.py +852 -431
  63. invarlock/reporting/report.py +40 -4
  64. invarlock/reporting/report_types.py +11 -3
  65. invarlock/reporting/telemetry.py +86 -0
  66. invarlock/reporting/validate.py +1 -18
  67. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/METADATA +27 -13
  68. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/RECORD +72 -65
  69. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
  70. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
  71. invarlock/adapters/hf_gpt2.py +0 -404
  72. invarlock/adapters/hf_llama.py +0 -487
  73. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
  74. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
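The excerpt below is the diff of invarlock/cli/commands/certify.py (item 16), the largest CLI change in this release: `certify` gains baseline-report reuse (`--baseline-report`), an `--edit-label` for bring-your-own-edit subjects, and quiet/verbose, banner, timing, and color controls. Since the command supports programmatic calls as well as Typer invocation, a minimal sketch of a 0.3.7-style call follows; the model ID, paths, and flag values are illustrative assumptions — only the keyword names come from the signature shown in the diff:

```python
# Sketch only: the model ID and paths below are hypothetical; keyword names
# mirror the 0.3.7 certify_command signature shown in the diff.
from invarlock.cli.commands.certify import certify_command

certify_command(
    source="hf:gpt2",                # baseline checkpoint (illustrative)
    edited="./models/gpt2-rtn",      # pre-edited subject model (illustrative)
    adapter="auto",                  # auto-resolves, e.g. to hf_causal or hf_mlm
    profile="ci",
    tier="balanced",
    edit_label="custom",             # new in 0.3.7: label a pre-edited subject
    baseline_report="runs/source",   # new in 0.3.7: reuse a stored no-op baseline
    quiet=True,                      # new in 0.3.7: print only the final summary
)
```

Per the option help below, a reused baseline report must contain stored evaluation windows (run the baseline with `INVARLOCK_STORE_EVAL_WINDOWS=1`).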
@@ -14,17 +14,23 @@ Steps:
 
 from __future__ import annotations
 
+import inspect
+import io
 import json
 import math
+from collections.abc import Iterator
+from contextlib import contextmanager
 from pathlib import Path
-from typing import Any
+from typing import Any, NoReturn
 
 import typer
 from rich.console import Console
 
+from invarlock import __version__ as INVARLOCK_VERSION
+
+from ...core.exceptions import MetricsError
 from ..adapter_auto import resolve_auto_adapter
 from ..config import _deep_merge as _merge  # reuse helper
-from ..errors import InvarlockError
 
 # Use the report group's programmatic entry for report generation
 from .report import report_command as _report
@@ -32,9 +38,142 @@ from .run import _resolve_exit_code as _resolve_exit_code
 
 _LAZY_RUN_IMPORT = True
 
+PHASE_BAR_WIDTH = 67
+VERBOSITY_QUIET = 0
+VERBOSITY_DEFAULT = 1
+VERBOSITY_VERBOSE = 2
+
 console = Console()
 
 
+def _render_banner_lines(title: str, context: str) -> list[str]:
+    width = max(len(title), len(context))
+    border = "─" * (width + 2)
+    return [
+        f"┌{border}┐",
+        f"│ {title.ljust(width)} │",
+        f"│ {context.ljust(width)} │",
+        f"└{border}┘",
+    ]
+
+
+def _print_header_banner(
+    console: Console, *, version: str, profile: str, tier: str, adapter: str
+) -> None:
+    title = f"INVARLOCK v{version} · Certification Pipeline"
+    context = f"Profile: {profile} · Tier: {tier} · Adapter: {adapter}"
+    for line in _render_banner_lines(title, context):
+        console.print(line)
+
+
+def _phase_title(index: int, total: int, title: str) -> str:
+    return f"PHASE {index}/{total} · {title}"
+
+
+def _print_phase_header(console: Console, title: str) -> None:
+    bar_width = max(PHASE_BAR_WIDTH, len(title))
+    bar = "═" * bar_width
+    console.print(bar)
+    console.print(title)
+    console.print(bar)
+
+
+def _format_ratio(value: Any) -> str:
+    try:
+        val = float(value)
+    except (TypeError, ValueError):
+        return "N/A"
+    if not math.isfinite(val):
+        return "N/A"
+    return f"{val:.3f}"
+
+
+def _resolve_verbosity(quiet: bool, verbose: bool) -> int:
+    if quiet and verbose:
+        console.print("--quiet and --verbose are mutually exclusive")
+        raise typer.Exit(2)
+    if quiet:
+        return VERBOSITY_QUIET
+    if verbose:
+        return VERBOSITY_VERBOSE
+    return VERBOSITY_DEFAULT
+
+
+@contextmanager
+def _override_console(module: Any, new_console: Console) -> Iterator[None]:
+    original_console = getattr(module, "console", None)
+    module.console = new_console
+    try:
+        yield
+    finally:
+        module.console = original_console
+
+
+@contextmanager
+def _suppress_child_output(enabled: bool) -> Iterator[io.StringIO | None]:
+    if not enabled:
+        yield None
+        return
+    from . import report as report_mod
+    from . import run as run_mod
+
+    buffer = io.StringIO()
+    quiet_console = Console(file=buffer, force_terminal=False, color_system=None)
+    with (
+        _override_console(run_mod, quiet_console),
+        _override_console(report_mod, quiet_console),
+    ):
+        yield buffer
+
+
+def _print_quiet_summary(
+    *,
+    cert_out: Path,
+    source: str,
+    edited: str,
+    profile: str,
+) -> None:
+    cert_path = cert_out / "evaluation.cert.json"
+    console.print(f"INVARLOCK v{INVARLOCK_VERSION} · CERTIFY")
+    console.print(f"Baseline: {source} -> Subject: {edited} · Profile: {profile}")
+    if not cert_path.exists():
+        console.print(f"Output: {cert_out}")
+        return
+    try:
+        with cert_path.open("r", encoding="utf-8") as fh:
+            certificate = json.load(fh)
+    except Exception:
+        console.print(f"Output: {cert_path}")
+        return
+    if not isinstance(certificate, dict):
+        console.print(f"Output: {cert_path}")
+        return
+    try:
+        from invarlock.reporting.render import (
+            compute_console_validation_block as _console_block,
+        )
+
+        block = _console_block(certificate)
+        rows = block.get("rows", [])
+        total = len(rows) if isinstance(rows, list) else 0
+        passed = (
+            sum(1 for row in rows if row.get("ok")) if isinstance(rows, list) else 0
+        )
+        status = "PASS" if block.get("overall_pass") else "FAIL"
+    except Exception:
+        total = 0
+        passed = 0
+        status = "UNKNOWN"
+    pm_ratio = _format_ratio(
+        (certificate.get("primary_metric") or {}).get("ratio_vs_baseline")
+    )
+    gate_summary = f"{passed}/{total} passed" if total else "N/A"
+    console.print(f"Status: {status} · Gates: {gate_summary}")
+    if pm_ratio != "N/A":
+        console.print(f"Primary metric ratio: {pm_ratio}")
+    console.print(f"Output: {cert_path}")
+
+
 def _latest_run_report(run_root: Path) -> Path | None:
     if not run_root.exists():
         return None
@@ -90,6 +229,14 @@ def certify_command(
     edited: str = typer.Option(
         ..., "--edited", "--subject", help="Subject model dir or Hub ID"
     ),
+    baseline_report: str | None = typer.Option(
+        None,
+        "--baseline-report",
+        help=(
+            "Reuse an existing baseline run report.json (skips baseline evaluation). "
+            "Must include stored evaluation windows (e.g., set INVARLOCK_STORE_EVAL_WINDOWS=1)."
+        ),
+    ),
     adapter: str = typer.Option(
         "auto", "--adapter", help="Adapter name or 'auto' to resolve"
     ),
@@ -98,7 +245,9 @@ def certify_command(
         "--device",
         help="Device override for runs (auto|cuda|mps|cpu)",
     ),
-    profile: str = typer.Option("ci", "--profile", help="Profile (ci|release)"),
+    profile: str = typer.Option(
+        "ci", "--profile", help="Profile (ci|release|ci_cpu|dev)"
+    ),
     tier: str = typer.Option("balanced", "--tier", help="Tier label for context"),
     preset: str | None = typer.Option(
         None,
@@ -115,6 +264,31 @@ def certify_command(
     edit_config: str | None = typer.Option(
         None, "--edit-config", help="Edit preset to apply a demo edit (quant_rtn)"
     ),
+    edit_label: str | None = typer.Option(
+        None,
+        "--edit-label",
+        help=(
+            "Edit algorithm label for BYOE models. Use 'noop' for baseline, "
+            "'quant_rtn' etc. for built-in edits, 'custom' for pre-edited models."
+        ),
+    ),
+    quiet: bool = typer.Option(
+        False, "--quiet", "-q", help="Minimal output (suppress run/report detail)"
+    ),
+    verbose: bool = typer.Option(
+        False, "--verbose", "-v", help="Verbose output (include debug details)"
+    ),
+    banner: bool = typer.Option(
+        True, "--banner/--no-banner", help="Show header banner"
+    ),
+    style: str = typer.Option("audit", "--style", help="Output style (audit|friendly)"),
+    timing: bool = typer.Option(False, "--timing", help="Show timing summary"),
+    progress: bool = typer.Option(
+        True, "--progress/--no-progress", help="Show progress done messages"
+    ),
+    no_color: bool = typer.Option(
+        False, "--no-color", help="Disable ANSI colors (respects NO_COLOR=1)"
+    ),
 ):
     """Certify two checkpoints (baseline vs subject) with pinned windows."""
     # Support programmatic calls and Typer-invoked calls uniformly
@@ -130,6 +304,7 @@ def certify_command(
 
     source = _coerce_option(source)
     edited = _coerce_option(edited)
+    baseline_report = _coerce_option(baseline_report)
     adapter = _coerce_option(adapter, "auto")
     device = _coerce_option(device)
     profile = _coerce_option(profile, "ci")
@@ -138,23 +313,89 @@ def certify_command(
     out = _coerce_option(out, "runs")
     cert_out = _coerce_option(cert_out, "reports/cert")
     edit_config = _coerce_option(edit_config)
+    edit_label = _coerce_option(edit_label)
+    quiet = _coerce_option(quiet, False)
+    verbose = _coerce_option(verbose, False)
+    banner = _coerce_option(banner, True)
+    style = _coerce_option(style, "audit")
+    timing = bool(_coerce_option(timing, False))
+    progress = bool(_coerce_option(progress, True))
+    no_color = bool(_coerce_option(no_color, False))
+
+    verbosity = _resolve_verbosity(bool(quiet), bool(verbose))
+
+    if verbosity == VERBOSITY_QUIET:
+        progress = False
+        timing = False
+
+    from invarlock.cli.output import (
+        make_console,
+        perf_counter,
+        print_event,
+        print_timing_summary,
+        resolve_output_style,
+        timed_step,
+    )
+
+    output_style = resolve_output_style(
+        style=str(style),
+        profile=str(profile),
+        progress=bool(progress),
+        timing=bool(timing),
+        no_color=bool(no_color),
+    )
+    console = make_console(no_color=not output_style.color)
+    timings: dict[str, float] = {}
+    total_start: float | None = perf_counter() if output_style.timing else None
+
+    def _info(message: str, *, tag: str = "INFO", emoji: str | None = None) -> None:
+        if verbosity >= VERBOSITY_DEFAULT:
+            print_event(console, tag, message, style=output_style, emoji=emoji)
+
+    def _debug(msg: str) -> None:
+        if verbosity >= VERBOSITY_VERBOSE:
+            console.print(msg, markup=False)
+
+    def _fail(message: str, *, exit_code: int = 2) -> NoReturn:
+        print_event(console, "FAIL", message, style=output_style, emoji="❌")
+        raise typer.Exit(exit_code)
+
+    def _phase(index: int, total: int, title: str) -> None:
+        if verbosity >= VERBOSITY_DEFAULT:
+            console.print("")
+            _print_phase_header(console, _phase_title(index, total, title))
 
     src_id = str(source)
     edt_id = str(edited)
 
     # Resolve adapter when requested
     eff_adapter = adapter
-    if str(adapter).strip().lower() in {"auto", "hf_auto", "auto_hf"}:
+    adapter_auto = False
+    if str(adapter).strip().lower() in {"auto", "auto_hf"}:
         eff_adapter = resolve_auto_adapter(src_id)
-        console.print(f"🔎 Adapter:auto → {eff_adapter}")
+        adapter_auto = True
+
+    show_banner = bool(banner) and verbosity >= VERBOSITY_DEFAULT
+    if show_banner:
+        _print_header_banner(
+            console,
+            version=INVARLOCK_VERSION,
+            profile=profile,
+            tier=tier,
+            adapter=str(eff_adapter),
+        )
+        console.print("")
+
+    if adapter_auto:
+        _debug(f"Adapter:auto -> {eff_adapter}")
 
     # Choose preset. If none provided and repo preset is missing (pip install
     # scenario), fall back to a minimal built-in universal preset so the
     # flag-only quick start works without cloning the repo.
     default_universal = (
-        Path("configs/tasks/masked_lm/ci_cpu.yaml")
-        if eff_adapter == "hf_bert"
-        else Path("configs/tasks/causal_lm/ci_cpu.yaml")
+        Path("configs/presets/masked_lm/wikitext2_128.yaml")
+        if eff_adapter == "hf_mlm"
+        else Path("configs/presets/causal_lm/wikitext2_512.yaml")
     )
     preset_path = Path(preset) if preset is not None else default_universal
 
@@ -174,7 +415,13 @@ def certify_command(
         }
     else:
         if not preset_path.exists():
-            console.print(f"[red]❌ Preset not found: {preset_path}")
+            print_event(
+                console,
+                "FAIL",
+                f"Preset not found: {preset_path}",
+                style=output_style,
+                emoji="❌",
+            )
             raise typer.Exit(1)
         preset_data = _load_yaml(preset_path)
         # Do not hard-code device from presets in auto-generated certify configs;
@@ -185,6 +432,122 @@ def certify_command(
         model_block.pop("device", None)
         preset_data["model"] = model_block
 
+    default_guards_order = ["invariants", "spectral", "rmt", "variance", "invariants"]
+    guards_order = None
+    preset_guards = preset_data.get("guards")
+    if isinstance(preset_guards, dict):
+        preset_order = preset_guards.get("order")
+        if (
+            isinstance(preset_order, list)
+            and preset_order
+            and all(isinstance(item, str) for item in preset_order)
+        ):
+            guards_order = list(preset_order)
+    if guards_order is None:
+        guards_order = list(default_guards_order)
+
+    def _load_and_validate_baseline_report(
+        report_path: Path,
+        *,
+        expected_profile: str,
+        expected_tier: str,
+        expected_adapter: str,
+    ) -> Path:
+        candidate = Path(report_path).expanduser()
+        if not candidate.exists():
+            _fail(f"Baseline report not found: {candidate}")
+        resolved_report: Path | None = None
+        if candidate.is_dir():
+            direct = candidate / "report.json"
+            if direct.is_file():
+                resolved_report = direct
+            else:
+                resolved_report = _latest_run_report(candidate)
+        elif candidate.is_file():
+            resolved_report = candidate
+        if resolved_report is None or not resolved_report.is_file():
+            _fail(f"Baseline report not found: {candidate}")
+        resolved_report = resolved_report.resolve()
+        try:
+            with resolved_report.open("r", encoding="utf-8") as fh:
+                payload = json.load(fh)
+        except Exception as exc:  # noqa: BLE001
+            _fail(f"Baseline report is not valid JSON: {resolved_report} ({exc})")
+        if not isinstance(payload, dict):
+            _fail(f"Baseline report must be a JSON object: {resolved_report}")
+
+        edit_block = payload.get("edit")
+        edit_name = edit_block.get("name") if isinstance(edit_block, dict) else None
+        if edit_name != "noop":
+            _fail(
+                "Baseline report must be a no-op run (edit.name == 'noop'). "
+                f"Got edit.name={edit_name!r} in {resolved_report}"
+            )
+
+        meta = payload.get("meta")
+        if isinstance(meta, dict):
+            baseline_adapter = meta.get("adapter")
+            if (
+                isinstance(baseline_adapter, str)
+                and baseline_adapter != expected_adapter
+            ):
+                _fail(
+                    "Baseline report adapter mismatch. "
+                    f"Expected {expected_adapter!r}, got {baseline_adapter!r} in {resolved_report}"
+                )
+
+        context = payload.get("context")
+        if isinstance(context, dict):
+            baseline_profile = context.get("profile")
+            if (
+                isinstance(baseline_profile, str)
+                and baseline_profile.strip().lower() != expected_profile.strip().lower()
+            ):
+                _fail(
+                    "Baseline report profile mismatch. "
+                    f"Expected {expected_profile!r}, got {baseline_profile!r} in {resolved_report}"
+                )
+            auto_ctx = context.get("auto")
+            if isinstance(auto_ctx, dict):
+                baseline_tier = auto_ctx.get("tier")
+                if isinstance(baseline_tier, str) and baseline_tier != expected_tier:
+                    _fail(
+                        "Baseline report tier mismatch. "
+                        f"Expected {expected_tier!r}, got {baseline_tier!r} in {resolved_report}"
+                    )
+
+        eval_windows = payload.get("evaluation_windows")
+        if not isinstance(eval_windows, dict):
+            _fail(
+                "Baseline report missing evaluation window payloads. "
+                "Re-run baseline with INVARLOCK_STORE_EVAL_WINDOWS=1."
+            )
+
+        for phase_name in ("preview", "final"):
+            phase = eval_windows.get(phase_name)
+            if not isinstance(phase, dict):
+                _fail(
+                    f"Baseline report missing evaluation_windows.{phase_name} payloads. "
+                    "Re-run baseline with INVARLOCK_STORE_EVAL_WINDOWS=1."
+                )
+            window_ids = phase.get("window_ids")
+            input_ids = phase.get("input_ids")
+            if not isinstance(window_ids, list) or not window_ids:
+                _fail(
+                    f"Baseline report missing evaluation_windows.{phase_name}.window_ids."
+                )
+            if not isinstance(input_ids, list) or not input_ids:
+                _fail(
+                    f"Baseline report missing evaluation_windows.{phase_name}.input_ids."
+                )
+            if len(input_ids) != len(window_ids):
+                _fail(
+                    "Baseline report has inconsistent evaluation window payloads "
+                    f"for {phase_name}: input_ids={len(input_ids)} window_ids={len(window_ids)}."
+                )
+
+        return resolved_report
+
     # Create temp baseline config (no-op edit)
     # Normalize possible "hf:" prefixes for HF adapters
     norm_src_id = _normalize_model_id(src_id, eff_adapter)
@@ -199,47 +562,104 @@ def certify_command(
             },
             "edit": {"name": "noop", "plan": {}},
             "eval": {},
-            "guards": {
-                "order": ["invariants", "spectral", "rmt", "variance", "invariants"]
-            },
+            "guards": {"order": guards_order},
             "output": {"dir": str(Path(out) / "source")},
             "context": {"profile": profile, "tier": tier},
         },
     )
 
+    baseline_label = "noop"
+    subject_label: str | None = None
+    if edit_label:
+        subject_label = edit_label
+    elif not edit_config:
+        subject_label = "custom" if norm_src_id != norm_edt_id else "noop"
+
     tmp_dir = Path(".certify_tmp")
     tmp_dir.mkdir(parents=True, exist_ok=True)
-    baseline_yaml = tmp_dir / "baseline_noop.yaml"
-    _dump_yaml(baseline_yaml, baseline_cfg)
-
-    console.print("🏁 Running baseline (no-op edit)")
-    from .run import run_command as _run
-
-    _run(
-        config=str(baseline_yaml),
-        profile=profile,
-        out=str(Path(out) / "source"),
-        tier=tier,
-        device=device,
-    )
 
-    baseline_report = _latest_run_report(Path(out) / "source")
-    if not baseline_report:
-        console.print("[red]❌ Could not locate baseline report after run")
-        raise typer.Exit(1)
+    baseline_report_path: Path
+    if baseline_report:
+        _info(
+            "Using provided baseline report (skipping baseline evaluation)",
+            tag="EXEC",
+            emoji="♻️",
+        )
+        baseline_report_path = _load_and_validate_baseline_report(
+            Path(baseline_report),
+            expected_profile=profile,
+            expected_tier=tier,
+            expected_adapter=str(eff_adapter),
+        )
+        _debug(f"Baseline report: {baseline_report_path}")
+    else:
+        baseline_yaml = tmp_dir / "baseline_noop.yaml"
+        _dump_yaml(baseline_yaml, baseline_cfg)
+
+        _phase(1, 3, "BASELINE EVALUATION")
+        _info("Running baseline (no-op edit)", tag="EXEC", emoji="🏁")
+        _debug(f"Baseline config: {baseline_yaml}")
+        from .run import run_command as _run
+
+        with _suppress_child_output(verbosity == VERBOSITY_QUIET) as quiet_buffer:
+            try:
+                with timed_step(
+                    console=console,
+                    style=output_style,
+                    timings=timings,
+                    key="baseline",
+                    tag="EXEC",
+                    message="Baseline",
+                    emoji="🏁",
+                ):
+                    _run(
+                        config=str(baseline_yaml),
+                        profile=profile,
+                        out=str(Path(out) / "source"),
+                        tier=tier,
+                        device=device,
+                        edit_label=baseline_label,
+                        style=output_style.name,
+                        progress=progress,
+                        timing=False,
+                        no_color=no_color,
+                    )
+            except Exception:
+                if quiet_buffer is not None:
+                    console.print(quiet_buffer.getvalue(), markup=False)
+                raise
+
+        baseline_report_path_candidate = _latest_run_report(Path(out) / "source")
+        if not baseline_report_path_candidate:
+            _fail("Could not locate baseline report after run", exit_code=1)
+        baseline_report_path = baseline_report_path_candidate
+        _debug(f"Baseline report: {baseline_report_path}")
 
     # Edited run: either no-op (Compare & Certify) or provided edit_config (demo edit)
+    _phase(2, 3, "SUBJECT EVALUATION")
     if edit_config:
         edited_yaml = Path(edit_config)
         if not edited_yaml.exists():
-            console.print(f"[red]❌ Edit config not found: {edited_yaml}")
+            print_event(
+                console,
+                "FAIL",
+                f"Edit config not found: {edited_yaml}",
+                style=output_style,
+                emoji="❌",
+            )
             raise typer.Exit(1)
-        console.print("✂️ Running edited (demo edit via --edit-config)")
+        _info("Running edited (demo edit via --edit-config)", tag="EXEC", emoji="✂️")
         # Overlay subject model id/adapter and output/context onto the provided edit config
         try:
             cfg_loaded: dict[str, Any] = _load_yaml(edited_yaml)
         except Exception as exc:  # noqa: BLE001
-            console.print(f"[red]❌ Failed to load edit config: {exc}")
+            print_event(
+                console,
+                "FAIL",
+                f"Failed to load edit config: {exc}",
+                style=output_style,
+                emoji="❌",
+            )
             raise typer.Exit(1) from exc
 
         # Ensure model.id/adapter point to the requested subject
@@ -268,23 +688,58 @@ def certify_command(
                 "context": {"profile": profile, "tier": tier},
             },
         )
+        # Ensure the edited run always has a guard chain. Presets/edit configs
+        # often omit it, but `invarlock run` expects guards.order.
+        guards_block = merged_edited_cfg.get("guards")
+        guards_order_cfg = (
+            guards_block.get("order") if isinstance(guards_block, dict) else None
+        )
+        if not (
+            isinstance(guards_order_cfg, list)
+            and guards_order_cfg
+            and all(isinstance(item, str) for item in guards_order_cfg)
+        ):
+            merged_edited_cfg = _merge(
+                merged_edited_cfg, {"guards": {"order": guards_order}}
+            )
 
         # Persist a temporary merged config for traceability
         tmp_dir = Path(".certify_tmp")
         tmp_dir.mkdir(parents=True, exist_ok=True)
         edited_merged_yaml = tmp_dir / "edited_merged.yaml"
         _dump_yaml(edited_merged_yaml, merged_edited_cfg)
+        _debug(f"Edited config (merged): {edited_merged_yaml}")
 
         from .run import run_command as _run
 
-        _run(
-            config=str(edited_merged_yaml),
-            profile=profile,
-            out=str(Path(out) / "edited"),
-            tier=tier,
-            baseline=str(baseline_report),
-            device=device,
-        )
+        with _suppress_child_output(verbosity == VERBOSITY_QUIET) as quiet_buffer:
+            try:
+                with timed_step(
+                    console=console,
+                    style=output_style,
+                    timings=timings,
+                    key="subject",
+                    tag="EXEC",
+                    message="Subject",
+                    emoji="✂️",
+                ):
+                    _run(
+                        config=str(edited_merged_yaml),
+                        profile=profile,
+                        out=str(Path(out) / "edited"),
+                        tier=tier,
+                        baseline=str(baseline_report_path),
+                        device=device,
+                        edit_label=subject_label if edit_label else None,
+                        style=output_style.name,
+                        progress=progress,
+                        timing=False,
+                        no_color=no_color,
+                    )
+            except Exception:
+                if quiet_buffer is not None:
+                    console.print(quiet_buffer.getvalue(), markup=False)
+                raise
     else:
         edited_cfg = _merge(
             preset_data,
@@ -292,50 +747,121 @@ def certify_command(
                 "model": {"id": norm_edt_id, "adapter": eff_adapter},
                 "edit": {"name": "noop", "plan": {}},
                 "eval": {},
-                "guards": {
-                    "order": [
-                        "invariants",
-                        "spectral",
-                        "rmt",
-                        "variance",
-                        "invariants",
-                    ]
-                },
+                "guards": {"order": guards_order},
                 "output": {"dir": str(Path(out) / "edited")},
                 "context": {"profile": profile, "tier": tier},
             },
         )
         edited_yaml = tmp_dir / "edited_noop.yaml"
         _dump_yaml(edited_yaml, edited_cfg)
-        console.print("🧪 Running edited (no-op, Compare & Certify)")
+        _info("Running edited (no-op, Compare & Certify)", tag="EXEC", emoji="🧪")
+        _debug(f"Edited config: {edited_yaml}")
         from .run import run_command as _run
 
-        _run(
-            config=str(edited_yaml),
-            profile=profile,
-            out=str(Path(out) / "edited"),
-            tier=tier,
-            baseline=str(baseline_report),
-            device=device,
-        )
+        with _suppress_child_output(verbosity == VERBOSITY_QUIET) as quiet_buffer:
+            try:
+                with timed_step(
+                    console=console,
+                    style=output_style,
+                    timings=timings,
+                    key="subject",
+                    tag="EXEC",
+                    message="Subject",
+                    emoji="🧪",
+                ):
+                    _run(
+                        config=str(edited_yaml),
+                        profile=profile,
+                        out=str(Path(out) / "edited"),
+                        tier=tier,
+                        baseline=str(baseline_report_path),
+                        device=device,
+                        edit_label=subject_label,
+                        style=output_style.name,
+                        progress=progress,
+                        timing=False,
+                        no_color=no_color,
+                    )
+            except Exception:
+                if quiet_buffer is not None:
+                    console.print(quiet_buffer.getvalue(), markup=False)
+                raise
 
     edited_report = _latest_run_report(Path(out) / "edited")
     if not edited_report:
-        console.print("[red]❌ Could not locate edited report after run")
+        print_event(
+            console,
+            "FAIL",
+            "Could not locate edited report after run",
+            style=output_style,
+            emoji="❌",
+        )
         raise typer.Exit(1)
+    _debug(f"Edited report: {edited_report}")
+
+    _phase(3, 3, "CERTIFICATE GENERATION")
+
+    def _emit_certificate() -> None:
+        _info("Emitting certificate", tag="EXEC", emoji="📜")
+        with _suppress_child_output(verbosity == VERBOSITY_QUIET) as quiet_buffer:
+            try:
+                with timed_step(
+                    console=console,
+                    style=output_style,
+                    timings=timings,
+                    key="certificate",
+                    tag="EXEC",
+                    message="Certificate",
+                    emoji="📜",
+                ):
+                    report_kwargs = {
+                        "run": str(edited_report),
+                        "format": "cert",
+                        "baseline": str(baseline_report_path),
+                        "output": cert_out,
+                        "style": output_style.name,
+                        "no_color": no_color,
+                    }
+                    try:
+                        sig = inspect.signature(_report)
+                    except (TypeError, ValueError):
+                        _report(**report_kwargs)
+                    else:
+                        if any(
+                            param.kind == inspect.Parameter.VAR_KEYWORD
+                            for param in sig.parameters.values()
+                        ):
+                            _report(**report_kwargs)
+                        else:
+                            _report(
+                                **{
+                                    key: value
+                                    for key, value in report_kwargs.items()
+                                    if key in sig.parameters
+                                }
+                            )
+            except Exception:
+                if quiet_buffer is not None:
+                    console.print(quiet_buffer.getvalue(), markup=False)
+                raise
 
     # CI/Release hard‑abort: fail fast when primary metric is not computable.
-    # Fall back to legacy ppl_* keys when primary_metric block is absent.
     try:
         prof = str(profile or "").strip().lower()
     except Exception:
         prof = ""
-    if prof in {"ci", "release"}:
+    if prof in {"ci", "ci_cpu", "release"}:
        try:
            with Path(edited_report).open("r", encoding="utf-8") as fh:
                edited_payload = json.load(fh)
        except Exception as exc:  # noqa: BLE001
-            console.print(f"[red]❌ Failed to read edited report: {exc}")
+            print_event(
+                console,
+                "FAIL",
+                f"Failed to read edited report: {exc}",
+                style=output_style,
+                emoji="❌",
+            )
             raise typer.Exit(1) from exc
 
         def _finite(x: Any) -> bool:
@@ -364,40 +890,74 @@ def certify_command(
             else None
         ) or "unknown"
 
-        # Enforce only when a metric block is present; skip for minimal stub reports
-        # Enforce only when a primary_metric block is present
+        # Enforce only when a primary_metric block is present; allow
+        # degraded-but-flagged metrics to emit certificates, but fail the task.
         has_metric_block = isinstance(pm, dict) and bool(pm)
         if has_metric_block:
-            # Treat non‑finite PM as hard error in CI/Release (after legacy fallback).
-            # Require a finite final value; preview is optional for legacy reports.
-            if not _finite(pm_final):
-                err = InvarlockError(
+            degraded = bool(pm.get("invalid") or pm.get("degraded"))
+            if degraded or not _finite(pm_final):
+                fallback = pm_prev if _finite(pm_prev) else pm_final
+                if not _finite(fallback) or fallback <= 0:
+                    fallback = 1.0
+                degraded_reason = pm.get("degraded_reason") or (
+                    "non_finite_pm"
+                    if (not _finite(pm_prev) or not _finite(pm_final))
+                    else "primary_metric_degraded"
+                )
+                print_event(
+                    console,
+                    "WARN",
+                    "Primary metric degraded or non-finite; emitting certificate "
+                    "and marking task degraded. Primary metric computation failed.",
+                    style=output_style,
+                    emoji="⚠️",
+                )
+                pm["degraded"] = True
+                pm["invalid"] = pm.get("invalid") or True
+                pm["preview"] = pm_prev if _finite(pm_prev) else fallback
+                pm["final"] = pm_final if _finite(pm_final) else fallback
+                pm["ratio_vs_baseline"] = pm_ratio if _finite(pm_ratio) else 1.0
+                pm["degraded_reason"] = degraded_reason
+                metrics["primary_metric"] = pm
+                edited_payload.setdefault("metrics", {}).update(metrics)
+
+                # Emit the certificate for inspection, then exit with a CI-visible error.
+                _emit_certificate()
+                err = MetricsError(
                     code="E111",
-                    message=(
-                        "Primary metric computation failed (NaN/inf). "
-                        f"Context: device={device}, adapter={adapter_name}, edit={edit_name}. "
-                        "Baseline ok; edited failed to compute ppl. "
-                        "Try: use an accelerator (mps/cuda), force float32, reduce max_modules, "
-                        "or lower the evaluation batch size."
-                    ),
+                    message=f"Primary metric degraded or non-finite ({degraded_reason}).",
                     details={
-                        "device": device,
+                        "reason": degraded_reason,
                         "adapter": adapter_name,
+                        "device": device,
                         "edit": edit_name,
-                        "pm_preview": pm_prev,
-                        "pm_final": pm_final,
-                        "pm_ratio": pm_ratio,
                     },
                 )
-                code = _resolve_exit_code(err, profile=prof)
-                console.print(f"[red]{err}[/red]")
-                # Do not emit a certificate
-                raise typer.Exit(code)
-
-    console.print("📜 Emitting certificate")
-    _report(
-        run=str(edited_report),
-        format="cert",
-        baseline=str(baseline_report),
-        output=cert_out,
-    )
+                raise typer.Exit(_resolve_exit_code(err, profile=profile))
+
+    _emit_certificate()
+    if timing:
+        if total_start is not None:
+            timings["total"] = max(0.0, float(perf_counter() - total_start))
+        else:
+            timings["total"] = (
+                float(timings.get("baseline", 0.0))
+                + float(timings.get("subject", 0.0))
+                + float(timings.get("certificate", 0.0))
+            )
+        print_timing_summary(
+            console,
+            timings,
+            style=output_style,
+            order=[
+                ("Baseline", "baseline"),
+                ("Subject", "subject"),
+                ("Certificate", "certificate"),
+                ("Total", "total"),
+            ],
+        )
+    if verbosity == VERBOSITY_QUIET:
+        _print_quiet_summary(
+            cert_out=Path(cert_out),
+            source=src_id,
+            edited=edt_id,
+            profile=profile,
+        )
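
A pattern worth calling out from the diff: quiet mode silences the child `run`/`report` commands by temporarily swapping each module's global Rich `console` for one backed by an in-memory buffer (`_override_console` / `_suppress_child_output`), and replays the buffer only when a child step raises. A self-contained sketch of the same technique, with a `SimpleNamespace` standing in for a child module (nothing below is an invarlock API):

```python
import io
from collections.abc import Iterator
from contextlib import contextmanager
from types import SimpleNamespace

from rich.console import Console

# Stand-in for a child command module that prints via a module-level console.
child = SimpleNamespace(console=Console())


@contextmanager
def override_console(module, new_console: Console) -> Iterator[None]:
    # Same shape as _override_console in the diff: swap the attribute, restore on exit.
    original = getattr(module, "console", None)
    module.console = new_console
    try:
        yield
    finally:
        module.console = original


buffer = io.StringIO()
quiet = Console(file=buffer, force_terminal=False, color_system=None)
try:
    with override_console(child, quiet):
        child.console.print("detailed child output")  # captured, not shown
except Exception:
    print(buffer.getvalue())  # replay captured output only on failure
    raise
```

Replaying the buffer in the `except` branch is what keeps `--quiet` runs terse without losing diagnostics: each `_run` call above prints `quiet_buffer.getvalue()` before re-raising.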