invarlock-0.3.5-py3-none-any.whl → invarlock-0.3.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. invarlock/__init__.py +2 -2
  2. invarlock/_data/runtime/tiers.yaml +57 -30
  3. invarlock/adapters/__init__.py +11 -15
  4. invarlock/adapters/auto.py +35 -40
  5. invarlock/adapters/capabilities.py +2 -2
  6. invarlock/adapters/hf_causal.py +418 -0
  7. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  8. invarlock/adapters/hf_mixin.py +25 -4
  9. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  10. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  11. invarlock/calibration/spectral_null.py +15 -10
  12. invarlock/calibration/variance_ve.py +0 -2
  13. invarlock/cli/adapter_auto.py +31 -21
  14. invarlock/cli/app.py +73 -2
  15. invarlock/cli/commands/calibrate.py +6 -2
  16. invarlock/cli/commands/certify.py +651 -91
  17. invarlock/cli/commands/doctor.py +11 -11
  18. invarlock/cli/commands/explain_gates.py +57 -8
  19. invarlock/cli/commands/plugins.py +13 -9
  20. invarlock/cli/commands/report.py +233 -69
  21. invarlock/cli/commands/run.py +1066 -244
  22. invarlock/cli/commands/verify.py +154 -15
  23. invarlock/cli/config.py +22 -6
  24. invarlock/cli/doctor_helpers.py +4 -5
  25. invarlock/cli/output.py +193 -0
  26. invarlock/cli/provenance.py +1 -1
  27. invarlock/core/api.py +45 -5
  28. invarlock/core/auto_tuning.py +65 -20
  29. invarlock/core/bootstrap.py +1 -1
  30. invarlock/core/contracts.py +7 -1
  31. invarlock/core/registry.py +11 -13
  32. invarlock/core/runner.py +425 -75
  33. invarlock/edits/quant_rtn.py +65 -37
  34. invarlock/eval/bench.py +3 -16
  35. invarlock/eval/data.py +82 -51
  36. invarlock/eval/metrics.py +63 -2
  37. invarlock/eval/primary_metric.py +23 -0
  38. invarlock/eval/tail_stats.py +230 -0
  39. invarlock/eval/tasks/__init__.py +12 -0
  40. invarlock/eval/tasks/classification.py +48 -0
  41. invarlock/eval/tasks/qa.py +36 -0
  42. invarlock/eval/tasks/text_generation.py +102 -0
  43. invarlock/guards/_estimators.py +154 -0
  44. invarlock/guards/invariants.py +19 -10
  45. invarlock/guards/policies.py +16 -6
  46. invarlock/guards/rmt.py +627 -546
  47. invarlock/guards/spectral.py +348 -110
  48. invarlock/guards/tier_config.py +32 -30
  49. invarlock/guards/variance.py +7 -31
  50. invarlock/guards_ref/rmt_ref.py +23 -23
  51. invarlock/model_profile.py +90 -42
  52. invarlock/observability/health.py +6 -6
  53. invarlock/observability/metrics.py +108 -0
  54. invarlock/reporting/certificate.py +384 -55
  55. invarlock/reporting/certificate_schema.py +3 -2
  56. invarlock/reporting/dataset_hashing.py +15 -2
  57. invarlock/reporting/guards_analysis.py +350 -277
  58. invarlock/reporting/html.py +55 -5
  59. invarlock/reporting/normalizer.py +13 -0
  60. invarlock/reporting/policy_utils.py +38 -36
  61. invarlock/reporting/primary_metric_utils.py +71 -17
  62. invarlock/reporting/render.py +852 -431
  63. invarlock/reporting/report.py +40 -4
  64. invarlock/reporting/report_types.py +11 -3
  65. invarlock/reporting/telemetry.py +86 -0
  66. invarlock/reporting/validate.py +1 -18
  67. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/METADATA +27 -13
  68. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/RECORD +72 -65
  69. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
  70. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
  71. invarlock/adapters/hf_gpt2.py +0 -404
  72. invarlock/adapters/hf_llama.py +0 -487
  73. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
  74. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
@@ -17,8 +17,10 @@ import random
 import shutil
 import sys as _sys
 import types as _types
+import warnings
 from array import array
-from collections.abc import Iterable, Sequence
+from collections.abc import Callable, Iterable, Iterator, Sequence
+from contextlib import contextmanager
 from datetime import datetime
 from pathlib import Path
 from types import SimpleNamespace
@@ -30,6 +32,16 @@ import psutil
 import typer
 from rich.console import Console
 
+from invarlock.cli.output import (
+    OutputStyle,
+    make_console,
+    perf_counter,
+    print_event,
+    print_timing_summary,
+    resolve_output_style,
+    timed_step,
+)
+
 try:
     import torch
 except ImportError:
@@ -63,7 +75,42 @@ from ..config import (
 )
 from ..overhead_utils import _extract_pm_snapshot_for_overhead
 
-console = Console()
+console = make_console()
+
+
+def _style_from_console(console: Console, profile: str | None = None) -> OutputStyle:
+    style = getattr(console, "_invarlock_output_style", None)
+    if isinstance(style, OutputStyle):
+        return style
+    return resolve_output_style(
+        style=None,
+        profile=profile,
+        progress=False,
+        timing=False,
+        no_color=False,
+    )
+
+
+def _event(
+    console: Console,
+    tag: str,
+    message: str,
+    *,
+    emoji: str | None = None,
+    console_style: str | None = None,
+    profile: str | None = None,
+) -> None:
+    style = _style_from_console(console, profile=profile)
+    print_event(
+        console,
+        tag,
+        message,
+        style=style,
+        emoji=emoji,
+        console_style=console_style,
+    )
+
+
 LIGHT_IMPORT = os.getenv("INVARLOCK_LIGHT_IMPORT", "").strip().lower() in {
     "1",
     "true",
@@ -76,6 +123,73 @@ RELEASE_MIN_WINDOWS_PER_ARM = 200
 RELEASE_CALIBRATION_MIN = 16
 RELEASE_CALIBRATION_MAX = 24
 GUARD_OVERHEAD_THRESHOLD = 0.01
+KV_LABEL_WIDTH = 10
+
+_NOISY_WARNING_PATTERNS = (
+    r".*`torch_dtype` is deprecated.*",
+    r".*loss_type=None.*unrecognized.*",
+)
+
+
+def _resolve_warning_suppression(profile: str | None) -> tuple[bool, bool]:
+    suppress_all = os.getenv("INVARLOCK_SUPPRESS_WARNINGS", "").strip().lower() in {
+        "1",
+        "true",
+        "yes",
+        "on",
+    }
+    profile_norm = (profile or "").strip().lower()
+    enabled = bool(suppress_all) or profile_norm in {"ci", "ci_cpu", "release", "dev"}
+    return enabled, suppress_all
+
+
+def _apply_warning_filters(profile: str | None) -> bool:
+    enabled, suppress_all = _resolve_warning_suppression(profile)
+    if not enabled:
+        return False
+    if suppress_all:
+        warnings.simplefilter("ignore")
+    else:
+        for pattern in _NOISY_WARNING_PATTERNS:
+            warnings.filterwarnings("ignore", message=pattern)
+    return True
+
+
+@contextmanager
+def _suppress_noisy_warnings(profile: str | None) -> Iterator[None]:
+    enabled, _suppress_all = _resolve_warning_suppression(profile)
+    if not enabled:
+        yield
+        return
+    with warnings.catch_warnings():
+        _apply_warning_filters(profile)
+        yield
+
+
+def _format_kv_line(label: str, value: str, *, width: int = KV_LABEL_WIDTH) -> str:
+    return f" {label:<{width}}: {value}"
+
+
+def _device_resolution_note(target_device: str, resolved_device: str) -> str:
+    target_norm = str(target_device or "").strip().lower()
+    resolved_norm = str(resolved_device or "").strip().lower()
+    if not target_norm or target_norm == "auto":
+        return "auto-resolved"
+    if target_norm == resolved_norm:
+        return "requested"
+    return f"resolved from {target_device}"
+
+
+def _format_guard_chain(guards: list[Any]) -> str:
+    names = [str(getattr(guard, "name", "unknown")) for guard in guards]
+    seen: set[str] = set()
+    deduped: list[str] = []
+    for name in names:
+        if name in seen:
+            continue
+        seen.add(name)
+        deduped.append(name)
+    return " → ".join(deduped)
 
 
 # Common dataset split aliases we probe in order when not explicitly set
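
The new suppression helpers compose the stdlib `warnings` machinery: `_suppress_noisy_warnings` scopes the filters with `warnings.catch_warnings()` so noisy loader messages are muted only around the wrapped call, while `_apply_warning_filters` installs them for the whole process. A minimal standalone sketch of the same pattern (the pattern list and the enabling flag here are illustrative, not the package's API):

    import warnings
    from collections.abc import Iterator
    from contextlib import contextmanager

    NOISY = (r".*`torch_dtype` is deprecated.*",)

    @contextmanager
    def suppress_noisy(enabled: bool) -> Iterator[None]:
        if not enabled:
            yield
            return
        # catch_warnings() restores the previous filter state on exit.
        with warnings.catch_warnings():
            for pattern in NOISY:
                warnings.filterwarnings("ignore", message=pattern)
            yield

    with suppress_noisy(enabled=True):
        warnings.warn("`torch_dtype` is deprecated for this loader")  # muted
    warnings.warn("unrelated warning")  # still emitted

Because the filters are scoped, anything warned outside the context manager still reaches the user, which is why the release/CI profiles can enable this without hiding genuine problems elsewhere in the run.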
@@ -108,6 +222,64 @@ def _coerce_mapping(obj: object) -> dict[str, Any]:
     return {}
 
 
+def _prune_none_values(value: Any) -> Any:
+    """Recursively drop keys/items whose value is None.
+
+    Used when serializing dataclass-style config sections that define many optional
+    fields defaulting to None; those should behave as "unset" rather than explicit
+    policy overrides.
+    """
+
+    if isinstance(value, dict):
+        return {
+            key: _prune_none_values(val)
+            for key, val in value.items()
+            if val is not None
+        }
+    if isinstance(value, list):
+        return [_prune_none_values(item) for item in value if item is not None]
+    if isinstance(value, tuple):
+        return tuple(_prune_none_values(item) for item in value if item is not None)
+    return value
+
+
+def _to_serialisable_dict(section: object) -> dict[str, Any]:
+    """Coerce config fragments to plain dicts.
+
+    Handles InvarLockConfig sections (which wrap dicts in a private `_Obj` with
+    `_data`) so downstream components (core.runner) see canonical mappings,
+    e.g. `eval.bootstrap.replicates`.
+    """
+
+    # Prefer native dump methods
+    if hasattr(section, "model_dump"):
+        return section.model_dump()  # type: ignore[return-value]
+    if hasattr(section, "dict"):
+        try:
+            return section.dict()  # type: ignore[return-value]
+        except Exception:
+            pass
+    # Unwrap CLI _Obj wrapper used by InvarLockConfig for attribute access
+    try:
+        raw = getattr(section, "_data", None)
+        if isinstance(raw, dict):
+            return raw
+    except Exception:
+        pass
+    # Already a mapping
+    if isinstance(section, dict):
+        return section
+    # Best-effort attribute dump (prune None so "unset" does not override tier defaults)
+    try:
+        data = vars(section)
+        # Common case: {'_data': {...}}
+        if isinstance(data, dict) and isinstance(data.get("_data"), dict):
+            return data["_data"]
+        return _prune_none_values(data)  # type: ignore[return-value]
+    except TypeError:
+        return {}
+
+
 def _resolve_pm_acceptance_range(
     cfg: InvarLockConfig | dict[str, Any] | None,
 ) -> dict[str, float]:
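
`_prune_none_values` is what lets optional config fields behave as "unset": any None-valued key disappears before the section is merged over tier defaults. A small standalone mirror of the helper, with made-up keys, shows the semantics:

    from typing import Any

    def prune_none(value: Any) -> Any:
        # Mirrors _prune_none_values: drop None-valued keys/items recursively.
        if isinstance(value, dict):
            return {k: prune_none(v) for k, v in value.items() if v is not None}
        if isinstance(value, (list, tuple)):
            return type(value)(prune_none(v) for v in value if v is not None)
        return value

    section = {"min": None, "max": 1.05, "nested": {"enabled": True, "limit": None}}
    print(prune_none(section))  # {'max': 1.05, 'nested': {'enabled': True}}

Without the pruning, a dataclass-style section dumped via `vars()` would carry every defaulted-None field as an explicit override and silently clobber the tier configuration downstream.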
@@ -183,6 +355,89 @@ def _resolve_pm_acceptance_range(
     return {"min": float(min_val), "max": float(max_val)}
 
 
+def _resolve_pm_drift_band(
+    cfg: InvarLockConfig | dict[str, Any] | None,
+) -> dict[str, float]:
+    """Resolve preview→final drift band from config/env with safe defaults.
+
+    The drift band governs the Preview Final Drift Acceptable gate. By default,
+    certificates enforce 0.95–1.05 unless an explicit band is provided.
+    """
+
+    base_min = 0.95
+    base_max = 1.05
+
+    cfg_min = None
+    cfg_max = None
+    try:
+        cfg_map = _coerce_mapping(cfg) if cfg is not None else {}
+        pm_section = cfg_map.get("primary_metric") if isinstance(cfg_map, dict) else {}
+        pm_map = _coerce_mapping(pm_section)
+        drift_band = pm_map.get("drift_band") if isinstance(pm_map, dict) else None
+        if isinstance(drift_band, dict):
+            if drift_band.get("min") is not None:
+                try:
+                    cfg_min = float(drift_band["min"])
+                except (TypeError, ValueError):
+                    cfg_min = None
+            if drift_band.get("max") is not None:
+                try:
+                    cfg_max = float(drift_band["max"])
+                except (TypeError, ValueError):
+                    cfg_max = None
+        elif isinstance(drift_band, list | tuple) and len(drift_band) == 2:
+            try:
+                cfg_min = float(drift_band[0])
+                cfg_max = float(drift_band[1])
+            except (TypeError, ValueError):
+                cfg_min = None
+                cfg_max = None
+    except Exception:
+        cfg_min = None
+        cfg_max = None
+
+    def _parse_env(name: str) -> float | None:
+        try:
+            raw = os.environ.get(name, "")
+            if raw is None or str(raw).strip() == "":
+                return None
+            return float(raw)
+        except Exception:
+            return None
+
+    env_min = _parse_env("INVARLOCK_PM_DRIFT_MIN")
+    env_max = _parse_env("INVARLOCK_PM_DRIFT_MAX")
+
+    has_explicit = any(v is not None for v in (cfg_min, cfg_max, env_min, env_max))
+    if not has_explicit:
+        return {}
+
+    min_val = (
+        env_min if env_min is not None else cfg_min if cfg_min is not None else base_min
+    )
+    max_val = (
+        env_max if env_max is not None else cfg_max if cfg_max is not None else base_max
+    )
+
+    try:
+        if min_val is not None and min_val <= 0:
+            min_val = base_min
+    except Exception:
+        min_val = base_min
+    try:
+        if max_val is not None and max_val <= 0:
+            max_val = base_max
+    except Exception:
+        max_val = base_max
+    try:
+        if min_val is not None and max_val is not None and min_val >= max_val:
+            min_val, max_val = base_min, base_max
+    except Exception:
+        min_val, max_val = base_min, base_max
+
+    return {"min": float(min_val), "max": float(max_val)}
+
+
 def _free_model_memory(model: object | None) -> None:
     """Best-effort cleanup to release GPU memory for a model object."""
     if model is None:
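
Resolution order for the band is env var over config over the 0.95–1.05 defaults, and the helper deliberately returns an empty dict when nothing explicit is set so callers can tell "defaulted" apart from "configured". A condensed sketch of that precedence, without the full function's defensive parsing:

    import os

    def resolve_band(cfg_min=None, cfg_max=None, base=(0.95, 1.05)):
        def env(name):
            raw = os.environ.get(name, "").strip()
            try:
                return float(raw) if raw else None
            except ValueError:
                return None

        env_min, env_max = env("INVARLOCK_PM_DRIFT_MIN"), env("INVARLOCK_PM_DRIFT_MAX")
        if all(v is None for v in (cfg_min, cfg_max, env_min, env_max)):
            return {}  # nothing explicit: caller keeps certificate defaults
        lo = env_min if env_min is not None else cfg_min if cfg_min is not None else base[0]
        hi = env_max if env_max is not None else cfg_max if cfg_max is not None else base[1]
        if lo <= 0 or hi <= 0 or lo >= hi:
            lo, hi = base  # an invalid band collapses to the safe default
        return {"min": lo, "max": hi}

    os.environ["INVARLOCK_PM_DRIFT_MAX"] = "1.10"
    print(resolve_band(cfg_min=0.97))  # {'min': 0.97, 'max': 1.1}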
@@ -296,7 +551,7 @@ def _resolve_exit_code(exc: Exception, *, profile: str | None) -> int:
     return 1
 
 
-## NOTE: Deprecated legacy helper `_check_pairability_or_abort` was removed.
+## NOTE: Deprecated helper `_check_pairability_or_abort` was removed.
 ## Provider parity and pairing guarantees are enforced via guard digests and
 ## invariant checks during run execution.
 
@@ -696,38 +951,60 @@ def _prepare_config_for_run(
        resolve_edit_kind as _resolve_edit_kind,
    )
 
-    console.print(f"📋 Loading configuration: {config_path}")
+    _event(
+        console,
+        "INIT",
+        f"Loading configuration: {config_path}",
+        emoji="📋",
+        profile=profile,
+    )
     cfg = _load_config(config_path)
 
     # Apply profile if specified (dev is a no-op)
-    if profile and str(profile).lower() in {"ci", "release"}:
-        console.print(f"🎯 Applying profile: {profile}")
+    if profile and str(profile).lower() not in {"dev"}:
+        _event(
+            console, "INIT", f"Applying profile: {profile}", emoji="🎯", profile=profile
+        )
         try:
            cfg = _apply_profile(cfg, profile)
        except Exception as exc:
-            console.print(f"[red]{exc}[/red]")
+            _event(console, "FAIL", str(exc), emoji="❌", profile=profile)
            raise typer.Exit(1) from exc
 
     # Apply edit override
     if edit:
        try:
            edit_name = _resolve_edit_kind(edit)
-            console.print(f"✂️ Edit override: {edit} → {edit_name}")
+            _event(
+                console,
+                "EXEC",
+                f"Edit override: {edit} → {edit_name}",
+                emoji="✂️",
+                profile=profile,
+            )
            cfg = _apply_edit_override(cfg, edit)
        except ValueError as e:
-            console.print(f"[red]{e}[/red]")
+            _event(console, "FAIL", str(e), emoji="❌", profile=profile)
            raise typer.Exit(1) from e
 
     # Apply CLI overrides for auto configuration
     if tier or probes is not None:
        if tier and tier not in ["conservative", "balanced", "aggressive", "none"]:
-            console.print(
-                f"[red]❌ Invalid tier '{tier}'. Valid options: conservative, balanced, aggressive, none[/red]"
+            _event(
+                console,
+                "FAIL",
+                f"Invalid tier '{tier}'. Valid options: conservative, balanced, aggressive, none",
+                emoji="❌",
+                profile=profile,
            )
            raise typer.Exit(1)
        if probes is not None and (probes < 0 or probes > 10):
-            console.print(
-                f"[red]❌ Invalid probes '{probes}'. Must be between 0 and 10[/red]"
+            _event(
+                console,
+                "FAIL",
+                f"Invalid probes '{probes}'. Must be between 0 and 10",
+                emoji="❌",
+                profile=profile,
            )
            raise typer.Exit(1)
 
@@ -738,10 +1015,22 @@ def _prepare_config_for_run(
        cfg_dict["auto"] = auto_section
        if tier:
            auto_section["tier"] = tier
-            console.print(f"🎛️ Auto tier override: {tier}")
+            _event(
+                console,
+                "INIT",
+                f"Auto tier override: {tier}",
+                emoji="🎛️",
+                profile=profile,
+            )
        if probes is not None:
            auto_section["probes"] = probes
-            console.print(f"🔬 Auto probes override: {probes}")
+            _event(
+                console,
+                "INIT",
+                f"Auto probes override: {probes}",
+                emoji="🔬",
+                profile=profile,
+            )
        cfg = InvarLockConfig(cfg_dict)
 
     # Resolve adapter:auto to a concrete built-in adapter if requested
@@ -774,7 +1063,7 @@ def _maybe_plan_release_windows(
 
 
 def _print_pipeline_start(console: Console) -> None:
-    console.print("🚀 Starting InvarLock pipeline...")
+    _event(console, "INIT", "Starting InvarLock pipeline...", emoji="🚀")
 
 
 def _emit_run_artifacts(
@@ -783,7 +1072,7 @@ def _emit_run_artifacts(
     """Save run report and return emitted artifact paths."""
     from invarlock.reporting.report import save_report as _save_report
 
-    console.print("💾 Saving run report...")
+    _event(console, "DATA", "Saving run report...", emoji="💾")
     return _save_report(
        report, out_dir, formats=["json"], filename_prefix=filename_prefix
    )
@@ -806,25 +1095,21 @@ def _resolve_device_and_output(
        cfg_device = None
     target_device = device or cfg_device or "auto"
     resolved_device = _resolve_device(target_device)
-    console.print(
-        f"Device: {resolved_device} (requested={target_device}, resolved={resolved_device})"
-    )
+    resolution_note = _device_resolution_note(target_device, resolved_device)
+    console.print(_format_kv_line("Device", f"{resolved_device} ({resolution_note})"))
     is_valid, error_msg = _validate(resolved_device)
     if not is_valid:
-        console.print(f"[red]❌ Device validation failed: {error_msg}[/red]")
+        _event(console, "FAIL", f"Device validation failed: {error_msg}", emoji="❌")
        raise typer.Exit(1)
 
-    # Determine output directory (support both 'output.dir' and legacy 'out.dir')
+    # Determine output directory
     if out:
        output_dir = Path(out)
     else:
        try:
            output_dir = Path(cfg.output.dir)
        except Exception:
-            try:
-                output_dir = Path(cfg.out.dir)  # type: ignore[attr-defined]
-            except Exception:
-                output_dir = Path("runs")
+            output_dir = Path("runs")
     output_dir.mkdir(parents=True, exist_ok=True)
     return str(resolved_device), output_dir
 
@@ -837,6 +1122,7 @@ def _resolve_provider_and_split(
     provider_kwargs: dict[str, Any] | None = None,
     console: Console,
     resolved_device: str | None = None,
+    emit: Callable[[str, str, str | None], None] | None = None,
 ) -> tuple[Any, str, bool]:
     """Resolve dataset provider and split, returning (provider, split, used_fallback)."""
     provider_name = None
@@ -863,7 +1149,10 @@ def _resolve_provider_and_split(
     # Pass device hint only to providers that understand it (currently WikiText-2)
     if resolved_device and provider_name == "wikitext2":
        provider_kwargs.setdefault("device_hint", resolved_device)
-    data_provider = get_provider_fn(provider_name, **provider_kwargs)
+    if emit is not None and provider_name == "wikitext2":
+        data_provider = get_provider_fn(provider_name, emit=emit, **provider_kwargs)
+    else:
+        data_provider = get_provider_fn(provider_name, **provider_kwargs)
 
     requested_split = None
     try:
@@ -917,7 +1206,13 @@ def _extract_model_load_kwargs(cfg: InvarLockConfig) -> dict[str, Any]:
     return extra
 
 
-def _load_model_with_cfg(adapter: Any, cfg: InvarLockConfig, device: str) -> Any:
+def _load_model_with_cfg(
+    adapter: Any,
+    cfg: InvarLockConfig,
+    device: str,
+    *,
+    profile: str | None = None,
+) -> Any:
     """Load a model with config-provided kwargs, filtering for strict adapters."""
     try:
        model_id = cfg.model.id
@@ -930,20 +1225,21 @@ def _load_model_with_cfg(adapter: Any, cfg: InvarLockConfig, device: str) -> Any
        raise ValueError("Missing model.id in config")
 
     extra = _extract_model_load_kwargs(cfg)
-    try:
-        sig = inspect.signature(adapter.load_model)
-        accepts_var_kw = any(
-            p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()
-        )
-        if accepts_var_kw:
-            return adapter.load_model(model_id, device=device, **extra)
-        allowed = {k: v for k, v in extra.items() if k in sig.parameters}
-        if allowed:
-            return adapter.load_model(model_id, device=device, **allowed)
-    except Exception:
-        # Fall back to the strictest call shape.
-        pass
-    return adapter.load_model(model_id, device=device)
+    with _suppress_noisy_warnings(profile):
+        try:
+            sig = inspect.signature(adapter.load_model)
+            accepts_var_kw = any(
+                p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()
+            )
+            if accepts_var_kw:
+                return adapter.load_model(model_id, device=device, **extra)
+            allowed = {k: v for k, v in extra.items() if k in sig.parameters}
+            if allowed:
+                return adapter.load_model(model_id, device=device, **allowed)
+        except Exception:
+            # Fall back to the strictest call shape.
+            pass
+        return adapter.load_model(model_id, device=device)
 
 
 def _run_bare_control(
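
The body above is a reusable pattern for calling `load_model` implementations with heterogeneous signatures: forward everything to loaders that accept `**kwargs`, and only the recognized names to strict ones. A self-contained sketch of the technique (both loaders here are illustrative, not the package's adapters):

    import inspect

    def call_with_supported_kwargs(fn, *args, **extra):
        sig = inspect.signature(fn)
        if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()):
            return fn(*args, **extra)  # loader accepts **kwargs: forward everything
        allowed = {k: v for k, v in extra.items() if k in sig.parameters}
        return fn(*args, **allowed)  # strict loader: drop unknown keys

    def lenient(model_id, device="cpu", **kwargs):
        return (model_id, device, kwargs)

    def strict(model_id, device="cpu", dtype=None):
        return (model_id, device, dtype)

    print(call_with_supported_kwargs(lenient, "m", device="cpu", dtype="fp16", foo=1))
    print(call_with_supported_kwargs(strict, "m", device="cpu", dtype="fp16", foo=1))

The `strict` call silently loses `foo=1` rather than raising a `TypeError`, which is exactly the trade-off the helper makes so one code path can drive every adapter.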
@@ -963,14 +1259,20 @@ def _run_bare_control(
     restore_fn: Any | None,
     console: Console,
     resolved_loss_type: str,
-    profile_normalized: str | None,
+    profile_normalized: str | None = None,
     snapshot_provenance: dict[str, bool] | None = None,
     skip_model_load: bool = False,
 ) -> dict[str, Any] | None:
     """Execute the bare-control run for overhead estimation and return payload."""
     from invarlock.core.runner import CoreRunner as _CoreRunner
 
-    console.print("🧪 Running bare control (guards disabled) for overhead check")
+    _event(
+        console,
+        "EXEC",
+        "Running bare control (guards disabled) for overhead check",
+        emoji="🧪",
+        profile=profile_normalized,
+    )
     set_seed(seed_bundle["python"])  # type: ignore[arg-type]
 
     bare_runner = _CoreRunner()
@@ -979,6 +1281,12 @@ def _run_bare_control(
     bare_context = copy.deepcopy(run_config.context)
     bare_context.setdefault("validation", {})["guard_overhead_mode"] = "bare"
     bare_config.context = bare_context
+    runtime_edit_config = dict(edit_config or {})
+    runtime_edit_config.setdefault("console", console)
+    runtime_edit_config.setdefault(
+        "output_style", _style_from_console(console, profile=profile_normalized)
+    )
+    runtime_edit_config.setdefault("emit", True)
 
     private_model_loaded = False
     bare_target_model = None
@@ -992,7 +1300,9 @@ def _run_bare_control(
     elif skip_model_load:
        bare_target_model = model or SimpleNamespace(name="bare_stub_model")
     else:
-        bare_target_model = _load_model_with_cfg(adapter, cfg, resolved_device)
+        bare_target_model = _load_model_with_cfg(
+            adapter, cfg, resolved_device, profile=profile_normalized
+        )
        private_model_loaded = True
        if snapshot_provenance is not None:
            snapshot_provenance["reload_path_used"] = True
@@ -1005,7 +1315,7 @@ def _run_bare_control(
        config=bare_config,
        calibration_data=calibration_data,
        auto_config=auto_config,
-        edit_config=edit_config,
+        edit_config=runtime_edit_config,
        preview_n=preview_count,
        final_n=final_count,
    )
@@ -1029,8 +1339,12 @@ def _run_bare_control(
        return False
 
     if not (_finite(bare_ppl_preview) and _finite(bare_ppl_final)):
-        console.print(
-            "[yellow]⚠️ Primary metric non-finite during bare control; continuing with diagnostics.[/yellow]"
+        _event(
+            console,
+            "WARN",
+            "Primary metric non-finite during bare control; continuing with diagnostics.",
+            emoji="⚠️",
+            profile=profile_normalized,
        )
 
     payload: dict[str, Any] = {
@@ -1082,6 +1396,7 @@ def _execute_guarded_run(
     final_count: int,
     restore_fn: Any | None,
     resolved_device: str,
+    profile_normalized: str | None = None,
     console: Console,
     snapshot_provenance: dict[str, bool] | None = None,
     skip_model_load: bool = False,
@@ -1095,11 +1410,26 @@ def _execute_guarded_run(
     elif skip_model_load:
        model = model or SimpleNamespace(name="guarded_stub_model")
     else:
-        console.print(f"🔧 Loading model: {cfg.model.id} (attempt 1)")
-        model = _load_model_with_cfg(adapter, cfg, resolved_device)
+        _event(
+            console,
+            "INIT",
+            f"Loading model: {cfg.model.id} (attempt 1)",
+            emoji="🔧",
+            profile=profile_normalized,
+        )
+        model = _load_model_with_cfg(
+            adapter, cfg, resolved_device, profile=profile_normalized
+        )
        if snapshot_provenance is not None:
            snapshot_provenance["reload_path_used"] = True
 
+    runtime_edit_config = dict(edit_config or {})
+    runtime_edit_config.setdefault("console", console)
+    runtime_edit_config.setdefault(
+        "output_style", _style_from_console(console, profile=profile_normalized)
+    )
+    runtime_edit_config.setdefault("emit", True)
+
     core_report = runner.execute(
        model=model,
        adapter=adapter,
@@ -1108,7 +1438,7 @@ def _execute_guarded_run(
        config=run_config,
        calibration_data=calibration_data,
        auto_config=auto_config,
-        edit_config=edit_config,
+        edit_config=runtime_edit_config,
        preview_n=preview_count,
        final_n=final_count,
    )
@@ -1145,10 +1475,10 @@ def _postprocess_and_summarize(
     saved_files = _emit_run_artifacts(
        report=report, out_dir=run_dir, filename_prefix="report", console=console
    )
-    console.print("[green]✅ Run completed successfully![/green]")
-    console.print(f"📄 Report: {saved_files['json']}")
+    _event(console, "PASS", "Run completed successfully!", emoji="✅")
+    _event(console, "DATA", f"Report: {saved_files['json']}", emoji="📄")
     if run_config.event_path:
-        console.print(f"📝 Events: {run_config.event_path}")
+        _event(console, "DATA", f"Events: {run_config.event_path}", emoji="📝")
     return saved_files
 
 
@@ -1238,9 +1568,14 @@ def _validate_and_harvest_baseline_schedule(
        message = f"PAIRING-EVIDENCE-MISSING: {path}: {reason}"
        if prof in {"ci", "release"}:
            raise InvarlockError(code="E001", message=message)
-        _print(
-            f"[red]❌ Baseline pairing schedule '{path}' is incompatible: {reason}[/red]"
-        )
+        if console is not None:
+            _event(
+                console,
+                "FAIL",
+                f"Baseline pairing schedule '{path}' is incompatible: {reason}",
+                emoji="❌",
+                profile=prof,
+            )
        raise typer.Exit(1)
 
     baseline_meta = (
@@ -1297,7 +1632,7 @@ def _validate_and_harvest_baseline_schedule(
            _fail_schedule(f"{label} input_ids empty at index {idx}")
        seqs.append(seq_ints)
 
     # attention_masks are required for pairing, but legacy baselines may omit them.
     # When absent, default to all-ones masks (cannot infer padding reliably).
     masks_rows: list[list[int]] = []
     masks_missing = masks is None or masks == []
@@ -1395,9 +1730,14 @@ def _validate_and_harvest_baseline_schedule(
        prof = (profile or "dev").strip().lower()
        if prof in {"ci", "release"}:
            _fail_schedule("preview_hash mismatch vs baseline report data")
-        _print(
-            "[yellow]⚠️ Baseline preview_hash mismatch; continuing in dev profile.[/yellow]"
-        )
+        if console is not None:
+            _event(
+                console,
+                "WARN",
+                "Baseline preview_hash mismatch; continuing in dev profile.",
+                emoji="⚠️",
+                profile=prof,
+            )
     if (
        isinstance(baseline_final_hash, str)
        and baseline_final_hash
@@ -1406,9 +1746,14 @@ def _validate_and_harvest_baseline_schedule(
        prof = (profile or "dev").strip().lower()
        if prof in {"ci", "release"}:
            _fail_schedule("final_hash mismatch vs baseline report data")
-        _print(
-            "[yellow]⚠️ Baseline final_hash mismatch; continuing in dev profile.[/yellow]"
-        )
+        if console is not None:
+            _event(
+                console,
+                "WARN",
+                "Baseline final_hash mismatch; continuing in dev profile.",
+                emoji="⚠️",
+                profile=prof,
+            )
     if (
        isinstance(baseline_dataset_hash, str)
        and baseline_dataset_hash
@@ -1417,9 +1762,14 @@ def _validate_and_harvest_baseline_schedule(
        prof = (profile or "dev").strip().lower()
        if prof in {"ci", "release"}:
            _fail_schedule("dataset_hash mismatch vs baseline report data")
-        _print(
-            "[yellow]⚠️ Baseline dataset_hash mismatch; continuing in dev profile.[/yellow]"
-        )
+        if console is not None:
+            _event(
+                console,
+                "WARN",
+                "Baseline dataset_hash mismatch; continuing in dev profile.",
+                emoji="⚠️",
+                profile=prof,
+            )
     except InvarlockError:
        raise
     except typer.Exit:
@@ -1441,10 +1791,14 @@ def _validate_and_harvest_baseline_schedule(
        and baseline_final is not None
        and baseline_final != cfg_final
    ):
-        _print(
-            "[yellow]⚠️ Adjusting evaluation window counts to match baseline schedule "
-            f"({baseline_preview}/{baseline_final}).[/yellow]"
-        )
+        if console is not None:
+            _event(
+                console,
+                "WARN",
+                f"Adjusting evaluation window counts to match baseline schedule ({baseline_preview}/{baseline_final}).",
+                emoji="⚠️",
+                profile=profile,
+            )
 
        effective_preview = int(baseline_preview)
        effective_final = int(baseline_final)
@@ -1607,10 +1961,11 @@ def _resolve_metric_and_provider(
     model_profile: Any,
     *,
     resolved_loss_type: str | None = None,
+    metric_kind_override: str | None = None,
 ) -> tuple[str, str, dict[str, float]]:
     """Resolve metric kind, provider kind, and metric options from config with precedence.
 
-    Precedence: CLI args (not handled here) → config → ModelProfile defaults → legacy fallback.
+    Precedence: CLI args (not handled here) → config → ModelProfile defaults → fallback.
     Primary metric (metric‑v1) is canonical in dev‑phase; no env flag toggles.
     """
     # Provider kind
@@ -1646,9 +2001,13 @@ def _resolve_metric_and_provider(
        metric_cfg = None
 
     metric_kind = None
+    if isinstance(metric_kind_override, str) and metric_kind_override.strip():
+        mk_override = metric_kind_override.strip().lower()
+        if mk_override != "auto":
+            metric_kind = mk_override
     reps = None
     ci_level = None
-    if metric_cfg is not None:
+    if metric_kind is None and metric_cfg is not None:
        try:
            metric_kind = (
                metric_cfg.get("kind")
@@ -1684,11 +2043,11 @@ def _resolve_metric_and_provider(
     else:
        metric_kind = None
 
-    # Fallback to model profile default or legacy resolution by loss type
+    # Fallback to model profile default or loss-type mapping
     if not metric_kind and hasattr(model_profile, "default_metric"):
        metric_kind = model_profile.default_metric
     if not metric_kind:
-        # Legacy: map from loss kind
+        # Map from loss kind
        lk = (resolved_loss_type or "causal").lower()
        if lk == "mlm":
            metric_kind = "ppl_mlm"
@@ -1770,18 +2129,25 @@ def _plan_release_windows(
     candidate_msg = f", candidate_unique={int(candidate_unique)}" + (
        f"/{int(candidate_limit)}" if candidate_limit is not None else ""
    )
-    console.print(
-        "📏 Release window capacity:"
+    _event(
+        console,
+        "METRIC",
+        "Release window capacity:"
        f" unique={available_unique}, reserve={reserve_windows} "
        f"(calib {calibration_windows}, buffer {buffer_windows}), "
        f"usable={available_for_eval}, "
        f"per-arm raw={actual_per_arm_raw} → selected {actual_per_arm} "
-        f"(target {target_per_arm}{candidate_msg})"
+        f"(target {target_per_arm}{candidate_msg})",
+        emoji="📏",
+        profile="release",
    )
     if actual_per_arm < target_per_arm:
-        console.print(
-            "[yellow]⚠️ Adjusted per-arm windows down from "
-            f"{target_per_arm} to {actual_per_arm} based on capacity.[/yellow]"
+        _event(
+            console,
+            "WARN",
+            f"Adjusted per-arm windows down from {target_per_arm} to {actual_per_arm} based on capacity.",
+            emoji="⚠️",
+            profile="release",
        )
 
     plan = {
@@ -1832,15 +2198,30 @@ def run_command(
        None, "--device", help="Device override (auto|cuda|mps|cpu)"
    ),
     profile: str | None = typer.Option(
-        None, "--profile", help="Profile to apply (ci|release)"
+        None,
+        "--profile",
+        help="Profile to apply (e.g. ci, release, ci_cpu; dev is a no-op)",
    ),
     out: str | None = typer.Option(None, "--out", help="Output directory override"),
     edit: str | None = typer.Option(None, "--edit", help="Edit kind (quant|mixed)"),
+    edit_label: str | None = typer.Option(
+        None,
+        "--edit-label",
+        help=(
+            "Edit algorithm label for BYOE models. Use 'noop' for baseline, "
+            "'quant_rtn' etc. for built-in edits, 'custom' for pre-edited models."
+        ),
+    ),
     tier: str | None = typer.Option(
        None,
        "--tier",
        help="Auto-tuning tier override (conservative|balanced|aggressive)",
    ),
+    metric_kind: str | None = typer.Option(
+        None,
+        "--metric-kind",
+        help="Primary metric kind override (ppl_causal|ppl_mlm|accuracy|etc.)",
+    ),
     probes: int | None = typer.Option(
        None, "--probes", help="Number of micro-probes (0=deterministic, >0=adaptive)"
    ),
@@ -1861,6 +2242,19 @@ def run_command(
     no_cleanup: bool = typer.Option(
        False, "--no-cleanup", help="Skip cleanup of temporary artifacts"
    ),
+    style: str | None = typer.Option(
+        None, "--style", help="Output style (audit|friendly)"
+    ),
+    progress: bool = typer.Option(
+        False, "--progress", help="Show progress done messages"
+    ),
+    timing: bool = typer.Option(False, "--timing", help="Show timing summary"),
+    telemetry: bool = typer.Option(
+        False, "--telemetry", help="Write telemetry JSON alongside the report"
+    ),
+    no_color: bool = typer.Option(
+        False, "--no-color", help="Disable ANSI colors (respects NO_COLOR=1)"
+    ),
 ):
     """
     Run InvarLock pipeline with the given configuration.
@@ -1879,24 +2273,57 @@ def run_command(
     config = _coerce_option(config)
     device = _coerce_option(device)
     profile = _coerce_option(profile)
+    profile_normalized = (str(profile or "")).strip().lower()
     out = _coerce_option(out)
     edit = _coerce_option(edit)
+    edit_label = _coerce_option(edit_label)
     tier = _coerce_option(tier)
+    metric_kind = _coerce_option(metric_kind)
     probes = _coerce_option(probes)
     until_pass = bool(_coerce_option(until_pass, False))
     max_attempts = int(_coerce_option(max_attempts, 3))
     timeout = _coerce_option(timeout)
     baseline = _coerce_option(baseline)
     no_cleanup = bool(_coerce_option(no_cleanup, False))
+    style = _coerce_option(style)
+    progress = bool(_coerce_option(progress, False))
+    timing = bool(_coerce_option(timing, False))
+    telemetry = bool(_coerce_option(telemetry, False))
+    no_color = bool(_coerce_option(no_color, False))
+
+    output_style = resolve_output_style(
+        style=str(style) if style is not None else None,
+        profile=profile_normalized,
+        progress=progress,
+        timing=timing,
+        no_color=no_color,
+    )
+    console._invarlock_output_style = output_style
+    if not output_style.color:
+        console.no_color = True
+    timings: dict[str, float] = {}
+    collect_timings = bool(output_style.timing or telemetry)
+    total_start: float | None = perf_counter() if collect_timings else None
+
+    _apply_warning_filters(profile_normalized)
 
     # Use shared CLI coercers from invarlock.cli.utils
     report_path_out: str | None = None
 
     def _fail_run(message: str) -> None:
-        console.print(f"[red]❌ {message}[/red]")
+        _event(console, "FAIL", message, emoji="❌", profile=profile_normalized)
        # Generic failure path → exit 1 (InvarlockError paths handle code 3 separately)
        raise typer.Exit(1)
 
+    def _provider_event(tag: str, message: str, emoji: str | None = None) -> None:
+        _event(
+            console,
+            tag,
+            message,
+            emoji=emoji,
+            profile=profile_normalized,
+        )
+
     # Fail fast when torch is missing so users see a clear extras hint instead of
     # a raw ModuleNotFoundError from deeper imports.
     try:
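
`resolve_output_style` lives in the new `invarlock/cli/output.py` (+193 lines, not shown in this diff). From the call sites here it returns an object exposing at least `color` and `timing` flags, resolved from an explicit `--style`, the active profile, and `--no-color`/`NO_COLOR`. A rough sketch of that contract, with the field set and precedence inferred from usage rather than taken from the actual implementation:

    import os
    from dataclasses import dataclass

    @dataclass(frozen=True)
    class OutputStyle:
        name: str        # e.g. "audit" or "friendly"
        color: bool
        progress: bool
        timing: bool

    def resolve_output_style(style, profile, progress, timing, no_color):
        # Assumption: explicit --style wins; CI/release profiles default to audit output.
        name = style or ("audit" if profile in {"ci", "ci_cpu", "release"} else "friendly")
        color = not (no_color or os.environ.get("NO_COLOR") == "1")
        return OutputStyle(name=name, color=color, progress=progress, timing=timing)

    style = resolve_output_style(None, "release", progress=False, timing=True, no_color=False)
    print(style)  # OutputStyle(name='audit', color=True, progress=False, timing=True)

Stashing the resolved style on the console (`console._invarlock_output_style`) is what lets helpers such as `_style_from_console` recover it later without threading the object through every call.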
@@ -1904,12 +2331,14 @@ def run_command(
 
        _ = _torch  # pragma: no cover
     except (ImportError, ModuleNotFoundError) as e:
-        console.print(
-            "❌ Torch is required for this command. "
+        _event(
+            console,
+            "FAIL",
+            "Torch is required for this command. "
            'Install extras with: pip install "invarlock[hf]" '
            'or "invarlock[adapters]".',
-            style="red",
-            markup=False,
+            emoji="",
+            profile=profile_normalized,
        )
        raise typer.Exit(1) from e
 
@@ -1987,7 +2416,7 @@ def run_command(
        seed_value = 42
     set_seed(seed_value)
     # Enforce deterministic algorithms in CI/Release profiles when torch is available
-    profile_label = (str(profile or "").lower()) if profile else None
+    profile_label = profile_normalized or None
     if torch is not None and profile_label in {"ci", "release"}:
        try:  # pragma: no cover - behavior depends on torch availability
            if hasattr(torch, "use_deterministic_algorithms"):
@@ -2016,10 +2445,14 @@ def run_command(
        "numpy": int(numpy_seed),
        "torch": int(torch_seed) if torch_seed is not None else None,
    }
-    console.print(
-        "🎲 Deterministic seeds → "
+    _event(
+        console,
+        "INIT",
+        "Deterministic seeds → "
        f"python={seed_bundle['python']}, numpy={seed_bundle['numpy']}, "
-        f"torch={seed_bundle['torch'] if seed_bundle['torch'] is not None else 'N/A'}"
+        f"torch={seed_bundle['torch'] if seed_bundle['torch'] is not None else 'N/A'}",
+        emoji="🎲",
+        profile=profile_normalized,
    )
 
     # Resolve device and output directory
@@ -2054,8 +2487,8 @@ def run_command(
 
     run_id = f"{output_dir.name}-{timestamp}" if output_dir.name else timestamp
 
-    console.print(f"📁 Output directory: {run_dir}")
-    console.print(f"🆔 Run ID: {run_id}")
+    console.print(_format_kv_line("Output", str(run_dir)))
+    console.print(_format_kv_line("Run ID", run_id))
 
     # Initialize retry controller if --until-pass mode enabled
     retry_controller = _init_retry_controller(
@@ -2070,7 +2503,6 @@ def run_command(
     pairing_schedule: dict[str, Any] | None = None
     if baseline:
        baseline_path = Path(baseline)
-        profile_normalized = (profile or "").strip().lower()
        strict_baseline = profile_normalized in {"ci", "release"}
        if not baseline_path.exists():
            msg = (
@@ -2079,8 +2511,12 @@ def run_command(
            )
            if strict_baseline:
                raise InvarlockError(code="E001", message=msg)
-            console.print(
-                f"[yellow]⚠️ {msg}. Falling back to dataset schedule.[/yellow]"
+            _event(
+                console,
+                "WARN",
+                f"{msg}. Falling back to dataset schedule.",
+                emoji="⚠️",
+                profile=profile_normalized,
            )
        else:
            try:
@@ -2090,8 +2526,12 @@ def run_command(
                msg = f"PAIRING-EVIDENCE-MISSING: baseline report JSON parse failed ({exc})"
                if strict_baseline:
                    raise InvarlockError(code="E001", message=msg) from exc
-                console.print(
-                    f"[yellow]⚠️ {msg}. Falling back to dataset schedule.[/yellow]"
+                _event(
+                    console,
+                    "WARN",
+                    f"{msg}. Falling back to dataset schedule.",
+                    emoji="⚠️",
+                    profile=profile_normalized,
                )
                baseline_report_data = None
            if isinstance(baseline_report_data, dict):
@@ -2099,11 +2539,28 @@ def run_command(
                if pairing_schedule:
                    # Normalize baseline report in-memory so downstream digest/parity
                    # computations see a consistent window_id + mask shape even for
-                    # legacy baselines missing some fields.
+                    # baselines missing some fields.
                    try:
-                        baseline_report_data["evaluation_windows"] = (
-                            pairing_schedule
-                        )
+                        ew = baseline_report_data.get("evaluation_windows")
+                        if not isinstance(ew, dict):
+                            ew = {}
+                        baseline_report_data["evaluation_windows"] = ew
+                        # Merge the sanitized pairing schedule into existing
+                        # evaluation_windows without discarding logloss/token_counts.
+                        for arm in ("preview", "final"):
+                            src = (
+                                pairing_schedule.get(arm)
+                                if isinstance(pairing_schedule, dict)
+                                else None
+                            )
+                            if not isinstance(src, dict):
+                                continue
+                            dst = ew.get(arm)
+                            if not isinstance(dst, dict):
+                                ew[arm] = dict(src)
+                                continue
+                            for key, value in src.items():
+                                dst[key] = value
                    except Exception:
                        pass
                # Harvest tokenizer hash provenance from baseline when present.
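
The per-key merge replaces 0.3.5's wholesale assignment so that sanitized pairing keys overwrite stale entries while any per-window `logloss`/`token_counts` already present in the baseline report survive. Illustrated on toy data (the values are made up):

    ew = {"final": {"window_ids": [0, 1], "logloss": [2.31, 2.28]}}
    schedule = {"final": {"window_ids": [0, 1], "input_ids": [[101], [102]],
                          "attention_masks": [[1], [1]]}}

    for arm in ("preview", "final"):
        src = schedule.get(arm)
        if not isinstance(src, dict):
            continue
        dst = ew.get(arm)
        if not isinstance(dst, dict):
            ew[arm] = dict(src)
            continue
        dst.update(src)  # overwrite pairing keys, keep logloss/token_counts

    print(sorted(ew["final"]))  # ['attention_masks', 'input_ids', 'logloss', 'window_ids']

Under the old assignment the `logloss` column would have been dropped, which is exactly the evidence the paired-tail machinery added in this release needs.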
@@ -2132,8 +2589,12 @@ def run_command(
                        tokenizer_hash = tok
                except Exception:
                    pass
-                console.print(
-                    "🧬 Loaded baseline evaluation schedule for pairing"
+                _event(
+                    console,
+                    "DATA",
+                    "Loaded baseline evaluation schedule for pairing",
+                    emoji="🧬",
+                    profile=profile_normalized,
                )
            else:
                msg = (
@@ -2142,8 +2603,12 @@ def run_command(
                )
                if strict_baseline:
                    raise InvarlockError(code="E001", message=msg)
-                console.print(
-                    f"[yellow]⚠️ {msg}. Falling back to dataset schedule.[/yellow]"
+                _event(
+                    console,
+                    "WARN",
+                    f"{msg}. Falling back to dataset schedule.",
+                    emoji="⚠️",
+                    profile=profile_normalized,
                )
                baseline_report_data = None
                pairing_schedule = None
@@ -2169,15 +2634,23 @@ def run_command(
     adapter = registry.get_adapter(cfg.model.adapter)
     edit_name = getattr(getattr(cfg, "edit", None), "name", None)
     if not isinstance(edit_name, str) or not edit_name.strip():
-        console.print(
-            "[red]❌ Edit configuration must specify a non-empty `edit.name`.[/red]"
+        _event(
+            console,
+            "FAIL",
+            "Edit configuration must specify a non-empty `edit.name`.",
+            emoji="❌",
+            profile=profile_normalized,
        )
        raise typer.Exit(1)
     try:
        edit_op = registry.get_edit(edit_name.strip())
     except Exception:
-        console.print(
-            f"[yellow]⚠️ Unknown edit '{edit_name.strip()}'. Using pass-through shim.[/yellow]"
+        _event(
+            console,
+            "WARN",
+            f"Unknown edit '{edit_name.strip()}'. Using pass-through shim.",
+            emoji="⚠️",
+            profile=profile_normalized,
        )
        edit_op = SimpleNamespace(name=edit_name.strip())
 
@@ -2213,8 +2686,12 @@ def run_command(
                    registry.get_plugin_metadata(guard_name, "guards")
                )
            except KeyError:
-                console.print(
-                    f"[yellow]⚠️ Guard '{guard_name}' not found, skipping[/yellow]"
+                _event(
+                    console,
+                    "WARN",
+                    f"Guard '{guard_name}' not found, skipping",
+                    emoji="⚠️",
+                    profile=profile_normalized,
                )
     plugin_provenance = {
        "adapter": adapter_meta,
@@ -2222,54 +2699,22 @@ def run_command(
        "guards": guard_metadata,
    }
     pm_acceptance_range = _resolve_pm_acceptance_range(cfg)
-
-    console.print(f"🔌 Adapter: {adapter.name}")
+    pm_drift_band = _resolve_pm_drift_band(cfg)
+
+    _event(
+        console,
+        "DATA",
+        f"Adapter: {adapter.name}",
+        emoji="🔌",
+        profile=profile_normalized,
+    )
 
     # Create run configuration
-    def _to_serialisable_dict(section: object) -> dict[str, Any]:
-        """Coerce config fragments to plain dicts.
-
-        Handles InvarLockConfig sections (which wrap dicts in a private `_Obj` with
-        `_data`) so downstream components (core.runner) see canonical mappings,
-        e.g. `eval.bootstrap.replicates`.
-        """
-        # Prefer native dump methods
-        if hasattr(section, "model_dump"):
-            return section.model_dump()  # type: ignore[return-value]
-        if hasattr(section, "dict"):
-            try:
-                return section.dict()  # type: ignore[return-value]
-            except Exception:
-                pass
-        # Unwrap CLI _Obj wrapper used by InvarLockConfig for attribute access
-        try:
-            raw = getattr(section, "_data", None)
-            if isinstance(raw, dict):
-                return raw
-        except Exception:
-            pass
-        # Already a mapping
-        if isinstance(section, dict):
-            return section
-        # Best-effort attribute dump
-        try:
-            data = vars(section)
-            # Common case: {'_data': {...}}
-            if isinstance(data, dict) and isinstance(data.get("_data"), dict):
-                return data["_data"]
-            return data  # type: ignore[return-value]
-        except TypeError:
-            return {}
-
-    def _dump_guard(section: object) -> dict[str, Any]:
-        data = _to_serialisable_dict(section)
-        return data if isinstance(data, dict) else {}
-
     guard_overrides = {
-        "spectral": _dump_guard(getattr(cfg.guards, "spectral", {})),
-        "rmt": _dump_guard(getattr(cfg.guards, "rmt", {})),
-        "variance": _dump_guard(getattr(cfg.guards, "variance", {})),
-        "invariants": _dump_guard(getattr(cfg.guards, "invariants", {})),
+        "spectral": _to_serialisable_dict(getattr(cfg.guards, "spectral", {})),
+        "rmt": _to_serialisable_dict(getattr(cfg.guards, "rmt", {})),
+        "variance": _to_serialisable_dict(getattr(cfg.guards, "variance", {})),
+        "invariants": _to_serialisable_dict(getattr(cfg.guards, "invariants", {})),
    }
 
     if model_profile.invariants:
@@ -2297,10 +2742,38 @@ def run_command(
        "plugins": plugin_provenance,
        "run_id": run_id,
    }
+    # Provide baseline per-window logloss to the CoreRunner for paired tail
+    # evidence and (optionally) fail/rollback enforcement.
+    try:
+        if isinstance(baseline_report_data, dict):
+            ew = baseline_report_data.get("evaluation_windows")
+            if isinstance(ew, dict):
+                final = ew.get("final")
+                if (
+                    isinstance(final, dict)
+                    and isinstance(final.get("window_ids"), list)
+                    and isinstance(final.get("logloss"), list)
+                ):
+                    base_eval: dict[str, Any] = {
+                        "final": {
+                            "window_ids": list(final.get("window_ids") or []),
+                            "logloss": list(final.get("logloss") or []),
+                        }
+                    }
+                    if isinstance(final.get("token_counts"), list):
+                        base_eval["final"]["token_counts"] = list(
+                            final.get("token_counts") or []
+                        )
+                    run_context["baseline_eval_windows"] = base_eval
+    except Exception:
+        pass
     run_context.setdefault("primary_metric", {})["acceptance_range"] = (
        pm_acceptance_range
    )
     run_context["pm_acceptance_range"] = pm_acceptance_range
+    if pm_drift_band:
+        run_context.setdefault("primary_metric", {})["drift_band"] = pm_drift_band
+        run_context["pm_drift_band"] = pm_drift_band
     run_context["model_profile"] = {
        "family": model_profile.family,
        "default_loss": model_profile.default_loss,
@@ -2331,6 +2804,7 @@ def run_command(
     dataset_meta: dict[str, Any] = {}
     baseline_meta: dict[str, Any] = {}
     window_plan: dict[str, Any] | None = None
+    dataset_timing_start: float | None = perf_counter() if collect_timings else None
     if pairing_schedule:
        harvested = _validate_and_harvest_baseline_schedule(
            cfg,
@@ -2353,7 +2827,7 @@ def run_command(
        try:
            tokenizer, tokenizer_hash = resolve_tokenizer(model_profile)
        except Exception as exc:
-            console.print(f"[red]{exc}[/red]")
+            _event(console, "FAIL", str(exc), emoji="❌", profile=profile)
            raise typer.Exit(1) from exc
        preview_window_ids = pairing_schedule["preview"].get("window_ids")
        preview_labels = pairing_schedule["preview"].get("labels")
@@ -2575,7 +3049,13 @@ def run_command(
        if capacity_meta and "window_capacity" not in dataset_meta:
            dataset_meta["window_capacity"] = capacity_meta
     elif cfg.dataset.provider:
-        console.print(f"📊 Loading dataset: {cfg.dataset.provider}")
+        _event(
+            console,
+            "DATA",
+            f"Loading dataset: {cfg.dataset.provider}",
+            emoji="📊",
+            profile=profile_normalized,
+        )
        # Pass through provider-specific kwargs when available
        provider_kwargs = {}
        for key in (
@@ -2635,6 +3115,7 @@ def run_command(
                provider_kwargs=provider_kwargs,
                console=console,
                resolved_device=resolved_device,
+                emit=_provider_event,
            )
        )
 
@@ -2642,7 +3123,7 @@ def run_command(
        try:
            tokenizer, tokenizer_hash = resolve_tokenizer(model_profile)
        except Exception as exc:
-            console.print(f"[red]{exc}[/red]")
+            _event(console, "FAIL", str(exc), emoji="❌", profile=profile)
            raise typer.Exit(1) from exc
 
        dataset_stride = getattr(
@@ -2676,7 +3157,7 @@ def run_command(
                    console=console,
                )
            except RuntimeError as err:
-                console.print(f"[red]{err}[/red]")
+                _event(console, "FAIL", str(err), emoji="❌", profile=profile)
                raise typer.Exit(1) from err
 
            actual_per_arm = int(window_plan["actual_preview"])
@@ -2688,9 +3169,12 @@ def run_command(
                cfg.dataset, "stride", getattr(cfg.dataset, "seq_len", 0)
            )
        else:
-            console.print(
-                "[yellow]⚠️ Release profile requested but dataset provider "
-                "does not expose capacity estimation; using configured window counts.[/yellow]"
+            _event(
+                console,
+                "WARN",
+                "Release profile requested but dataset provider does not expose capacity estimation; using configured window counts.",
+                emoji="⚠️",
+                profile=profile_normalized,
            )
 
        preview_records: list[tuple[list[int], list[int]]] = []
@@ -2894,8 +3378,12 @@ def run_command(
                    raise RuntimeError(
                        "Unable to construct non-overlapping windows within minimum window floor."
                    )
-                console.print(
-                    f"[yellow]⚠️ Detected {deficit} duplicate windows; reducing per-arm windows to {proposed_per_arm} and retrying stratification.[/yellow]"
+                _event(
+                    console,
+                    "WARN",
+                    f"Detected {deficit} duplicate windows; reducing per-arm windows to {proposed_per_arm} and retrying stratification.",
+                    emoji="⚠️",
+                    profile=profile_normalized,
                )
 
                effective_preview = proposed_per_arm
@@ -3037,6 +3525,10 @@ def run_command(
     run_context["dataset_meta"] = dataset_meta
     if window_plan:
        run_context["window_plan"] = window_plan
+    if dataset_timing_start is not None:
+        timings["load_dataset"] = max(
+            0.0, float(perf_counter() - dataset_timing_start)
+        )
 
     if os.environ.get("INVARLOCK_DEBUG_TRACE"):
        console.print(
@@ -3060,7 +3552,13 @@ def run_command(
    )
 
     # Execute the real pipeline using CoreRunner
-    console.print(f"⚙️ Executing pipeline with {len(guards)} guards...")
+    _event(
+        console,
+        "EXEC",
+        f"Executing pipeline with {len(guards)} guards...",
+        emoji="⚙️",
+        profile=profile_normalized,
+    )
     runner = CoreRunner()
 
     # Prepare auto configuration for tier resolution
@@ -3125,8 +3623,8 @@ def run_command(
        for key, values in model_profile.module_selectors.items()
    }
 
-    console.print(f"✂️ Edit: {edit_op.name}")
-    console.print(f"🛡️ Guards: {[g.name for g in guards]}")
+    console.print(_format_kv_line("Edit", str(edit_op.name)))
+    console.print(_format_kv_line("Guards", _format_guard_chain(guards)))
 
     # Model load/snapshot strategy
     model = None
@@ -3140,8 +3638,25 @@ def run_command(
     # Try single-load with snapshot/restore if adapter supports it; fallback to reload per attempt
     try:
        # Load once
-        console.print(f"🔧 Loading model once: {cfg.model.id}")
-        model = _load_model_with_cfg(adapter, cfg, resolved_device)
+        _event(
+            console,
+            "INIT",
+            f"Loading model once: {cfg.model.id}",
+            emoji="🔧",
+            profile=profile_normalized,
+        )
+        with timed_step(
+            console=console,
+            style=_style_from_console(console, profile=profile_normalized),
+            timings=timings,
+            key="load_model",
+            tag="INIT",
+            message="Load model",
+            emoji="🔧",
+        ):
+            model = _load_model_with_cfg(
+                adapter, cfg, resolved_device, profile=profile_normalized
+            )
 
        # No edit-specific bootstrap logic
 
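`timed_step` (also from the new `invarlock/cli/output.py`) wraps a phase, records its wall-clock duration under `timings[key]`, and emits a done line when timing output is enabled. A minimal stand-in with a reduced signature; the real helper additionally takes the console, style, tag, and emoji arguments seen above, and its output formatting is not shown in this diff:

    import time
    from collections.abc import Iterator
    from contextlib import contextmanager

    @contextmanager
    def timed_step(timings: dict[str, float], key: str, message: str) -> Iterator[None]:
        start = time.perf_counter()
        try:
            yield
        finally:
            # Record even if the wrapped step raises, so partial runs keep timings.
            timings[key] = time.perf_counter() - start
            print(f"[TIME] {message}: {timings[key]:.3f}s")

    timings: dict[str, float] = {}
    with timed_step(timings, "load_model", "Load model"):
        time.sleep(0.01)  # stand-in for the actual model load
    print(timings["load_model"] > 0)  # True

Collecting durations into a dict rather than printing immediately is what lets the `--telemetry` flag serialize the same numbers into the telemetry JSON next to the report.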
@@ -3297,9 +3812,13 @@ def run_command(
  return "reload"

  mode = _choose_snapshot_mode()
- # Emit deterministic snapshot mode status line
- console.print(
- f"snapshot_mode: {'enabled' if mode in {'bytes', 'chunked'} else 'disabled'}"
+ enabled = mode in {"bytes", "chunked"}
+ _event(
+ console,
+ "INIT",
+ f"Snapshot mode: {'enabled' if enabled else 'disabled'}",
+ emoji="💾",
+ profile=profile_normalized,
  )
  if mode == "chunked":
  snapshot_tmpdir = adapter.snapshot_chunked(model) # type: ignore[attr-defined]
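
For context, the snapshot mode decides how model state is recovered between
retry attempts. "bytes" keeps an in-memory snapshot, "chunked" spills it to a
temporary directory via adapter.snapshot_chunked, and "reload" reloads the
model each attempt. A hedged sketch of that decision; the selection criteria
here are assumptions, not the shipped _choose_snapshot_mode:

def _choose_snapshot_mode_sketch(adapter: object, low_memory: bool) -> str:
    can_snapshot = hasattr(adapter, "snapshot") or hasattr(adapter, "snapshot_chunked")
    if not can_snapshot:
        return "reload"  # adapter cannot snapshot: reload the model per attempt
    return "chunked" if low_memory else "bytes"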
@@ -3342,13 +3861,16 @@ def run_command(

  # RETRY LOOP - All report processing inside loop
  attempt = 1
- profile_normalized = (profile or "").lower()
  measure_guard_overhead, skip_overhead = _should_measure_overhead(
  profile_normalized
  )
  if skip_overhead and profile_normalized in {"ci", "release"}:
- console.print(
- "[yellow]⚠️ Overhead check skipped via INVARLOCK_SKIP_OVERHEAD_CHECK[/yellow]"
+ _event(
+ console,
+ "WARN",
+ "Overhead check skipped via INVARLOCK_SKIP_OVERHEAD_CHECK",
+ emoji="⚠️",
+ profile=profile_normalized,
  )

  while True:
@@ -3356,12 +3878,32 @@ def run_command(
  set_seed(seed_bundle["python"])

  if retry_controller:
- console.print(f"\n🚀 Attempt {attempt}/{max_attempts}")
+ console.print("\n")
+ _event(
+ console,
+ "EXEC",
+ f"Attempt {attempt}/{max_attempts}",
+ emoji="🚀",
+ profile=profile_normalized,
+ )
  if attempt > 1:
- console.print(f"🔄 Retry attempt {attempt}/{max_attempts}")
+ _event(
+ console,
+ "EXEC",
+ f"Retry attempt {attempt}/{max_attempts}",
+ emoji="🔄",
+ profile=profile_normalized,
+ )
  else:
  if attempt > 1:
- console.print(f"\n🚀 Attempt {attempt}")
+ console.print("\n")
+ _event(
+ console,
+ "EXEC",
+ f"Attempt {attempt}",
+ emoji="🚀",
+ profile=profile_normalized,
+ )

  # Adjust parameters for retry attempts
  if retry_controller and attempt > 1:
@@ -3390,6 +3932,8 @@ def run_command(
  "checks": {},
  }
  elif measure_guard_overhead:
+ bare_edit_config = dict(edit_config or {})
+ bare_edit_config["emit"] = False
  guard_overhead_payload = _run_bare_control(
  adapter=adapter,
  edit_op=edit_op,
@@ -3398,7 +3942,7 @@ def run_command(
  run_config=run_config,
  calibration_data=calibration_data,
  auto_config=auto_config,
- edit_config=edit_config,
+ edit_config=bare_edit_config,
  preview_count=preview_count,
  final_count=final_count,
  seed_bundle=seed_bundle,
@@ -3412,34 +3956,53 @@ def run_command(
  )

  # Ensure clean state for guarded run
- core_report, model = _execute_guarded_run(
- runner=runner,
- adapter=adapter,
- model=model,
- cfg=cfg,
- edit_op=edit_op,
- run_config=run_config,
- guards=guards,
- calibration_data=calibration_data,
- auto_config=auto_config,
- edit_config=edit_config,
- preview_count=preview_count,
- final_count=final_count,
- restore_fn=restore_fn,
- resolved_device=resolved_device,
+ with timed_step(
  console=console,
- snapshot_provenance=snapshot_provenance,
- skip_model_load=skip_model_load,
- )
+ style=_style_from_console(console, profile=profile_normalized),
+ timings=timings,
+ key="execute",
+ tag="EXEC",
+ message="Execute pipeline",
+ emoji="⚙️",
+ ):
+ core_report, model = _execute_guarded_run(
+ runner=runner,
+ adapter=adapter,
+ model=model,
+ cfg=cfg,
+ edit_op=edit_op,
+ run_config=run_config,
+ guards=guards,
+ calibration_data=calibration_data,
+ auto_config=auto_config,
+ edit_config=edit_config,
+ preview_count=preview_count,
+ final_count=final_count,
+ restore_fn=restore_fn,
+ resolved_device=resolved_device,
+ profile_normalized=profile_normalized,
+ console=console,
+ snapshot_provenance=snapshot_provenance,
+ skip_model_load=skip_model_load,
+ )
  except _SnapshotRestoreFailed as exc:
  snapshot_provenance["restore_failed"] = True
  _free_model_memory(model)
  model = None
  restore_fn = None
- console.print(
- "[yellow]⚠️ Snapshot restore failed; switching to reload-per-attempt.[/yellow]"
+ _event(
+ console,
+ "WARN",
+ "Snapshot restore failed; switching to reload-per-attempt.",
+ emoji="⚠️",
+ profile=profile_normalized,
+ )
+ _event(
+ console,
+ "WARN",
+ f"↳ {exc}",
+ profile=profile_normalized,
  )
- console.print(f"[yellow]↳ {exc}[/yellow]")
  if retry_controller:
  retry_controller.record_attempt(
  attempt,
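
The bare control run above re-runs the edit without guards (and with the edit
config's emit flag forced off) so guard overhead can be charged against a
like-for-like baseline. A hedged sketch of the comparison that the overhead
gate presumably applies later; the function name is illustrative:

def guard_overhead_percent(bare_pm: float, guarded_pm: float) -> float:
    """Relative primary-metric overhead attributable to guards, in percent."""
    if bare_pm <= 0:
        raise ValueError("bare primary metric must be positive")
    return (guarded_pm - bare_pm) / bare_pm * 100.0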
@@ -3461,6 +4024,16 @@ def run_command(
  # Convert CoreRunner report to evaluation report
  report = create_empty_report()

+ # Persist minimal run context for certificate/report provenance.
+ try:
+ report["context"] = {
+ "profile": profile_normalized,
+ "auto": dict(auto_config),
+ "assurance": dict(run_context.get("assurance") or {}),
+ }
+ except Exception:
+ pass
+
  # Code provenance: commit hash and InvarLock version
  commit_value = (
  getattr(cfg.meta, "commit", "") if hasattr(cfg, "meta") else ""
@@ -3561,6 +4134,8 @@ def run_command(
  report["meta"].update(meta_payload)
  if pm_acceptance_range:
  report["meta"]["pm_acceptance_range"] = pm_acceptance_range
+ if pm_drift_band:
+ report["meta"]["pm_drift_band"] = pm_drift_band
  report["meta"]["model_profile"] = {
  "family": model_profile.family,
  "default_loss": model_profile.default_loss,
@@ -3644,6 +4219,14 @@ def run_command(
  }
  )

+ if edit_label:
+ report.setdefault("edit", {})
+ report["edit"]["name"] = edit_label
+ report["edit"]["algorithm"] = edit_label
+ if isinstance(core_report.context, dict):
+ core_report.context.setdefault("edit", {})
+ core_report.context["edit"]["name"] = edit_label
+
  mask_artifact_path = _persist_ref_masks(core_report, run_dir)
  if mask_artifact_path:
  report.setdefault("artifacts", {})
@@ -3651,6 +4234,22 @@ def run_command(

  # Transfer metrics (PM-only: do not write ppl_* fields)
  if hasattr(core_report, "metrics") and core_report.metrics:
+ if isinstance(core_report.metrics, dict):
+ core_timings = core_report.metrics.get("timings")
+ if isinstance(core_timings, dict):
+ for key in (
+ "prepare",
+ "prepare_guards",
+ "edit",
+ "guards",
+ "eval",
+ "finalize",
+ ):
+ if key in core_timings:
+ try:
+ timings[key] = float(core_timings[key])
+ except Exception:
+ timings[key] = core_timings[key]
  metrics_payload = {
  "latency_ms_per_tok": core_report.metrics.get(
  "latency_ms_per_tok", 0.0
@@ -3696,11 +4295,17 @@ def run_command(
  "window_pairing_final",
  "paired_windows",
  "paired_delta_summary",
+ "primary_metric_tail",
  "preview_total_tokens",
  "final_total_tokens",
  "masked_tokens_total",
  "masked_tokens_preview",
  "masked_tokens_final",
+ "timings",
+ "guard_timings",
+ "memory_snapshots",
+ "gpu_memory_mb_peak",
+ "gpu_memory_reserved_mb_peak",
  "reduction",
  ]
  for key in optional_keys:
@@ -3864,8 +4469,12 @@ def run_command(
  },
  }
  elif had_baseline and (profile or "").lower() in {"ci", "release"}:
- console.print(
- "[red]❌ [INVARLOCK:E001] PAIRING-SCHEDULE-MISMATCH: baseline pairing requested but evaluation windows were not produced. Check capacity/pairing config.[/red]"
+ _event(
+ console,
+ "FAIL",
+ "[INVARLOCK:E001] PAIRING-SCHEDULE-MISMATCH: baseline pairing requested but evaluation windows were not produced. Check capacity/pairing config.",
+ emoji="❌",
+ profile=profile_normalized,
  )
  raise typer.Exit(3)
  else:
@@ -4076,12 +4685,20 @@ def run_command(
  if ok:
  report["artifacts"]["checkpoint_path"] = str(export_dir)
  else:
- console.print(
- "[yellow]⚠️ Model export requested but adapter did not save a HF directory.[/yellow]"
+ _event(
+ console,
+ "WARN",
+ "Model export requested but adapter did not save a HF directory.",
+ emoji="⚠️",
+ profile=profile_normalized,
  )
  except Exception:
- console.print(
- "[yellow]⚠️ Model export requested but failed due to an unexpected error.[/yellow]"
+ _event(
+ console,
+ "WARN",
+ "Model export requested but failed due to an unexpected error.",
+ emoji="⚠️",
+ profile=profile_normalized,
  )

  # Set flags
@@ -4302,7 +4919,10 @@ def run_command(
  try:
  metric_kind_resolved, _provider_kind, metric_opts = (
  _resolve_metric_and_provider(
- cfg, model_profile, resolved_loss_type=resolved_loss_type
+ cfg,
+ model_profile,
+ resolved_loss_type=resolved_loss_type,
+ metric_kind_override=metric_kind,
  )
  )
  if metric_kind_resolved:
@@ -4313,6 +4933,12 @@ def run_command(
  pm = compute_primary_metric_from_report(
  report, kind=metric_kind_resolved, baseline=baseline_report_data
  )
+ core_primary_metric = None
+ if hasattr(core_report, "metrics") and isinstance(
+ core_report.metrics, dict
+ ):
+ core_primary_metric = core_report.metrics.get("primary_metric")
+ pm = _merge_primary_metric_health(pm, core_primary_metric)
  report.setdefault("metrics", {})["primary_metric"] = pm
  # Attach configured reps/ci_level when provided
  if metric_opts:
@@ -4327,7 +4953,7 @@ def run_command(
  ) # type: ignore[index]
  except Exception:
  pass
- # Shadow parity check against legacy ppl fields (best-effort)
+ # Shadow parity check against ppl_* fields (best-effort)
  try:
  pm_blk = report.get("metrics", {}).get("primary_metric", {})
  ppl_final_v1 = float(pm_blk.get("final"))
@@ -4375,6 +5001,13 @@ def run_command(
  except Exception:
  pass

+ telemetry_path: Path | None = None
+ if telemetry:
+ telemetry_path = run_dir / "telemetry.json"
+ report.setdefault("artifacts", {})["telemetry_path"] = str(
+ telemetry_path
+ )
+
  saved_files = _postprocess_and_summarize(
  report=report,
  run_dir=run_dir,
@@ -4391,6 +5024,31 @@ def run_command(
  except Exception:
  pass

+ if telemetry and telemetry_path is not None:
+ try:
+ from invarlock.reporting.telemetry import save_telemetry_report
+
+ saved_path = save_telemetry_report(
+ report, run_dir, filename=telemetry_path.name
+ )
+ if isinstance(saved_files, dict):
+ saved_files["telemetry"] = str(saved_path)
+ _event(
+ console,
+ "DATA",
+ f"Telemetry: {saved_path}",
+ emoji="📈",
+ profile=profile_normalized,
+ )
+ except Exception as exc: # pragma: no cover - best-effort
+ _event(
+ console,
+ "WARN",
+ f"Telemetry export failed: {exc}",
+ emoji="⚠️",
+ profile=profile_normalized,
+ )
+
  # Metrics display
  pm_obj = None
  try:
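
When telemetry is enabled, the run pre-registers runs/<id>/telemetry.json in
report["artifacts"] and then writes it through save_telemetry_report once
post-processing is done. A usage sketch with an illustrative report dict;
only the call shape is taken from this diff:

from pathlib import Path
from invarlock.reporting.telemetry import save_telemetry_report

report = {"metrics": {"timings": {"load_model": 1.2}}}  # illustrative shape
run_dir = Path("runs/example")
run_dir.mkdir(parents=True, exist_ok=True)
saved = save_telemetry_report(report, run_dir, filename="telemetry.json")
print(saved)  # path of the written telemetry file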
@@ -4405,15 +5063,23 @@ def run_command(
  if isinstance(pm_prev, (int | float)) and isinstance(
  pm_fin, (int | float)
  ):
- console.print(
- f"📌 Primary Metric [{pm_kind}] — preview: {pm_prev:.3f}, final: {pm_fin:.3f}"
+ _event(
+ console,
+ "METRIC",
+ f"Primary Metric [{pm_kind}] — preview: {pm_prev:.3f}, final: {pm_fin:.3f}",
+ emoji="📌",
+ profile=profile_normalized,
  )
  ratio_vs_base = pm_obj.get("ratio_vs_baseline")
  if isinstance(ratio_vs_base, (int | float)) and math.isfinite(
  ratio_vs_base
  ):
- console.print(
- f"🔗 Ratio vs baseline [{pm_kind}]: {ratio_vs_base:.3f}"
+ _event(
+ console,
+ "METRIC",
+ f"Ratio vs baseline [{pm_kind}]: {ratio_vs_base:.3f}",
+ emoji="🔗",
+ profile=profile_normalized,
  )
  except Exception:
  pass
@@ -4425,8 +5091,12 @@ def run_command(
  console, guard_overhead_info
  )
  if not guard_overhead_info.get("passed", True):
- console.print(
- "[red]⚠️ Guard overhead gate FAILED: Guards add more than the permitted budget[/red]"
+ _event(
+ console,
+ "FAIL",
+ "Guard overhead gate FAILED: Guards add more than the permitted budget",
+ emoji="⚠️",
+ profile=profile_normalized,
  )
  # Only fail hard when the overhead check was actually evaluated
  # (e.g., for causal LMs with available bare/guarded PM). For
@@ -4467,7 +5137,13 @@ def run_command(
  if baseline_report is None:
  raise FileNotFoundError("Baseline report unavailable")

- console.print("📜 Generating safety certificate...")
+ _event(
+ console,
+ "EXEC",
+ "Generating evaluation certificate...",
+ emoji="📜",
+ profile=profile_normalized,
+ )
  certificate = make_certificate(report, baseline_report)

  validation = certificate.get("validation", {})
@@ -4484,11 +5160,21 @@ def run_command(
  )

  if certificate_passed:
- console.print("[green]✅ Certificate PASSED all gates![/green]")
+ _event(
+ console,
+ "PASS",
+ "Certificate PASSED all gates!",
+ emoji="✅",
+ profile=profile_normalized,
+ )
  break
  else:
- console.print(
- f"[yellow]⚠️ Certificate FAILED gates: {', '.join(failed_gates)}[/yellow]"
+ _event(
+ console,
+ "FAIL",
+ f"Certificate FAILED gates: {', '.join(failed_gates)}",
+ emoji="⚠️",
+ profile=profile_normalized,
  )

  # Auto-tune mask-only heads (binary search on keep count)
@@ -4533,8 +5219,12 @@ def run_command(
  }
  )
  head_section["global_k"] = next_keep
- console.print(
- f"🔧 Auto-tune adjust: global_k → {next_keep} (bounds {keep_low}-{keep_high})"
+ _event(
+ console,
+ "INIT",
+ f"Auto-tune adjust: global_k → {next_keep} (bounds {keep_low}-{keep_high})",
+ emoji="🔧",
+ profile=profile_normalized,
  )
  except Exception:
  pass
@@ -4543,14 +5233,22 @@ def run_command(
  attempt += 1
  continue
  else:
- console.print(
- f"[red]❌ Exhausted retry budget after {attempt} attempts[/red]"
+ _event(
+ console,
+ "FAIL",
+ f"Exhausted retry budget after {attempt} attempts",
+ emoji="❌",
+ profile=profile_normalized,
  )
  break

  except Exception as cert_error:
- console.print(
- f"[yellow]⚠️ Certificate validation failed: {cert_error}[/yellow]"
+ _event(
+ console,
+ "WARN",
+ f"Certificate validation failed: {cert_error}",
+ emoji="⚠️",
+ profile=profile_normalized,
  )
  if retry_controller:
  retry_controller.record_attempt(
@@ -4579,11 +5277,82 @@ def run_command(
  # (moved) Cleanup printing occurs after loop to guarantee execution
  pass

+ if output_style.timing:
+ total_duration = (
+ max(0.0, float(perf_counter() - total_start))
+ if total_start is not None
+ else None
+ )
+ timings_for_summary: dict[str, float] = {}
+ for key, value in timings.items():
+ if isinstance(value, (int | float)):
+ timings_for_summary[key] = float(value)
+ if total_duration is not None:
+ timings_for_summary["total"] = total_duration
+
+ has_breakdown = any(
+ key in timings_for_summary
+ for key in (
+ "prepare",
+ "prepare_guards",
+ "edit",
+ "guards",
+ "eval",
+ "finalize",
+ )
+ )
+
+ order: list[tuple[str, str]] = []
+
+ def _add(label: str, key: str) -> None:
+ if key in timings_for_summary:
+ order.append((label, key))
+
+ _add("Load model", "load_model")
+ _add("Load data", "load_dataset")
+ if has_breakdown:
+ _add("Prepare", "prepare")
+ _add("Prep guards", "prepare_guards")
+ _add("Edit", "edit")
+ _add("Guards", "guards")
+ _add("Eval", "eval")
+ _add("Finalize", "finalize")
+ else:
+ _add("Execute", "execute")
+ _add("Total", "total")
+
+ extra_lines: list[str] = []
+ metrics_section = (
+ report.get("metrics", {}) if isinstance(report, dict) else {}
+ )
+ if isinstance(metrics_section, dict):
+ mem_peak = metrics_section.get("memory_mb_peak")
+ gpu_peak = metrics_section.get("gpu_memory_mb_peak")
+ if isinstance(mem_peak, (int | float)):
+ extra_lines.append(f" Peak Memory : {float(mem_peak):.2f} MB")
+ if isinstance(gpu_peak, (int | float)):
+ extra_lines.append(f" Peak GPU Mem: {float(gpu_peak):.2f} MB")
+
+ if timings_for_summary and order:
+ print_timing_summary(
+ console,
+ timings_for_summary,
+ style=output_style,
+ order=order,
+ extra_lines=extra_lines,
+ )
+
  # Normal path falls through; cleanup handled below in finally
  return report_path_out

  except FileNotFoundError as e:
- console.print(f"[red]❌ Configuration file not found: {e}[/red]")
+ _event(
+ console,
+ "FAIL",
+ f"Configuration file not found: {e}",
+ emoji="❌",
+ profile=profile_normalized,
+ )
  raise typer.Exit(1) from e
  except InvarlockError as ce:
  # InvarlockError → code 3 only in CI/Release; dev → 1
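
The timing summary prefers the per-phase breakdown (prepare → finalize) when
the core runner supplied one, and otherwise falls back to the single "Execute"
bucket plus load times. A sketch of a print_timing_summary compatible with the
call above; the shipped version may format differently:

def print_timing_summary(console, timings, *, style=None, order=None, extra_lines=None):
    """Print labeled durations in the given order, then any extra lines."""
    console.print("Timing Summary")
    for label, key in order or []:
        console.print(f"  {label:<12}: {timings[key]:.2f}s")
    for line in extra_lines or []:
        console.print(line)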
@@ -4599,12 +5368,22 @@ def run_command(
  traceback.print_exc()
  # Emit a clearer message for schema failures (exit 2)
  if isinstance(e, ValueError) and "Invalid RunReport" in str(e):
- console.print(
- "[red]❌ Schema invalid: run report structure failed validation[/red]"
+ _event(
+ console,
+ "FAIL",
+ "Schema invalid: run report structure failed validation",
+ emoji="❌",
+ profile=profile_normalized,
  )
  code = 2
  else:
- console.print(f"[red]❌ Pipeline execution failed: {e}[/red]")
+ _event(
+ console,
+ "FAIL",
+ f"Pipeline execution failed: {e}",
+ emoji="❌",
+ profile=profile_normalized,
+ )
  code = _resolve_exit_code(e, profile=profile)
  raise typer.Exit(code) from e
  finally:
@@ -4618,20 +5397,53 @@ def run_command(
  except Exception:
  pass
  finally:
- console.print("cleanup: removed")
+ _event(
+ console,
+ "INFO",
+ "Cleanup: removed",
+ emoji="🧹",
+ profile=profile_normalized,
+ )
  else:
- console.print("cleanup: skipped")
+ _event(
+ console,
+ "INFO",
+ "Cleanup: skipped",
+ emoji="🧹",
+ profile=profile_normalized,
+ )
  except Exception:
  # Best-effort cleanup printing; never raise from finally
  pass


+ def _merge_primary_metric_health(
+ primary_metric: dict[str, Any] | None,
+ core_primary_metric: dict[str, Any] | None,
+ ) -> dict[str, Any]:
+ if not isinstance(primary_metric, dict):
+ return {}
+ merged = dict(primary_metric)
+ if not isinstance(core_primary_metric, dict):
+ return merged
+ if core_primary_metric.get("invalid") is True:
+ merged["invalid"] = True
+ merged["degraded"] = True
+ if core_primary_metric.get("degraded") is True:
+ merged["degraded"] = True
+ core_reason = core_primary_metric.get("degraded_reason")
+ if isinstance(core_reason, str) and core_reason:
+ merged["degraded_reason"] = core_reason
+ merged["degraded"] = True
+ return merged
+
+
  def _format_debug_metric_diffs(
  pm: dict[str, float] | None,
  metrics: dict[str, float] | None,
  baseline_report_data: dict | None,
  ) -> str:
- """Build a compact DEBUG_METRIC_DIFFS line comparing current snapshot vs legacy ppl_*.
+ """Build a compact DEBUG_METRIC_DIFFS line comparing current snapshot vs ppl_*.

  Returns a semicolon-separated string of deltas like
  "final: v1-v1 = +0.000000000; Δlog(final): +0.000000000; ...". Safe to call with
@@ -4746,11 +5558,9 @@ def _print_guard_overhead_summary(
  """Print a concise guard-overhead console summary. Returns threshold fraction used."""
  evaluated = bool(guard_overhead_info.get("evaluated", True))
  if not evaluated:
- console.print("🛡️ Guard Overhead: not evaluated")
+ _event(console, "METRIC", "Guard Overhead: not evaluated", emoji="🛡️")
  return GUARD_OVERHEAD_THRESHOLD
- overhead_status = (
- "✅ PASS" if guard_overhead_info.get("passed", True) else "❌ FAIL"
- )
+ overhead_status = "PASS" if guard_overhead_info.get("passed", True) else "FAIL"
  overhead_percent = guard_overhead_info.get("overhead_percent")
  if isinstance(overhead_percent, (int | float)) and math.isfinite(
  float(overhead_percent)
@@ -4769,8 +5579,11 @@ def _print_guard_overhead_summary(
  except (TypeError, ValueError):
  threshold_fraction = GUARD_OVERHEAD_THRESHOLD
  threshold_display = f"≤ +{threshold_fraction * 100:.1f}%"
- console.print(
- f"🛡️ Guard Overhead: {overhead_status} {overhead_display} ({threshold_display})"
+ _event(
+ console,
+ "METRIC",
+ f"Guard Overhead: {overhead_status} {overhead_display} ({threshold_display})",
+ emoji="🛡️",
  )
  return threshold_fraction

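The threshold line renders the permitted budget as a signed percentage. A
quick check of that formatting (the 1% value is illustrative, not the shipped
GUARD_OVERHEAD_THRESHOLD):

threshold_fraction = 0.01
assert f"≤ +{threshold_fraction * 100:.1f}%" == "≤ +1.0%"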
@@ -4780,8 +5593,12 @@ def _print_retry_summary(console: Console, retry_controller: Any | None) -> None
  try:
  if retry_controller and getattr(retry_controller, "attempt_history", None):
  summary = retry_controller.get_attempt_summary()
- console.print(
- f"\n📊 Retry Summary: {summary['total_attempts']} attempts in {summary['elapsed_time']:.1f}s"
+ console.print("\n")
+ _event(
+ console,
+ "METRIC",
+ f"Retry Summary: {summary['total_attempts']} attempts in {summary['elapsed_time']:.1f}s",
+ emoji="📊",
  )
  except Exception:
  # Never break the run for summary printing
@@ -4804,10 +5621,15 @@ def _init_retry_controller(
  retry_controller = RetryController(
  max_attempts=max_attempts, timeout=timeout, verbose=True
  )
- console.print(f"🔄 Retry mode enabled: max {max_attempts} attempts")
+ _event(
+ console,
+ "INIT",
+ f"Retry mode enabled: max {max_attempts} attempts",
+ emoji="🔄",
+ )
  if baseline:
- console.print(f"📋 Using baseline: {baseline}")
+ _event(console, "DATA", f"Using baseline: {baseline}", emoji="📋")
  else:
  if baseline:
- console.print(f"📋 Using baseline: {baseline}")
+ _event(console, "DATA", f"Using baseline: {baseline}", emoji="📋")
  return retry_controller
  return retry_controller