invarlock 0.3.6-py3-none-any.whl → 0.3.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +4 -4
- invarlock/adapters/__init__.py +10 -14
- invarlock/adapters/auto.py +37 -50
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_loading.py +7 -7
- invarlock/adapters/hf_mixin.py +53 -9
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/assurance/__init__.py +15 -23
- invarlock/cli/adapter_auto.py +32 -26
- invarlock/cli/app.py +128 -27
- invarlock/cli/commands/__init__.py +2 -2
- invarlock/cli/commands/calibrate.py +48 -4
- invarlock/cli/commands/doctor.py +8 -10
- invarlock/cli/commands/evaluate.py +986 -0
- invarlock/cli/commands/explain_gates.py +25 -17
- invarlock/cli/commands/export_html.py +11 -9
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +326 -92
- invarlock/cli/commands/run.py +1160 -228
- invarlock/cli/commands/verify.py +157 -97
- invarlock/cli/config.py +1 -1
- invarlock/cli/determinism.py +1 -1
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +4 -4
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/registry.py +9 -11
- invarlock/core/retry.py +14 -14
- invarlock/core/runner.py +112 -26
- invarlock/edits/noop.py +2 -2
- invarlock/edits/quant_rtn.py +67 -39
- invarlock/eval/__init__.py +1 -1
- invarlock/eval/bench.py +14 -10
- invarlock/eval/data.py +68 -23
- invarlock/eval/metrics.py +59 -1
- invarlock/eval/primary_metric.py +1 -1
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/rmt.py +2 -2
- invarlock/guards/spectral.py +1 -1
- invarlock/guards/variance.py +2 -2
- invarlock/model_profile.py +64 -62
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/plugins/hf_bnb_adapter.py +32 -21
- invarlock/reporting/__init__.py +18 -4
- invarlock/reporting/guards_analysis.py +154 -4
- invarlock/reporting/html.py +61 -11
- invarlock/reporting/normalizer.py +9 -2
- invarlock/reporting/policy_utils.py +1 -1
- invarlock/reporting/primary_metric_utils.py +11 -11
- invarlock/reporting/render.py +876 -510
- invarlock/reporting/report.py +72 -30
- invarlock/reporting/{certificate.py → report_builder.py} +252 -99
- invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
- invarlock/reporting/report_types.py +6 -1
- invarlock/reporting/telemetry.py +86 -0
- invarlock-0.3.8.dist-info/METADATA +283 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/RECORD +69 -64
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/WHEEL +1 -1
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- invarlock/cli/commands/certify.py +0 -422
- invarlock-0.3.6.dist-info/METADATA +0 -588
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/top_level.txt +0 -0
invarlock/cli/commands/run.py
CHANGED
@@ -4,21 +4,25 @@ InvarLock CLI Run Command

 Run a guarded pipeline from a YAML config. Intended for local smokes,
 plugin demos, and development. Advanced: for pairwise certification,
-prefer Compare &
+prefer Compare & Evaluate via `invarlock evaluate --baseline ... --subject ...`.
 """

 import copy
 import hashlib
 import inspect
 import json
+import logging
 import math
 import os
 import random
+import re
 import shutil
 import sys as _sys
 import types as _types
+import warnings
 from array import array
-from collections.abc import Iterable, Sequence
+from collections.abc import Callable, Iterable, Iterator, Sequence
+from contextlib import contextmanager
 from datetime import datetime
 from pathlib import Path
 from types import SimpleNamespace
@@ -30,6 +34,16 @@ import psutil
 import typer
 from rich.console import Console

+from invarlock.cli.output import (
+    OutputStyle,
+    make_console,
+    perf_counter,
+    print_event,
+    print_timing_summary,
+    resolve_output_style,
+    timed_step,
+)
+
 try:
     import torch
 except ImportError:
@@ -63,7 +77,42 @@ from ..config import (
 )
 from ..overhead_utils import _extract_pm_snapshot_for_overhead

-console =
+console = make_console()
+
+
+def _style_from_console(console: Console, profile: str | None = None) -> OutputStyle:
+    style = getattr(console, "_invarlock_output_style", None)
+    if isinstance(style, OutputStyle):
+        return style
+    return resolve_output_style(
+        style=None,
+        profile=profile,
+        progress=False,
+        timing=False,
+        no_color=False,
+    )
+
+
+def _event(
+    console: Console,
+    tag: str,
+    message: str,
+    *,
+    emoji: str | None = None,
+    console_style: str | None = None,
+    profile: str | None = None,
+) -> None:
+    style = _style_from_console(console, profile=profile)
+    print_event(
+        console,
+        tag,
+        message,
+        style=style,
+        emoji=emoji,
+        console_style=console_style,
+    )
+
+
 LIGHT_IMPORT = os.getenv("INVARLOCK_LIGHT_IMPORT", "").strip().lower() in {
     "1",
     "true",
@@ -76,6 +125,232 @@ RELEASE_MIN_WINDOWS_PER_ARM = 200
 RELEASE_CALIBRATION_MIN = 16
 RELEASE_CALIBRATION_MAX = 24
 GUARD_OVERHEAD_THRESHOLD = 0.01
+KV_LABEL_WIDTH = 10
+
+_NOISY_WARNING_PATTERNS = (r".*loss_type=None.*unrecognized.*",)
+
+
+def _resolve_warning_suppression(profile: str | None) -> tuple[bool, bool]:
+    suppress_all = os.getenv("INVARLOCK_SUPPRESS_WARNINGS", "").strip().lower() in {
+        "1",
+        "true",
+        "yes",
+        "on",
+    }
+    profile_norm = (profile or "").strip().lower()
+    enabled = bool(suppress_all) or profile_norm in {"ci", "ci_cpu", "release"}
+    return enabled, suppress_all
+
+
+def _apply_warning_filters(profile: str | None) -> bool:
+    enabled, suppress_all = _resolve_warning_suppression(profile)
+    if not enabled:
+        return False
+    if suppress_all:
+        warnings.simplefilter("ignore")
+    else:
+        for pattern in _NOISY_WARNING_PATTERNS:
+            warnings.filterwarnings("ignore", message=pattern)
+    return True
+
+
+@contextmanager
+def _suppress_noisy_warnings(
+    profile: str | None,
+    *,
+    event_path: Path | None = None,
+    context: dict[str, Any] | None = None,
+) -> Iterator[None]:
+    enabled, suppress_all = _resolve_warning_suppression(profile)
+    if not enabled:
+        yield
+        return
+
+    prev_tf_verbosity = os.environ.get("TRANSFORMERS_VERBOSITY")
+    os.environ["TRANSFORMERS_VERBOSITY"] = "error"
+    transformers_logger = logging.getLogger("transformers")
+    prev_tf_level = transformers_logger.level
+    transformers_logger.setLevel(logging.ERROR)
+
+    patterns = [re.compile(p) for p in _NOISY_WARNING_PATTERNS]
+    suppressed: list[str] = []
+
+    class _NoisyLogFilter(logging.Filter):
+        def filter(self, record: logging.LogRecord) -> bool:  # noqa: A003
+            try:
+                message = record.getMessage()
+            except Exception:
+                return True
+            if any(p.search(message) for p in patterns):
+                suppressed.append(message)
+                return False
+            return True
+
+    def _iter_handlers() -> list[logging.Handler]:
+        handlers: list[logging.Handler] = []
+        seen: set[int] = set()
+        for logger in (
+            logging.getLogger(),
+            logging.getLogger("transformers"),
+            logging.getLogger("huggingface_hub"),
+            logging.getLogger("datasets"),
+        ):
+            for handler in getattr(logger, "handlers", []) or []:
+                if id(handler) in seen:
+                    continue
+                seen.add(id(handler))
+                handlers.append(handler)
+        return handlers
+
+    log_filter = _NoisyLogFilter()
+    handlers = _iter_handlers()
+
+    def _append_suppressed_warnings() -> None:
+        if not suppressed or event_path is None:
+            return
+        try:
+            path = Path(event_path)
+            path.parent.mkdir(parents=True, exist_ok=True)
+            payload = {
+                "timestamp": datetime.now().isoformat(),
+                "component": "warnings",
+                "operation": "suppressed",
+                "level": "WARNING",
+                "data": {
+                    "count": len(suppressed),
+                    "messages": suppressed[:50],
+                    "profile": profile or "",
+                    **(context or {}),
+                },
+            }
+            with path.open("a", encoding="utf-8") as fh:
+                fh.write(json.dumps(payload) + "\n")
+        except Exception:
+            # Best-effort: suppressed warnings are non-fatal and logging must not
+            # impact model loading.
+            return
+
+    for handler in handlers:
+        handler.addFilter(log_filter)
+
+    try:
+        with warnings.catch_warnings():
+            from contextlib import redirect_stderr, redirect_stdout
+
+            class _FilteredStream:
+                def __init__(self, raw: Any) -> None:
+                    self._raw = raw
+
+                def __getattr__(self, name: str) -> object:
+                    return getattr(self._raw, name)
+
+                def write(self, s: object) -> int:
+                    try:
+                        if isinstance(s, bytes):
+                            text = s.decode("utf-8", errors="replace")
+                        else:
+                            text = str(s)
+                    except Exception:
+                        return int(self._raw.write(s))
+
+                    # Preserve progress bars (carriage returns) by passing through
+                    # all non-matching chunks immediately.
+                    pieces = text.splitlines(keepends=True)
+                    for piece in pieces:
+                        if any(p.search(piece) for p in patterns):
+                            suppressed.append(piece.rstrip("\n"))
+                            continue
+                        self._raw.write(piece)
+                    return len(text)
+
+                def flush(self) -> None:
+                    try:
+                        self._raw.flush()
+                    except Exception:
+                        pass
+
+            stdout_proxy = _FilteredStream(_sys.stdout)
+            stderr_proxy = _FilteredStream(_sys.stderr)
+
+            with redirect_stdout(stdout_proxy), redirect_stderr(stderr_proxy):
+                if suppress_all:
+                    warnings.simplefilter("ignore")
+                    yield
+                else:
+                    original_showwarning = warnings.showwarning
+
+                    def _showwarning(
+                        message: Warning | str,
+                        category: type[Warning],
+                        filename: str,
+                        lineno: int,
+                        file: object | None = None,
+                        line: str | None = None,
+                    ) -> None:
+                        try:
+                            rendered = warnings.formatwarning(
+                                message, category, filename, lineno, line
+                            )
+                        except Exception:
+                            rendered = str(message)
+                        if any(p.search(rendered) for p in patterns):
+                            suppressed.append(str(message))
+                            return
+                        original_showwarning(
+                            message,
+                            category,
+                            filename,
+                            lineno,
+                            file=file,
+                            line=line,
+                        )
+
+                    warnings.showwarning = _showwarning  # type: ignore[assignment]
+                    try:
+                        yield
+                    finally:
+                        warnings.showwarning = original_showwarning  # type: ignore[assignment]
+    finally:
+        for handler in handlers:
+            try:
+                handler.removeFilter(log_filter)
+            except Exception:
+                pass
+        try:
+            transformers_logger.setLevel(prev_tf_level)
+        except Exception:
+            pass
+        if prev_tf_verbosity is None:
+            os.environ.pop("TRANSFORMERS_VERBOSITY", None)
+        else:
+            os.environ["TRANSFORMERS_VERBOSITY"] = prev_tf_verbosity
+        _append_suppressed_warnings()
+
+
+def _format_kv_line(label: str, value: str, *, width: int = KV_LABEL_WIDTH) -> str:
+    return f" {label:<{width}}: {value}"
+
+
+def _device_resolution_note(target_device: str, resolved_device: str) -> str:
+    target_norm = str(target_device or "").strip().lower()
+    resolved_norm = str(resolved_device or "").strip().lower()
+    if not target_norm or target_norm == "auto":
+        return "auto-resolved"
+    if target_norm == resolved_norm:
+        return "requested"
+    return f"resolved from {target_device}"
+
+
+def _format_guard_chain(guards: list[Any]) -> str:
+    names = [str(getattr(guard, "name", "unknown")) for guard in guards]
+    seen: set[str] = set()
+    deduped: list[str] = []
+    for name in names:
+        if name in seen:
+            continue
+        seen.add(name)
+        deduped.append(name)
+    return " → ".join(deduped)


 # Common dataset split aliases we probe in order when not explicitly set
@@ -241,6 +516,89 @@ def _resolve_pm_acceptance_range(
     return {"min": float(min_val), "max": float(max_val)}


+def _resolve_pm_drift_band(
+    cfg: InvarLockConfig | dict[str, Any] | None,
+) -> dict[str, float]:
+    """Resolve preview→final drift band from config/env with safe defaults.
+
+    The drift band governs the Preview Final Drift Acceptable gate. By default,
+    evaluation reports enforce 0.95–1.05 unless an explicit band is provided.
+    """
+
+    base_min = 0.95
+    base_max = 1.05
+
+    cfg_min = None
+    cfg_max = None
+    try:
+        cfg_map = _coerce_mapping(cfg) if cfg is not None else {}
+        pm_section = cfg_map.get("primary_metric") if isinstance(cfg_map, dict) else {}
+        pm_map = _coerce_mapping(pm_section)
+        drift_band = pm_map.get("drift_band") if isinstance(pm_map, dict) else None
+        if isinstance(drift_band, dict):
+            if drift_band.get("min") is not None:
+                try:
+                    cfg_min = float(drift_band["min"])
+                except (TypeError, ValueError):
+                    cfg_min = None
+            if drift_band.get("max") is not None:
+                try:
+                    cfg_max = float(drift_band["max"])
+                except (TypeError, ValueError):
+                    cfg_max = None
+        elif isinstance(drift_band, list | tuple) and len(drift_band) == 2:
+            try:
+                cfg_min = float(drift_band[0])
+                cfg_max = float(drift_band[1])
+            except (TypeError, ValueError):
+                cfg_min = None
+                cfg_max = None
+    except Exception:
+        cfg_min = None
+        cfg_max = None
+
+    def _parse_env(name: str) -> float | None:
+        try:
+            raw = os.environ.get(name, "")
+            if raw is None or str(raw).strip() == "":
+                return None
+            return float(raw)
+        except Exception:
+            return None
+
+    env_min = _parse_env("INVARLOCK_PM_DRIFT_MIN")
+    env_max = _parse_env("INVARLOCK_PM_DRIFT_MAX")
+
+    has_explicit = any(v is not None for v in (cfg_min, cfg_max, env_min, env_max))
+    if not has_explicit:
+        return {}
+
+    min_val = (
+        env_min if env_min is not None else cfg_min if cfg_min is not None else base_min
+    )
+    max_val = (
+        env_max if env_max is not None else cfg_max if cfg_max is not None else base_max
+    )
+
+    try:
+        if min_val is not None and min_val <= 0:
+            min_val = base_min
+    except Exception:
+        min_val = base_min
+    try:
+        if max_val is not None and max_val <= 0:
+            max_val = base_max
+    except Exception:
+        max_val = base_max
+    try:
+        if min_val is not None and max_val is not None and min_val >= max_val:
+            min_val, max_val = base_min, base_max
+    except Exception:
+        min_val, max_val = base_min, base_max
+
+    return {"min": float(min_val), "max": float(max_val)}
+
+
 def _free_model_memory(model: object | None) -> None:
     """Best-effort cleanup to release GPU memory for a model object."""
     if model is None:
@@ -754,38 +1112,60 @@ def _prepare_config_for_run(
         resolve_edit_kind as _resolve_edit_kind,
     )

-
+    _event(
+        console,
+        "INIT",
+        f"Loading configuration: {config_path}",
+        emoji="📋",
+        profile=profile,
+    )
     cfg = _load_config(config_path)

     # Apply profile if specified (dev is a no-op)
     if profile and str(profile).lower() not in {"dev"}:
-
+        _event(
+            console, "INIT", f"Applying profile: {profile}", emoji="🎯", profile=profile
+        )
         try:
             cfg = _apply_profile(cfg, profile)
         except Exception as exc:
-            console
+            _event(console, "FAIL", str(exc), emoji="❌", profile=profile)
             raise typer.Exit(1) from exc

     # Apply edit override
     if edit:
         try:
             edit_name = _resolve_edit_kind(edit)
-
+            _event(
+                console,
+                "EXEC",
+                f"Edit override: {edit} → {edit_name}",
+                emoji="✂️",
+                profile=profile,
+            )
             cfg = _apply_edit_override(cfg, edit)
         except ValueError as e:
-            console
+            _event(console, "FAIL", str(e), emoji="❌", profile=profile)
             raise typer.Exit(1) from e

     # Apply CLI overrides for auto configuration
     if tier or probes is not None:
         if tier and tier not in ["conservative", "balanced", "aggressive", "none"]:
-
-
+            _event(
+                console,
+                "FAIL",
+                f"Invalid tier '{tier}'. Valid options: conservative, balanced, aggressive, none",
+                emoji="❌",
+                profile=profile,
            )
            raise typer.Exit(1)
         if probes is not None and (probes < 0 or probes > 10):
-
-
+            _event(
+                console,
+                "FAIL",
+                f"Invalid probes '{probes}'. Must be between 0 and 10",
+                emoji="❌",
+                profile=profile,
            )
            raise typer.Exit(1)

@@ -796,10 +1176,22 @@ def _prepare_config_for_run(
         cfg_dict["auto"] = auto_section
         if tier:
             auto_section["tier"] = tier
-
+            _event(
+                console,
+                "INIT",
+                f"Auto tier override: {tier}",
+                emoji="🎛️",
+                profile=profile,
+            )
         if probes is not None:
             auto_section["probes"] = probes
-
+            _event(
+                console,
+                "INIT",
+                f"Auto probes override: {probes}",
+                emoji="🔬",
+                profile=profile,
+            )
         cfg = InvarLockConfig(cfg_dict)

     # Resolve adapter:auto to a concrete built-in adapter if requested
@@ -832,7 +1224,7 @@ def _maybe_plan_release_windows(


 def _print_pipeline_start(console: Console) -> None:
-    console
+    _event(console, "INIT", "Starting InvarLock pipeline...", emoji="🚀")


 def _emit_run_artifacts(
@@ -841,7 +1233,7 @@ def _emit_run_artifacts(
     """Save run report and return emitted artifact paths."""
     from invarlock.reporting.report import save_report as _save_report

-    console
+    _event(console, "DATA", "Saving run report...", emoji="💾")
     return _save_report(
         report, out_dir, formats=["json"], filename_prefix=filename_prefix
     )
@@ -864,12 +1256,11 @@ def _resolve_device_and_output(
     cfg_device = None
     target_device = device or cfg_device or "auto"
     resolved_device = _resolve_device(target_device)
-
-
-    )
+    resolution_note = _device_resolution_note(target_device, resolved_device)
+    console.print(_format_kv_line("Device", f"{resolved_device} ({resolution_note})"))
     is_valid, error_msg = _validate(resolved_device)
     if not is_valid:
-        console
+        _event(console, "FAIL", f"Device validation failed: {error_msg}", emoji="❌")
         raise typer.Exit(1)

     # Determine output directory
@@ -892,6 +1283,7 @@ def _resolve_provider_and_split(
     provider_kwargs: dict[str, Any] | None = None,
     console: Console,
     resolved_device: str | None = None,
+    emit: Callable[[str, str, str | None], None] | None = None,
 ) -> tuple[Any, str, bool]:
     """Resolve dataset provider and split, returning (provider, split, used_fallback)."""
     provider_name = None
@@ -918,7 +1310,10 @@ def _resolve_provider_and_split(
     # Pass device hint only to providers that understand it (currently WikiText-2)
     if resolved_device and provider_name == "wikitext2":
         provider_kwargs.setdefault("device_hint", resolved_device)
-
+    if emit is not None and provider_name == "wikitext2":
+        data_provider = get_provider_fn(provider_name, emit=emit, **provider_kwargs)
+    else:
+        data_provider = get_provider_fn(provider_name, **provider_kwargs)

     requested_split = None
     try:
@@ -951,13 +1346,24 @@ def _extract_model_load_kwargs(cfg: InvarLockConfig) -> dict[str, Any]:
         for key, value in model.items()
         if key not in {"id", "adapter", "device"} and value is not None
     }
-
-
-
+    removed_keys: list[str] = []
+    for key in ("torch_dtype", "load_in_8bit", "load_in_4bit"):
+        if key in extra:
+            removed_keys.append(key)
+    if removed_keys:
+        raise InvarlockError(
+            code="E007",
+            message=(
+                "CONFIG-KEY-REMOVED: "
+                + ", ".join(removed_keys)
+                + ". Use model.dtype and/or model.quantization_config."
+            ),
+            details={"removed_keys": removed_keys},
+        )

-    # Normalize
-    if "
-        dtype_str = str(extra.get("
+    # Normalize dtype when present (keep as string for JSON-ability).
+    if "dtype" in extra and isinstance(extra.get("dtype"), str):
+        dtype_str = str(extra.get("dtype") or "").strip().lower()
         aliases = {
             "fp16": "float16",
             "half": "float16",
@@ -965,14 +1371,22 @@ def _extract_model_load_kwargs(cfg: InvarLockConfig) -> dict[str, Any]:
             "fp32": "float32",
         }
         if dtype_str in aliases:
-            extra["
+            extra["dtype"] = aliases[dtype_str]
         elif dtype_str:
-            extra["
+            extra["dtype"] = dtype_str

     return extra


-def _load_model_with_cfg(
+def _load_model_with_cfg(
+    adapter: Any,
+    cfg: InvarLockConfig,
+    device: str,
+    *,
+    profile: str | None = None,
+    event_path: Path | None = None,
+    warning_context: dict[str, Any] | None = None,
+) -> Any:
     """Load a model with config-provided kwargs, filtering for strict adapters."""
     try:
         model_id = cfg.model.id
@@ -985,20 +1399,25 @@ def _load_model_with_cfg(adapter: Any, cfg: InvarLockConfig, device: str) -> Any
         raise ValueError("Missing model.id in config")

     extra = _extract_model_load_kwargs(cfg)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    with _suppress_noisy_warnings(
+        profile,
+        event_path=event_path,
+        context=warning_context,
+    ):
+        try:
+            sig = inspect.signature(adapter.load_model)
+            accepts_var_kw = any(
+                p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()
+            )
+            if accepts_var_kw:
+                return adapter.load_model(model_id, device=device, **extra)
+            allowed = {k: v for k, v in extra.items() if k in sig.parameters}
+            if allowed:
+                return adapter.load_model(model_id, device=device, **allowed)
+        except Exception:
+            # Fall back to the strictest call shape.
+            pass
+        return adapter.load_model(model_id, device=device)


 def _run_bare_control(
@@ -1018,14 +1437,20 @@ def _run_bare_control(
     restore_fn: Any | None,
     console: Console,
     resolved_loss_type: str,
-    profile_normalized: str | None,
+    profile_normalized: str | None = None,
     snapshot_provenance: dict[str, bool] | None = None,
     skip_model_load: bool = False,
 ) -> dict[str, Any] | None:
     """Execute the bare-control run for overhead estimation and return payload."""
     from invarlock.core.runner import CoreRunner as _CoreRunner

-
+    _event(
+        console,
+        "EXEC",
+        "Running bare control (guards disabled) for overhead check",
+        emoji="🧪",
+        profile=profile_normalized,
+    )
     set_seed(seed_bundle["python"])  # type: ignore[arg-type]

     bare_runner = _CoreRunner()
@@ -1034,6 +1459,12 @@ def _run_bare_control(
     bare_context = copy.deepcopy(run_config.context)
     bare_context.setdefault("validation", {})["guard_overhead_mode"] = "bare"
     bare_config.context = bare_context
+    runtime_edit_config = dict(edit_config or {})
+    runtime_edit_config.setdefault("console", console)
+    runtime_edit_config.setdefault(
+        "output_style", _style_from_console(console, profile=profile_normalized)
+    )
+    runtime_edit_config.setdefault("emit", True)

     private_model_loaded = False
     bare_target_model = None
@@ -1047,23 +1478,30 @@ def _run_bare_control(
         elif skip_model_load:
             bare_target_model = model or SimpleNamespace(name="bare_stub_model")
         else:
-            bare_target_model = _load_model_with_cfg(
+            bare_target_model = _load_model_with_cfg(
+                adapter, cfg, resolved_device, profile=profile_normalized
+            )
             private_model_loaded = True
             if snapshot_provenance is not None:
                 snapshot_provenance["reload_path_used"] = True

-
-
-
-
-
-
-
-
-
-
-
-
+        with _suppress_noisy_warnings(
+            profile_normalized,
+            event_path=getattr(run_config, "event_path", None),
+            context={"phase": "guard_overhead_bare"},
+        ):
+            bare_report = bare_runner.execute(
+                model=bare_target_model,
+                adapter=adapter,
+                edit=edit_op,
+                guards=[],
+                config=bare_config,
+                calibration_data=calibration_data,
+                auto_config=auto_config,
+                edit_config=runtime_edit_config,
+                preview_n=preview_count,
+                final_n=final_count,
+            )
     finally:
         if private_model_loaded:
             _free_model_memory(bare_target_model)
@@ -1084,8 +1522,12 @@ def _run_bare_control(
         return False

     if not (_finite(bare_ppl_preview) and _finite(bare_ppl_final)):
-
-
+        _event(
+            console,
+            "WARN",
+            "Primary metric non-finite during bare control; continuing with diagnostics.",
+            emoji="⚠️",
+            profile=profile_normalized,
         )

     payload: dict[str, Any] = {
@@ -1137,6 +1579,7 @@ def _execute_guarded_run(
     final_count: int,
     restore_fn: Any | None,
     resolved_device: str,
+    profile_normalized: str | None = None,
     console: Console,
     snapshot_provenance: dict[str, bool] | None = None,
     skip_model_load: bool = False,
@@ -1150,23 +1593,56 @@ def _execute_guarded_run(
     elif skip_model_load:
         model = model or SimpleNamespace(name="guarded_stub_model")
     else:
-
-
+        _event(
+            console,
+            "INIT",
+            f"Loading model: {cfg.model.id} (attempt 1)",
+            emoji="🔧",
+            profile=profile_normalized,
+        )
+        warning_context: dict[str, Any] = {"phase": "load_model"}
+        try:
+            if hasattr(run_config, "context") and isinstance(run_config.context, dict):
+                rid = run_config.context.get("run_id")
+                if isinstance(rid, str) and rid:
+                    warning_context["run_id"] = rid
+        except Exception:
+            pass
+        model = _load_model_with_cfg(
+            adapter,
+            cfg,
+            resolved_device,
+            profile=profile_normalized,
+            event_path=getattr(run_config, "event_path", None),
+            warning_context=warning_context,
+        )
         if snapshot_provenance is not None:
             snapshot_provenance["reload_path_used"] = True

-
-
-
-
-        guards=guards,
-        config=run_config,
-        calibration_data=calibration_data,
-        auto_config=auto_config,
-        edit_config=edit_config,
-        preview_n=preview_count,
-        final_n=final_count,
+    runtime_edit_config = dict(edit_config or {})
+    runtime_edit_config.setdefault("console", console)
+    runtime_edit_config.setdefault(
+        "output_style", _style_from_console(console, profile=profile_normalized)
    )
+    runtime_edit_config.setdefault("emit", True)
+
+    with _suppress_noisy_warnings(
+        profile_normalized,
+        event_path=getattr(run_config, "event_path", None),
+        context={"phase": "core_runner_execute"},
+    ):
+        core_report = runner.execute(
+            model=model,
+            adapter=adapter,
+            edit=edit_op,
+            guards=guards,
+            config=run_config,
+            calibration_data=calibration_data,
+            auto_config=auto_config,
+            edit_config=runtime_edit_config,
+            preview_n=preview_count,
+            final_n=final_count,
+        )
     return core_report, model


@@ -1200,10 +1676,10 @@ def _postprocess_and_summarize(
     saved_files = _emit_run_artifacts(
         report=report, out_dir=run_dir, filename_prefix="report", console=console
     )
-    console
-    console
+    _event(console, "PASS", "Run completed successfully!", emoji="✅")
+    _event(console, "DATA", f"Report: {saved_files['json']}", emoji="📄")
     if run_config.event_path:
-        console
+        _event(console, "DATA", f"Events: {run_config.event_path}", emoji="📝")
     return saved_files


@@ -1293,9 +1769,14 @@ def _validate_and_harvest_baseline_schedule(
         message = f"PAIRING-EVIDENCE-MISSING: {path}: {reason}"
         if prof in {"ci", "release"}:
             raise InvarlockError(code="E001", message=message)
-
-
-
+        if console is not None:
+            _event(
+                console,
+                "FAIL",
+                f"Baseline pairing schedule '{path}' is incompatible: {reason}",
+                emoji="❌",
+                profile=prof,
+            )
         raise typer.Exit(1)

     baseline_meta = (
@@ -1450,9 +1931,14 @@ def _validate_and_harvest_baseline_schedule(
                 prof = (profile or "dev").strip().lower()
                 if prof in {"ci", "release"}:
                     _fail_schedule("preview_hash mismatch vs baseline report data")
-
-
-
+                if console is not None:
+                    _event(
+                        console,
+                        "WARN",
+                        "Baseline preview_hash mismatch; continuing in dev profile.",
+                        emoji="⚠️",
+                        profile=prof,
+                    )
             if (
                 isinstance(baseline_final_hash, str)
                 and baseline_final_hash
@@ -1461,9 +1947,14 @@ def _validate_and_harvest_baseline_schedule(
                 prof = (profile or "dev").strip().lower()
                 if prof in {"ci", "release"}:
                     _fail_schedule("final_hash mismatch vs baseline report data")
-
-
-
+                if console is not None:
+                    _event(
+                        console,
+                        "WARN",
+                        "Baseline final_hash mismatch; continuing in dev profile.",
+                        emoji="⚠️",
+                        profile=prof,
+                    )
             if (
                 isinstance(baseline_dataset_hash, str)
                 and baseline_dataset_hash
@@ -1472,9 +1963,14 @@ def _validate_and_harvest_baseline_schedule(
                 prof = (profile or "dev").strip().lower()
                 if prof in {"ci", "release"}:
                     _fail_schedule("dataset_hash mismatch vs baseline report data")
-
-
-
+                if console is not None:
+                    _event(
+                        console,
+                        "WARN",
+                        "Baseline dataset_hash mismatch; continuing in dev profile.",
+                        emoji="⚠️",
+                        profile=prof,
+                    )
     except InvarlockError:
         raise
     except typer.Exit:
@@ -1496,10 +1992,14 @@ def _validate_and_harvest_baseline_schedule(
         and baseline_final is not None
         and baseline_final != cfg_final
     ):
-
-
-
-
+        if console is not None:
+            _event(
+                console,
+                "WARN",
+                f"Adjusting evaluation window counts to match baseline schedule ({baseline_preview}/{baseline_final}).",
+                emoji="⚠️",
+                profile=profile,
+            )

         effective_preview = int(baseline_preview)
         effective_final = int(baseline_final)
@@ -1662,6 +2162,7 @@ def _resolve_metric_and_provider(
     model_profile: Any,
     *,
     resolved_loss_type: str | None = None,
+    metric_kind_override: str | None = None,
 ) -> tuple[str, str, dict[str, float]]:
     """Resolve metric kind, provider kind, and metric options from config with precedence.

@@ -1701,9 +2202,13 @@ def _resolve_metric_and_provider(
         metric_cfg = None

     metric_kind = None
+    if isinstance(metric_kind_override, str) and metric_kind_override.strip():
+        mk_override = metric_kind_override.strip().lower()
+        if mk_override != "auto":
+            metric_kind = mk_override
     reps = None
     ci_level = None
-    if metric_cfg is not None:
+    if metric_kind is None and metric_cfg is not None:
         try:
             metric_kind = (
                 metric_cfg.get("kind")
@@ -1825,18 +2330,25 @@ def _plan_release_windows(
     candidate_msg = f", candidate_unique={int(candidate_unique)}" + (
         f"/{int(candidate_limit)}" if candidate_limit is not None else ""
     )
-
-
+    _event(
+        console,
+        "METRIC",
+        "Release window capacity:"
         f" unique={available_unique}, reserve={reserve_windows} "
         f"(calib {calibration_windows}, buffer {buffer_windows}), "
         f"usable={available_for_eval}, "
         f"per-arm raw={actual_per_arm_raw} → selected {actual_per_arm} "
-        f"(target {target_per_arm}{candidate_msg})"
+        f"(target {target_per_arm}{candidate_msg})",
+        emoji="📏",
+        profile="release",
     )
     if actual_per_arm < target_per_arm:
-
-
-
+        _event(
+            console,
+            "WARN",
+            f"Adjusted per-arm windows down from {target_per_arm} to {actual_per_arm} based on capacity.",
+            emoji="⚠️",
+            profile="release",
        )

     plan = {
@@ -1893,16 +2405,31 @@ def run_command(
     ),
     out: str | None = typer.Option(None, "--out", help="Output directory override"),
     edit: str | None = typer.Option(None, "--edit", help="Edit kind (quant|mixed)"),
+    edit_label: str | None = typer.Option(
+        None,
+        "--edit-label",
+        help=(
+            "Edit algorithm label for BYOE models. Use 'noop' for baseline, "
+            "'quant_rtn' etc. for built-in edits, 'custom' for pre-edited models."
+        ),
+    ),
     tier: str | None = typer.Option(
         None,
         "--tier",
         help="Auto-tuning tier override (conservative|balanced|aggressive)",
     ),
+    metric_kind: str | None = typer.Option(
+        None,
+        "--metric-kind",
+        help="Primary metric kind override (ppl_causal|ppl_mlm|accuracy|etc.)",
+    ),
     probes: int | None = typer.Option(
         None, "--probes", help="Number of micro-probes (0=deterministic, >0=adaptive)"
     ),
     until_pass: bool = typer.Option(
-        False,
+        False,
+        "--until-pass",
+        help="Retry until evaluation report passes gates (max 3 attempts)",
     ),
     max_attempts: int = typer.Option(
         3, "--max-attempts", help="Maximum retry attempts for --until-pass mode"
@@ -1913,11 +2440,24 @@ def run_command(
     baseline: str | None = typer.Option(
         None,
         "--baseline",
-        help="Path to baseline report.json for
+        help="Path to baseline report.json for evaluation report validation",
     ),
     no_cleanup: bool = typer.Option(
         False, "--no-cleanup", help="Skip cleanup of temporary artifacts"
     ),
+    style: str | None = typer.Option(
+        None, "--style", help="Output style (audit|friendly)"
+    ),
+    progress: bool = typer.Option(
+        False, "--progress", help="Show progress done messages"
+    ),
+    timing: bool = typer.Option(False, "--timing", help="Show timing summary"),
+    telemetry: bool = typer.Option(
+        False, "--telemetry", help="Write telemetry JSON alongside the report"
+    ),
+    no_color: bool = typer.Option(
+        False, "--no-color", help="Disable ANSI colors (respects NO_COLOR=1)"
+    ),
 ):
     """
     Run InvarLock pipeline with the given configuration.
@@ -1925,7 +2465,7 @@ def run_command(
     The command assembles non-overlapping preview/final windows, executes the
     GuardChain (invariants → spectral → RMT → variance), checks pairing/overlap
     invariants, enforces guard-overhead ≤1 %, and emits a run report plus JSONL
-    events suitable for
+    events suitable for evaluation report generation.
     """

     try:
@@ -1936,24 +2476,57 @@ def run_command(
     config = _coerce_option(config)
     device = _coerce_option(device)
     profile = _coerce_option(profile)
+    profile_normalized = (str(profile or "")).strip().lower()
     out = _coerce_option(out)
     edit = _coerce_option(edit)
+    edit_label = _coerce_option(edit_label)
     tier = _coerce_option(tier)
+    metric_kind = _coerce_option(metric_kind)
     probes = _coerce_option(probes)
     until_pass = bool(_coerce_option(until_pass, False))
     max_attempts = int(_coerce_option(max_attempts, 3))
     timeout = _coerce_option(timeout)
     baseline = _coerce_option(baseline)
     no_cleanup = bool(_coerce_option(no_cleanup, False))
+    style = _coerce_option(style)
+    progress = bool(_coerce_option(progress, False))
+    timing = bool(_coerce_option(timing, False))
+    telemetry = bool(_coerce_option(telemetry, False))
+    no_color = bool(_coerce_option(no_color, False))
+
+    output_style = resolve_output_style(
+        style=str(style) if style is not None else None,
+        profile=profile_normalized,
+        progress=progress,
+        timing=timing,
+        no_color=no_color,
+    )
+    console._invarlock_output_style = output_style
+    if not output_style.color:
+        console.no_color = True
+    timings: dict[str, float] = {}
+    collect_timings = bool(output_style.timing or telemetry)
+    total_start: float | None = perf_counter() if collect_timings else None
+
+    _apply_warning_filters(profile_normalized)

     # Use shared CLI coercers from invarlock.cli.utils
     report_path_out: str | None = None

     def _fail_run(message: str) -> None:
-        console
+        _event(console, "FAIL", message, emoji="❌", profile=profile_normalized)
         # Generic failure path → exit 1 (InvarlockError paths handle code 3 separately)
         raise typer.Exit(1)

+    def _provider_event(tag: str, message: str, emoji: str | None = None) -> None:
+        _event(
+            console,
+            tag,
+            message,
+            emoji=emoji,
+            profile=profile_normalized,
+        )
+
     # Fail fast when torch is missing so users see a clear extras hint instead of
     # a raw ModuleNotFoundError from deeper imports.
     try:
@@ -1961,12 +2534,14 @@ def run_command(

         _ = _torch  # pragma: no cover
     except (ImportError, ModuleNotFoundError) as e:
-
-
+        _event(
+            console,
+            "FAIL",
+            "Torch is required for this command. "
            'Install extras with: pip install "invarlock[hf]" '
            'or "invarlock[adapters]".',
-
-
+            emoji="❌",
+            profile=profile_normalized,
        )
        raise typer.Exit(1) from e

@@ -2044,7 +2619,7 @@ def run_command(
         seed_value = 42
     set_seed(seed_value)
     # Enforce deterministic algorithms in CI/Release profiles when torch is available
-    profile_label =
+    profile_label = profile_normalized or None
     if torch is not None and profile_label in {"ci", "release"}:
         try:  # pragma: no cover - behavior depends on torch availability
             if hasattr(torch, "use_deterministic_algorithms"):
@@ -2073,10 +2648,14 @@ def run_command(
         "numpy": int(numpy_seed),
         "torch": int(torch_seed) if torch_seed is not None else None,
     }
-
-
+    _event(
+        console,
+        "INIT",
+        "Deterministic seeds → "
        f"python={seed_bundle['python']}, numpy={seed_bundle['numpy']}, "
-        f"torch={seed_bundle['torch'] if seed_bundle['torch'] is not None else 'N/A'}"
+        f"torch={seed_bundle['torch'] if seed_bundle['torch'] is not None else 'N/A'}",
+        emoji="🎲",
+        profile=profile_normalized,
    )

     # Resolve device and output directory
@@ -2111,8 +2690,8 @@ def run_command(

     run_id = f"{output_dir.name}-{timestamp}" if output_dir.name else timestamp

-    console.print(
-    console.print(
+    console.print(_format_kv_line("Output", str(run_dir)))
+    console.print(_format_kv_line("Run ID", run_id))

     # Initialize retry controller if --until-pass mode enabled
     retry_controller = _init_retry_controller(
@@ -2127,7 +2706,6 @@ def run_command(
     pairing_schedule: dict[str, Any] | None = None
     if baseline:
         baseline_path = Path(baseline)
-        profile_normalized = (profile or "").strip().lower()
         strict_baseline = profile_normalized in {"ci", "release"}
         if not baseline_path.exists():
             msg = (
@@ -2136,8 +2714,12 @@ def run_command(
             )
             if strict_baseline:
                 raise InvarlockError(code="E001", message=msg)
-
-
+            _event(
+                console,
+                "WARN",
+                f"{msg}. Falling back to dataset schedule.",
+                emoji="⚠️",
+                profile=profile_normalized,
            )
         else:
             try:
@@ -2147,8 +2729,12 @@ def run_command(
                 msg = f"PAIRING-EVIDENCE-MISSING: baseline report JSON parse failed ({exc})"
                 if strict_baseline:
                     raise InvarlockError(code="E001", message=msg) from exc
-
-
+                _event(
+                    console,
+                    "WARN",
+                    f"{msg}. Falling back to dataset schedule.",
+                    emoji="⚠️",
+                    profile=profile_normalized,
                )
                 baseline_report_data = None
             if isinstance(baseline_report_data, dict):
@@ -2206,8 +2792,12 @@ def run_command(
                         tokenizer_hash = tok
                 except Exception:
                     pass
-
-
+                _event(
+                    console,
+                    "DATA",
+                    "Loaded baseline evaluation schedule for pairing",
+                    emoji="🧬",
+                    profile=profile_normalized,
                )
             else:
                 msg = (
@@ -2216,8 +2806,12 @@ def run_command(
                 )
                 if strict_baseline:
                     raise InvarlockError(code="E001", message=msg)
-
-
+                _event(
+                    console,
+                    "WARN",
+                    f"{msg}. Falling back to dataset schedule.",
+                    emoji="⚠️",
+                    profile=profile_normalized,
                )
                 baseline_report_data = None
                 pairing_schedule = None
@@ -2243,15 +2837,23 @@ def run_command(
     adapter = registry.get_adapter(cfg.model.adapter)
     edit_name = getattr(getattr(cfg, "edit", None), "name", None)
     if not isinstance(edit_name, str) or not edit_name.strip():
-
-
+        _event(
+            console,
+            "FAIL",
+            "Edit configuration must specify a non-empty `edit.name`.",
+            emoji="❌",
+            profile=profile_normalized,
        )
        raise typer.Exit(1)
     try:
         edit_op = registry.get_edit(edit_name.strip())
     except Exception:
-
-
+        _event(
+            console,
+            "WARN",
+            f"Unknown edit '{edit_name.strip()}'. Using pass-through shim.",
+            emoji="⚠️",
+            profile=profile_normalized,
        )
        edit_op = SimpleNamespace(name=edit_name.strip())

@@ -2287,8 +2889,12 @@ def run_command(
                 registry.get_plugin_metadata(guard_name, "guards")
             )
         except KeyError:
-
-
+            _event(
+                console,
+                "WARN",
+                f"Guard '{guard_name}' not found, skipping",
+                emoji="⚠️",
+                profile=profile_normalized,
            )
     plugin_provenance = {
         "adapter": adapter_meta,
@@ -2296,8 +2902,15 @@ def run_command(
         "guards": guard_metadata,
     }
     pm_acceptance_range = _resolve_pm_acceptance_range(cfg)
-
-
+    pm_drift_band = _resolve_pm_drift_band(cfg)
+
+    _event(
+        console,
+        "DATA",
+        f"Adapter: {adapter.name}",
+        emoji="🔌",
+        profile=profile_normalized,
+    )

     # Create run configuration
     guard_overrides = {
@@ -2361,6 +2974,9 @@ def run_command(
             pm_acceptance_range
         )
         run_context["pm_acceptance_range"] = pm_acceptance_range
+    if pm_drift_band:
+        run_context.setdefault("primary_metric", {})["drift_band"] = pm_drift_band
+        run_context["pm_drift_band"] = pm_drift_band
     run_context["model_profile"] = {
         "family": model_profile.family,
         "default_loss": model_profile.default_loss,
@@ -2391,6 +3007,7 @@ def run_command(
     dataset_meta: dict[str, Any] = {}
     baseline_meta: dict[str, Any] = {}
     window_plan: dict[str, Any] | None = None
+    dataset_timing_start: float | None = perf_counter() if collect_timings else None
     if pairing_schedule:
         harvested = _validate_and_harvest_baseline_schedule(
             cfg,
@@ -2413,7 +3030,7 @@ def run_command(
         try:
             tokenizer, tokenizer_hash = resolve_tokenizer(model_profile)
         except Exception as exc:
-            console
+            _event(console, "FAIL", str(exc), emoji="❌", profile=profile)
             raise typer.Exit(1) from exc
         preview_window_ids = pairing_schedule["preview"].get("window_ids")
         preview_labels = pairing_schedule["preview"].get("labels")
@@ -2635,7 +3252,13 @@ def run_command(
         if capacity_meta and "window_capacity" not in dataset_meta:
             dataset_meta["window_capacity"] = capacity_meta
     elif cfg.dataset.provider:
-
+        _event(
+            console,
+            "DATA",
+            f"Loading dataset: {cfg.dataset.provider}",
+            emoji="📊",
+            profile=profile_normalized,
+        )
         # Pass through provider-specific kwargs when available
         provider_kwargs = {}
         for key in (
@@ -2695,6 +3318,7 @@ def run_command(
                 provider_kwargs=provider_kwargs,
                 console=console,
                 resolved_device=resolved_device,
+                emit=_provider_event,
             )
         )

@@ -2702,7 +3326,7 @@ def run_command(
         try:
             tokenizer, tokenizer_hash = resolve_tokenizer(model_profile)
         except Exception as exc:
-            console
+            _event(console, "FAIL", str(exc), emoji="❌", profile=profile)
             raise typer.Exit(1) from exc

         dataset_stride = getattr(
@@ -2736,7 +3360,7 @@ def run_command(
                     console=console,
                 )
             except RuntimeError as err:
-                console
+                _event(console, "FAIL", str(err), emoji="❌", profile=profile)
                 raise typer.Exit(1) from err

             actual_per_arm = int(window_plan["actual_preview"])
@@ -2748,9 +3372,12 @@ def run_command(
                     cfg.dataset, "stride", getattr(cfg.dataset, "seq_len", 0)
                 )
             else:
-
-
-                    "
+                _event(
+                    console,
+                    "WARN",
+                    "Release profile requested but dataset provider does not expose capacity estimation; using configured window counts.",
+                    emoji="⚠️",
+                    profile=profile_normalized,
                )

             preview_records: list[tuple[list[int], list[int]]] = []
@@ -2954,8 +3581,12 @@ def run_command(
                     raise RuntimeError(
                         "Unable to construct non-overlapping windows within minimum window floor."
                     )
-
-
+                _event(
+                    console,
+                    "WARN",
+                    f"Detected {deficit} duplicate windows; reducing per-arm windows to {proposed_per_arm} and retrying stratification.",
+                    emoji="⚠️",
+                    profile=profile_normalized,
                )

                 effective_preview = proposed_per_arm
@@ -3097,6 +3728,10 @@ def run_command(
     run_context["dataset_meta"] = dataset_meta
     if window_plan:
         run_context["window_plan"] = window_plan
+    if dataset_timing_start is not None:
+        timings["load_dataset"] = max(
+            0.0, float(perf_counter() - dataset_timing_start)
+        )

     if os.environ.get("INVARLOCK_DEBUG_TRACE"):
         console.print(
@@ -3120,7 +3755,13 @@ def run_command(
         )

     # Execute the real pipeline using CoreRunner
-
+    _event(
+        console,
+        "EXEC",
+        f"Executing pipeline with {len(guards)} guards...",
+        emoji="⚙️",
+        profile=profile_normalized,
+    )
     runner = CoreRunner()

     # Prepare auto configuration for tier resolution
@@ -3185,8 +3826,8 @@ def run_command(
         for key, values in model_profile.module_selectors.items()
     }

-    console.print(
-    console.print(
+    console.print(_format_kv_line("Edit", str(edit_op.name)))
+    console.print(_format_kv_line("Guards", _format_guard_chain(guards)))

     # Model load/snapshot strategy
     model = None
@@ -3200,8 +3841,30 @@ def run_command(
     # Try single-load with snapshot/restore if adapter supports it; fallback to reload per attempt
     try:
         # Load once
-
-
+        _event(
+            console,
+            "INIT",
+            f"Loading model once: {cfg.model.id}",
+            emoji="🔧",
+            profile=profile_normalized,
+        )
+        with timed_step(
+            console=console,
+            style=_style_from_console(console, profile=profile_normalized),
+            timings=timings,
+            key="load_model",
+            tag="INIT",
+            message="Load model",
+            emoji="🔧",
+        ):
+            model = _load_model_with_cfg(
+                adapter,
+                cfg,
+                resolved_device,
+                profile=profile_normalized,
+                event_path=run_dir / "events.jsonl",
+                warning_context={"phase": "load_model", "run_id": run_id},
+            )

         # No edit-specific bootstrap logic

@@ -3357,9 +4020,13 @@ def run_command(
                 return "reload"

         mode = _choose_snapshot_mode()
-
-
-
+        enabled = mode in {"bytes", "chunked"}
+        _event(
+            console,
+            "INIT",
+            f"Snapshot mode: {'enabled' if enabled else 'disabled'}",
+            emoji="💾",
+            profile=profile_normalized,
        )
         if mode == "chunked":
             snapshot_tmpdir = adapter.snapshot_chunked(model)  # type: ignore[attr-defined]
@@ -3402,13 +4069,16 @@ def run_command(

     # RETRY LOOP - All report processing inside loop
     attempt = 1
-    profile_normalized = (profile or "").lower()
     measure_guard_overhead, skip_overhead = _should_measure_overhead(
|
)
|
|
2957
|
-
|
|
2958
|
-
|
|
3584
|
+
_event(
|
|
3585
|
+
console,
|
|
3586
|
+
"WARN",
|
|
3587
|
+
f"Detected {deficit} duplicate windows; reducing per-arm windows to {proposed_per_arm} and retrying stratification.",
|
|
3588
|
+
emoji="⚠️",
|
|
3589
|
+
profile=profile_normalized,
|
|
2959
3590
|
)
|
|
2960
3591
|
|
|
2961
3592
|
effective_preview = proposed_per_arm
|
|
@@ -3097,6 +3728,10 @@ def run_command(
         run_context["dataset_meta"] = dataset_meta
         if window_plan:
             run_context["window_plan"] = window_plan
+        if dataset_timing_start is not None:
+            timings["load_dataset"] = max(
+                0.0, float(perf_counter() - dataset_timing_start)
+            )

         if os.environ.get("INVARLOCK_DEBUG_TRACE"):
             console.print(
@@ -3120,7 +3755,13 @@ def run_command(
             )

         # Execute the real pipeline using CoreRunner
-
+        _event(
+            console,
+            "EXEC",
+            f"Executing pipeline with {len(guards)} guards...",
+            emoji="⚙️",
+            profile=profile_normalized,
+        )
         runner = CoreRunner()

         # Prepare auto configuration for tier resolution
@@ -3185,8 +3826,8 @@ def run_command(
             for key, values in model_profile.module_selectors.items()
         }

-        console.print(
-        console.print(
+        console.print(_format_kv_line("Edit", str(edit_op.name)))
+        console.print(_format_kv_line("Guards", _format_guard_chain(guards)))

         # Model load/snapshot strategy
         model = None
@@ -3200,8 +3841,30 @@ def run_command(
         # Try single-load with snapshot/restore if adapter supports it; fallback to reload per attempt
         try:
             # Load once
-
-
+            _event(
+                console,
+                "INIT",
+                f"Loading model once: {cfg.model.id}",
+                emoji="🔧",
+                profile=profile_normalized,
+            )
+            with timed_step(
+                console=console,
+                style=_style_from_console(console, profile=profile_normalized),
+                timings=timings,
+                key="load_model",
+                tag="INIT",
+                message="Load model",
+                emoji="🔧",
+            ):
+                model = _load_model_with_cfg(
+                    adapter,
+                    cfg,
+                    resolved_device,
+                    profile=profile_normalized,
+                    event_path=run_dir / "events.jsonl",
+                    warning_context={"phase": "load_model", "run_id": run_id},
+                )

             # No edit-specific bootstrap logic

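The model load above is now wrapped in a `timed_step(...)` context manager that both tags the console output and records the elapsed time in the `timings` dict under `key` (here `"load_model"`; the pipeline execution later in this diff uses `"execute"`). The helper itself is not part of this hunk; a minimal sketch that matches the call site, with everything beyond the signature assumed, might look like:

# Sketch of a timed_step-style context manager; the argument names mirror the
# call sites in this diff, the body is an assumption rather than invarlock code.
from collections.abc import Iterator
from contextlib import contextmanager
from time import perf_counter
from typing import Any


@contextmanager
def timed_step(
    *,
    console: Any,
    style: Any = None,  # the real helper presumably uses this for formatting
    timings: dict[str, float],
    key: str,
    tag: str,
    message: str,
    emoji: str = "",
) -> Iterator[None]:
    start = perf_counter()
    console.print(f"{emoji} [{tag}] {message}...")
    try:
        yield
    finally:
        # Record the wall-clock duration so the end-of-run summary can read it back.
        timings[key] = max(0.0, perf_counter() - start)

On exit the elapsed seconds land in `timings["load_model"]`, which is exactly what the timing summary block near the end of `run_command` consumes.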
@@ -3357,9 +4020,13 @@ def run_command(
                 return "reload"

             mode = _choose_snapshot_mode()
-
-
-
+            enabled = mode in {"bytes", "chunked"}
+            _event(
+                console,
+                "INIT",
+                f"Snapshot mode: {'enabled' if enabled else 'disabled'}",
+                emoji="💾",
+                profile=profile_normalized,
             )
             if mode == "chunked":
                 snapshot_tmpdir = adapter.snapshot_chunked(model)  # type: ignore[attr-defined]
@@ -3402,13 +4069,16 @@ def run_command(

         # RETRY LOOP - All report processing inside loop
         attempt = 1
-        profile_normalized = (profile or "").lower()
         measure_guard_overhead, skip_overhead = _should_measure_overhead(
             profile_normalized
         )
         if skip_overhead and profile_normalized in {"ci", "release"}:
-
-
+            _event(
+                console,
+                "WARN",
+                "Overhead check skipped via INVARLOCK_SKIP_OVERHEAD_CHECK",
+                emoji="⚠️",
+                profile=profile_normalized,
             )

         while True:
@@ -3416,12 +4086,32 @@ def run_command(
             set_seed(seed_bundle["python"])

             if retry_controller:
-                console.print(
+                console.print("\n")
+                _event(
+                    console,
+                    "EXEC",
+                    f"Attempt {attempt}/{max_attempts}",
+                    emoji="🚀",
+                    profile=profile_normalized,
+                )
                 if attempt > 1:
-
+                    _event(
+                        console,
+                        "EXEC",
+                        f"Retry attempt {attempt}/{max_attempts}",
+                        emoji="🔄",
+                        profile=profile_normalized,
+                    )
             else:
                 if attempt > 1:
-                    console.print(
+                    console.print("\n")
+                    _event(
+                        console,
+                        "EXEC",
+                        f"Attempt {attempt}",
+                        emoji="🚀",
+                        profile=profile_normalized,
+                    )

             # Adjust parameters for retry attempts
             if retry_controller and attempt > 1:
@@ -3450,6 +4140,8 @@ def run_command(
                     "checks": {},
                 }
             elif measure_guard_overhead:
+                bare_edit_config = dict(edit_config or {})
+                bare_edit_config["emit"] = False
                 guard_overhead_payload = _run_bare_control(
                     adapter=adapter,
                     edit_op=edit_op,
@@ -3458,7 +4150,7 @@ def run_command(
                     run_config=run_config,
                     calibration_data=calibration_data,
                     auto_config=auto_config,
-                    edit_config=
+                    edit_config=bare_edit_config,
                     preview_count=preview_count,
                     final_count=final_count,
                     seed_bundle=seed_bundle,
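When guard overhead is measured, the bare (guards-off) control run now copies the edit configuration and forces `emit` off, so the control pass cannot clobber artifacts produced by the guarded run. A stand-alone illustration of the copy-and-override pattern used above (the config keys here are hypothetical):

# Stand-alone illustration of the bare_edit_config pattern; the keys are
# hypothetical, only the dict-copy-and-override shape comes from this diff.
edit_config = {"bits": 8, "emit": True}  # guarded-run edit configuration

bare_edit_config = dict(edit_config or {})
bare_edit_config["emit"] = False  # the control run must not emit artifacts

assert edit_config["emit"] is True        # original config is untouched
assert bare_edit_config["emit"] is False  # copy is what _run_bare_control receives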
@@ -3472,34 +4164,53 @@ def run_command(
                 )

             # Ensure clean state for guarded run
-
-                runner=runner,
-                adapter=adapter,
-                model=model,
-                cfg=cfg,
-                edit_op=edit_op,
-                run_config=run_config,
-                guards=guards,
-                calibration_data=calibration_data,
-                auto_config=auto_config,
-                edit_config=edit_config,
-                preview_count=preview_count,
-                final_count=final_count,
-                restore_fn=restore_fn,
-                resolved_device=resolved_device,
+            with timed_step(
                 console=console,
-
-
-
+                style=_style_from_console(console, profile=profile_normalized),
+                timings=timings,
+                key="execute",
+                tag="EXEC",
+                message="Execute pipeline",
+                emoji="⚙️",
+            ):
+                core_report, model = _execute_guarded_run(
+                    runner=runner,
+                    adapter=adapter,
+                    model=model,
+                    cfg=cfg,
+                    edit_op=edit_op,
+                    run_config=run_config,
+                    guards=guards,
+                    calibration_data=calibration_data,
+                    auto_config=auto_config,
+                    edit_config=edit_config,
+                    preview_count=preview_count,
+                    final_count=final_count,
+                    restore_fn=restore_fn,
+                    resolved_device=resolved_device,
+                    profile_normalized=profile_normalized,
+                    console=console,
+                    snapshot_provenance=snapshot_provenance,
+                    skip_model_load=skip_model_load,
+                )
             except _SnapshotRestoreFailed as exc:
                 snapshot_provenance["restore_failed"] = True
                 _free_model_memory(model)
                 model = None
                 restore_fn = None
-
-
+                _event(
+                    console,
+                    "WARN",
+                    "Snapshot restore failed; switching to reload-per-attempt.",
+                    emoji="⚠️",
+                    profile=profile_normalized,
+                )
+                _event(
+                    console,
+                    "WARN",
+                    f"↳ {exc}",
+                    profile=profile_normalized,
                 )
-                console.print(f"[yellow]↳ {exc}[/yellow]")
                 if retry_controller:
                     retry_controller.record_attempt(
                         attempt,
@@ -3521,7 +4232,7 @@ def run_command(
             # Convert CoreRunner report to evaluation report
             report = create_empty_report()

-            # Persist minimal run context for
+            # Persist minimal run context for evaluation report provenance.
             try:
                 report["context"] = {
                     "profile": profile_normalized,
@@ -3631,6 +4342,8 @@ def run_command(
             report["meta"].update(meta_payload)
             if pm_acceptance_range:
                 report["meta"]["pm_acceptance_range"] = pm_acceptance_range
+            if pm_drift_band:
+                report["meta"]["pm_drift_band"] = pm_drift_band
             report["meta"]["model_profile"] = {
                 "family": model_profile.family,
                 "default_loss": model_profile.default_loss,
@@ -3714,6 +4427,14 @@ def run_command(
                 }
             )

+            if edit_label:
+                report.setdefault("edit", {})
+                report["edit"]["name"] = edit_label
+                report["edit"]["algorithm"] = edit_label
+                if isinstance(core_report.context, dict):
+                    core_report.context.setdefault("edit", {})
+                    core_report.context["edit"]["name"] = edit_label
+
             mask_artifact_path = _persist_ref_masks(core_report, run_dir)
             if mask_artifact_path:
                 report.setdefault("artifacts", {})
@@ -3721,6 +4442,22 @@ def run_command(

             # Transfer metrics (PM-only: do not write ppl_* fields)
             if hasattr(core_report, "metrics") and core_report.metrics:
+                if isinstance(core_report.metrics, dict):
+                    core_timings = core_report.metrics.get("timings")
+                    if isinstance(core_timings, dict):
+                        for key in (
+                            "prepare",
+                            "prepare_guards",
+                            "edit",
+                            "guards",
+                            "eval",
+                            "finalize",
+                        ):
+                            if key in core_timings:
+                                try:
+                                    timings[key] = float(core_timings[key])
+                                except Exception:
+                                    timings[key] = core_timings[key]
                 metrics_payload = {
                     "latency_ms_per_tok": core_report.metrics.get(
                         "latency_ms_per_tok", 0.0
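Phase timings reported by CoreRunner (prepare, edit, guards, eval, finalize) are folded into the CLI-level `timings` dict so the end-of-run summary can show a per-phase breakdown. The copy is deliberately defensive: values are coerced to float when possible and kept verbatim otherwise. A small self-contained example of the same copy, using a stand-in payload rather than real CoreRunner output:

# Self-contained example of the defensive timing copy above; the payload is a
# stand-in, not real CoreRunner output.
core_metrics = {"timings": {"prepare": 1.25, "edit": "0.40", "guards": None}}

timings: dict[str, object] = {}
core_timings = core_metrics.get("timings")
if isinstance(core_timings, dict):
    for key in ("prepare", "prepare_guards", "edit", "guards", "eval", "finalize"):
        if key in core_timings:
            try:
                timings[key] = float(core_timings[key])
            except (TypeError, ValueError):
                timings[key] = core_timings[key]  # keep the raw value

print(timings)  # {'prepare': 1.25, 'edit': 0.4, 'guards': None}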
@@ -3772,6 +4509,11 @@ def run_command(
                     "masked_tokens_total",
                     "masked_tokens_preview",
                     "masked_tokens_final",
+                    "timings",
+                    "guard_timings",
+                    "memory_snapshots",
+                    "gpu_memory_mb_peak",
+                    "gpu_memory_reserved_mb_peak",
                     "reduction",
                 ]
                 for key in optional_keys:
@@ -3935,8 +4677,12 @@ def run_command(
                     },
                 }
             elif had_baseline and (profile or "").lower() in {"ci", "release"}:
-
-
+                _event(
+                    console,
+                    "FAIL",
+                    "[INVARLOCK:E001] PAIRING-SCHEDULE-MISMATCH: baseline pairing requested but evaluation windows were not produced. Check capacity/pairing config.",
+                    emoji="❌",
+                    profile=profile_normalized,
                 )
                 raise typer.Exit(3)
             else:
@@ -4147,12 +4893,20 @@ def run_command(
                     if ok:
                         report["artifacts"]["checkpoint_path"] = str(export_dir)
                     else:
-
-
+                        _event(
+                            console,
+                            "WARN",
+                            "Model export requested but adapter did not save a HF directory.",
+                            emoji="⚠️",
+                            profile=profile_normalized,
                         )
                 except Exception:
-
-
+                    _event(
+                        console,
+                        "WARN",
+                        "Model export requested but failed due to an unexpected error.",
+                        emoji="⚠️",
+                        profile=profile_normalized,
                     )

             # Set flags
@@ -4373,7 +5127,10 @@ def run_command(
             try:
                 metric_kind_resolved, _provider_kind, metric_opts = (
                     _resolve_metric_and_provider(
-                        cfg,
+                        cfg,
+                        model_profile,
+                        resolved_loss_type=resolved_loss_type,
+                        metric_kind_override=metric_kind,
                     )
                 )
                 if metric_kind_resolved:
@@ -4452,6 +5209,13 @@ def run_command(
             except Exception:
                 pass

+            telemetry_path: Path | None = None
+            if telemetry:
+                telemetry_path = run_dir / "telemetry.json"
+                report.setdefault("artifacts", {})["telemetry_path"] = str(
+                    telemetry_path
+                )
+
             saved_files = _postprocess_and_summarize(
                 report=report,
                 run_dir=run_dir,
@@ -4468,6 +5232,31 @@ def run_command(
             except Exception:
                 pass

+            if telemetry and telemetry_path is not None:
+                try:
+                    from invarlock.reporting.telemetry import save_telemetry_report
+
+                    saved_path = save_telemetry_report(
+                        report, run_dir, filename=telemetry_path.name
+                    )
+                    if isinstance(saved_files, dict):
+                        saved_files["telemetry"] = str(saved_path)
+                    _event(
+                        console,
+                        "DATA",
+                        f"Telemetry: {saved_path}",
+                        emoji="📈",
+                        profile=profile_normalized,
+                    )
+                except Exception as exc:  # pragma: no cover - best-effort
+                    _event(
+                        console,
+                        "WARN",
+                        f"Telemetry export failed: {exc}",
+                        emoji="⚠️",
+                        profile=profile_normalized,
+                    )
+
             # Metrics display
             pm_obj = None
             try:
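With the new telemetry flow, the run records a `telemetry.json` artifact via `invarlock.reporting.telemetry.save_telemetry_report` (a module added in 0.3.8 per the file list) and registers its path under `report["artifacts"]["telemetry_path"]`. A hedged usage sketch, assuming only the call shape visible in this hunk and a dict-shaped run report:

# Usage sketch based on the call above; the accepted report shape and the
# telemetry schema are defined by invarlock.reporting.telemetry, not shown here.
from pathlib import Path

from invarlock.reporting.telemetry import save_telemetry_report

run_dir = Path("runs/example-run")  # hypothetical run directory
run_dir.mkdir(parents=True, exist_ok=True)
report = {"metrics": {"latency_ms_per_tok": 0.0}}  # stand-in run report

saved_path = save_telemetry_report(report, run_dir, filename="telemetry.json")
print(f"Telemetry written to {saved_path}")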
@@ -4482,15 +5271,23 @@ def run_command(
                 if isinstance(pm_prev, (int | float)) and isinstance(
                     pm_fin, (int | float)
                 ):
-
-
+                    _event(
+                        console,
+                        "METRIC",
+                        f"Primary Metric [{pm_kind}] — preview: {pm_prev:.3f}, final: {pm_fin:.3f}",
+                        emoji="📌",
+                        profile=profile_normalized,
                     )
                 ratio_vs_base = pm_obj.get("ratio_vs_baseline")
                 if isinstance(ratio_vs_base, (int | float)) and math.isfinite(
                     ratio_vs_base
                 ):
-
-
+                    _event(
+                        console,
+                        "METRIC",
+                        f"Ratio vs baseline [{pm_kind}]: {ratio_vs_base:.3f}",
+                        emoji="🔗",
+                        profile=profile_normalized,
                     )
             except Exception:
                 pass
@@ -4502,8 +5299,12 @@ def run_command(
                     console, guard_overhead_info
                 )
                 if not guard_overhead_info.get("passed", True):
-
-
+                    _event(
+                        console,
+                        "FAIL",
+                        "Guard overhead gate FAILED: Guards add more than the permitted budget",
+                        emoji="⚠️",
+                        profile=profile_normalized,
                     )
                     # Only fail hard when the overhead check was actually evaluated
                     # (e.g., for causal LMs with available bare/guarded PM). For
@@ -4528,11 +5329,11 @@ def run_command(
                         f"(>{threshold_fraction * 100:.1f}% increase)"
                     )

-            # Drift gate status is no longer surfaced in console; rely on
+            # Drift gate status is no longer surfaced in console; rely on evaluation report gates

-            #
+            # Evaluation report validation for --until-pass mode
             if retry_controller and baseline:
-                from invarlock.reporting.
+                from invarlock.reporting.report_builder import make_report

                 try:
                     baseline_report = baseline_report_data
@@ -4544,15 +5345,21 @@ def run_command(
                     if baseline_report is None:
                         raise FileNotFoundError("Baseline report unavailable")

-
-
+                    _event(
+                        console,
+                        "EXEC",
+                        "Generating evaluation report...",
+                        emoji="📜",
+                        profile=profile_normalized,
+                    )
+                    evaluation_report = make_report(report, baseline_report)

-                    validation =
-
+                    validation = evaluation_report.get("validation", {})
+                    report_passed = all(validation.values())

                     failed_gates = [k for k, v in validation.items() if not v]
                     result_summary = {
-                        "passed":
+                        "passed": report_passed,
                         "failures": failed_gates,
                         "validation": validation,
                     }
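In --until-pass mode the run builds an evaluation report with `make_report(report, baseline_report)` (imported from `invarlock.reporting.report_builder`, the renamed certificate module) and counts the attempt as passing only when every entry of its `validation` mapping is truthy; the following hunks branch on that boolean and feed it to `retry_controller.should_retry(...)`. The gate check itself reduces to the snippet below, shown with a stand-in report and hypothetical gate names:

# Gate-check logic as used above, demonstrated on a stand-in evaluation report;
# the gate names are hypothetical, the real ones come from make_report(...).
evaluation_report = {
    "validation": {
        "primary_metric_acceptance": True,
        "guard_overhead": True,
        "invariants": False,  # one failing gate
    }
}

validation = evaluation_report.get("validation", {})
report_passed = all(validation.values())
failed_gates = [name for name, ok in validation.items() if not ok]

result_summary = {
    "passed": report_passed,
    "failures": failed_gates,
    "validation": validation,
}
print(report_passed, failed_gates)  # False ['invariants']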
@@ -4560,12 +5367,22 @@ def run_command(
                         attempt, result_summary, edit_config
                     )

-                    if
-
+                    if report_passed:
+                        _event(
+                            console,
+                            "PASS",
+                            "Evaluation report PASSED all gates!",
+                            emoji="✅",
+                            profile=profile_normalized,
+                        )
                         break
                     else:
-
-
+                        _event(
+                            console,
+                            "FAIL",
+                            f"Evaluation report FAILED gates: {', '.join(failed_gates)}",
+                            emoji="⚠️",
+                            profile=profile_normalized,
                         )

                         # Auto-tune mask-only heads (binary search on keep count)
@@ -4610,31 +5427,43 @@ def run_command(
                                     }
                                 )
                                 head_section["global_k"] = next_keep
-
-
+                                _event(
+                                    console,
+                                    "INIT",
+                                    f"Auto-tune adjust: global_k → {next_keep} (bounds {keep_low}-{keep_high})",
+                                    emoji="🔧",
+                                    profile=profile_normalized,
                                 )
                         except Exception:
                             pass

-                        if retry_controller.should_retry(
+                        if retry_controller.should_retry(report_passed):
                             attempt += 1
                             continue
                         else:
-
-
+                            _event(
+                                console,
+                                "FAIL",
+                                f"Exhausted retry budget after {attempt} attempts",
+                                emoji="❌",
+                                profile=profile_normalized,
                             )
                             break

-                except Exception as
-
-
+                except Exception as report_error:
+                    _event(
+                        console,
+                        "WARN",
+                        f"Evaluation report validation failed: {report_error}",
+                        emoji="⚠️",
+                        profile=profile_normalized,
                     )
                     if retry_controller:
                         retry_controller.record_attempt(
                             attempt,
                             {
                                 "passed": False,
-                                "failures": ["
+                                "failures": ["report_error"],
                                 "validation": {},
                             },
                             edit_config,
@@ -4656,11 +5485,82 @@ def run_command(
                 # (moved) Cleanup printing occurs after loop to guarantee execution
                 pass

+        if output_style.timing:
+            total_duration = (
+                max(0.0, float(perf_counter() - total_start))
+                if total_start is not None
+                else None
+            )
+            timings_for_summary: dict[str, float] = {}
+            for key, value in timings.items():
+                if isinstance(value, (int | float)):
+                    timings_for_summary[key] = float(value)
+            if total_duration is not None:
+                timings_for_summary["total"] = total_duration
+
+            has_breakdown = any(
+                key in timings_for_summary
+                for key in (
+                    "prepare",
+                    "prepare_guards",
+                    "edit",
+                    "guards",
+                    "eval",
+                    "finalize",
+                )
+            )
+
+            order: list[tuple[str, str]] = []
+
+            def _add(label: str, key: str) -> None:
+                if key in timings_for_summary:
+                    order.append((label, key))
+
+            _add("Load model", "load_model")
+            _add("Load data", "load_dataset")
+            if has_breakdown:
+                _add("Prepare", "prepare")
+                _add("Prep guards", "prepare_guards")
+                _add("Edit", "edit")
+                _add("Guards", "guards")
+                _add("Eval", "eval")
+                _add("Finalize", "finalize")
+            else:
+                _add("Execute", "execute")
+            _add("Total", "total")
+
+            extra_lines: list[str] = []
+            metrics_section = (
+                report.get("metrics", {}) if isinstance(report, dict) else {}
+            )
+            if isinstance(metrics_section, dict):
+                mem_peak = metrics_section.get("memory_mb_peak")
+                gpu_peak = metrics_section.get("gpu_memory_mb_peak")
+                if isinstance(mem_peak, (int | float)):
+                    extra_lines.append(f" Peak Memory : {float(mem_peak):.2f} MB")
+                if isinstance(gpu_peak, (int | float)):
+                    extra_lines.append(f" Peak GPU Mem: {float(gpu_peak):.2f} MB")
+
+            if timings_for_summary and order:
+                print_timing_summary(
+                    console,
+                    timings_for_summary,
+                    style=output_style,
+                    order=order,
+                    extra_lines=extra_lines,
+                )
+
        # Normal path falls through; cleanup handled below in finally
        return report_path_out

    except FileNotFoundError as e:
-
+        _event(
+            console,
+            "FAIL",
+            f"Configuration file not found: {e}",
+            emoji="❌",
+            profile=profile_normalized,
+        )
        raise typer.Exit(1) from e
    except InvarlockError as ce:
        # InvarlockError → code 3 only in CI/Release; dev → 1
@@ -4676,12 +5576,22 @@ def run_command(
            traceback.print_exc()
        # Emit a clearer message for schema failures (exit 2)
        if isinstance(e, ValueError) and "Invalid RunReport" in str(e):
-
-
+            _event(
+                console,
+                "FAIL",
+                "Schema invalid: run report structure failed validation",
+                emoji="❌",
+                profile=profile_normalized,
            )
            code = 2
        else:
-
+            _event(
+                console,
+                "FAIL",
+                f"Pipeline execution failed: {e}",
+                emoji="❌",
+                profile=profile_normalized,
+            )
            code = _resolve_exit_code(e, profile=profile)
        raise typer.Exit(code) from e
    finally:
@@ -4695,9 +5605,21 @@ def run_command(
                except Exception:
                    pass
                finally:
-
+                    _event(
+                        console,
+                        "INFO",
+                        "Cleanup: removed",
+                        emoji="🧹",
+                        profile=profile_normalized,
+                    )
            else:
-
+                _event(
+                    console,
+                    "INFO",
+                    "Cleanup: skipped",
+                    emoji="🧹",
+                    profile=profile_normalized,
+                )
        except Exception:
            # Best-effort cleanup printing; never raise from finally
            pass
@@ -4844,11 +5766,9 @@ def _print_guard_overhead_summary(
    """Print a concise guard-overhead console summary. Returns threshold fraction used."""
    evaluated = bool(guard_overhead_info.get("evaluated", True))
    if not evaluated:
-        console
+        _event(console, "METRIC", "Guard Overhead: not evaluated", emoji="🛡️")
        return GUARD_OVERHEAD_THRESHOLD
-    overhead_status = (
-        "✅ PASS" if guard_overhead_info.get("passed", True) else "❌ FAIL"
-    )
+    overhead_status = "PASS" if guard_overhead_info.get("passed", True) else "FAIL"
    overhead_percent = guard_overhead_info.get("overhead_percent")
    if isinstance(overhead_percent, (int | float)) and math.isfinite(
        float(overhead_percent)
@@ -4867,8 +5787,11 @@ def _print_guard_overhead_summary(
    except (TypeError, ValueError):
        threshold_fraction = GUARD_OVERHEAD_THRESHOLD
    threshold_display = f"≤ +{threshold_fraction * 100:.1f}%"
-
-
+    _event(
+        console,
+        "METRIC",
+        f"Guard Overhead: {overhead_status} {overhead_display} ({threshold_display})",
+        emoji="🛡️",
    )
    return threshold_fraction

@@ -4878,8 +5801,12 @@ def _print_retry_summary(console: Console, retry_controller: Any | None) -> None
    try:
        if retry_controller and getattr(retry_controller, "attempt_history", None):
            summary = retry_controller.get_attempt_summary()
-            console.print(
-
+            console.print("\n")
+            _event(
+                console,
+                "METRIC",
+                f"Retry Summary: {summary['total_attempts']} attempts in {summary['elapsed_time']:.1f}s",
+                emoji="📊",
            )
    except Exception:
        # Never break the run for summary printing
@@ -4902,10 +5829,15 @@ def _init_retry_controller(
        retry_controller = RetryController(
            max_attempts=max_attempts, timeout=timeout, verbose=True
        )
-
+        _event(
+            console,
+            "INIT",
+            f"Retry mode enabled: max {max_attempts} attempts",
+            emoji="🔄",
+        )
        if baseline:
-            console
+            _event(console, "DATA", f"Using baseline: {baseline}", emoji="📋")
    else:
        if baseline:
-            console
+            _event(console, "DATA", f"Using baseline: {baseline}", emoji="📋")
    return retry_controller