invarlock 0.3.5-py3-none-any.whl → 0.3.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +2 -2
- invarlock/_data/runtime/tiers.yaml +57 -30
- invarlock/adapters/__init__.py +11 -15
- invarlock/adapters/auto.py +35 -40
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_mixin.py +25 -4
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/calibration/spectral_null.py +15 -10
- invarlock/calibration/variance_ve.py +0 -2
- invarlock/cli/adapter_auto.py +31 -21
- invarlock/cli/app.py +73 -2
- invarlock/cli/commands/calibrate.py +6 -2
- invarlock/cli/commands/certify.py +651 -91
- invarlock/cli/commands/doctor.py +11 -11
- invarlock/cli/commands/explain_gates.py +57 -8
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +233 -69
- invarlock/cli/commands/run.py +1066 -244
- invarlock/cli/commands/verify.py +154 -15
- invarlock/cli/config.py +22 -6
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +1 -1
- invarlock/core/api.py +45 -5
- invarlock/core/auto_tuning.py +65 -20
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/contracts.py +7 -1
- invarlock/core/registry.py +11 -13
- invarlock/core/runner.py +425 -75
- invarlock/edits/quant_rtn.py +65 -37
- invarlock/eval/bench.py +3 -16
- invarlock/eval/data.py +82 -51
- invarlock/eval/metrics.py +63 -2
- invarlock/eval/primary_metric.py +23 -0
- invarlock/eval/tail_stats.py +230 -0
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/_estimators.py +154 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/policies.py +16 -6
- invarlock/guards/rmt.py +627 -546
- invarlock/guards/spectral.py +348 -110
- invarlock/guards/tier_config.py +32 -30
- invarlock/guards/variance.py +7 -31
- invarlock/guards_ref/rmt_ref.py +23 -23
- invarlock/model_profile.py +90 -42
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/reporting/certificate.py +384 -55
- invarlock/reporting/certificate_schema.py +3 -2
- invarlock/reporting/dataset_hashing.py +15 -2
- invarlock/reporting/guards_analysis.py +350 -277
- invarlock/reporting/html.py +55 -5
- invarlock/reporting/normalizer.py +13 -0
- invarlock/reporting/policy_utils.py +38 -36
- invarlock/reporting/primary_metric_utils.py +71 -17
- invarlock/reporting/render.py +852 -431
- invarlock/reporting/report.py +40 -4
- invarlock/reporting/report_types.py +11 -3
- invarlock/reporting/telemetry.py +86 -0
- invarlock/reporting/validate.py +1 -18
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/METADATA +27 -13
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/RECORD +72 -65
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
invarlock/cli/commands/run.py
CHANGED
@@ -17,8 +17,10 @@ import random
 import shutil
 import sys as _sys
 import types as _types
+import warnings
 from array import array
-from collections.abc import Iterable, Sequence
+from collections.abc import Callable, Iterable, Iterator, Sequence
+from contextlib import contextmanager
 from datetime import datetime
 from pathlib import Path
 from types import SimpleNamespace
@@ -30,6 +32,16 @@ import psutil
 import typer
 from rich.console import Console
 
+from invarlock.cli.output import (
+    OutputStyle,
+    make_console,
+    perf_counter,
+    print_event,
+    print_timing_summary,
+    resolve_output_style,
+    timed_step,
+)
+
 try:
     import torch
 except ImportError:
@@ -63,7 +75,42 @@ from ..config import (
 )
 from ..overhead_utils import _extract_pm_snapshot_for_overhead
 
-console = Console()
+console = make_console()
+
+
+def _style_from_console(console: Console, profile: str | None = None) -> OutputStyle:
+    style = getattr(console, "_invarlock_output_style", None)
+    if isinstance(style, OutputStyle):
+        return style
+    return resolve_output_style(
+        style=None,
+        profile=profile,
+        progress=False,
+        timing=False,
+        no_color=False,
+    )
+
+
+def _event(
+    console: Console,
+    tag: str,
+    message: str,
+    *,
+    emoji: str | None = None,
+    console_style: str | None = None,
+    profile: str | None = None,
+) -> None:
+    style = _style_from_console(console, profile=profile)
+    print_event(
+        console,
+        tag,
+        message,
+        style=style,
+        emoji=emoji,
+        console_style=console_style,
+    )
+
+
 LIGHT_IMPORT = os.getenv("INVARLOCK_LIGHT_IMPORT", "").strip().lower() in {
     "1",
     "true",
@@ -76,6 +123,73 @@ RELEASE_MIN_WINDOWS_PER_ARM = 200
 RELEASE_CALIBRATION_MIN = 16
 RELEASE_CALIBRATION_MAX = 24
 GUARD_OVERHEAD_THRESHOLD = 0.01
+KV_LABEL_WIDTH = 10
+
+_NOISY_WARNING_PATTERNS = (
+    r".*`torch_dtype` is deprecated.*",
+    r".*loss_type=None.*unrecognized.*",
+)
+
+
+def _resolve_warning_suppression(profile: str | None) -> tuple[bool, bool]:
+    suppress_all = os.getenv("INVARLOCK_SUPPRESS_WARNINGS", "").strip().lower() in {
+        "1",
+        "true",
+        "yes",
+        "on",
+    }
+    profile_norm = (profile or "").strip().lower()
+    enabled = bool(suppress_all) or profile_norm in {"ci", "ci_cpu", "release", "dev"}
+    return enabled, suppress_all
+
+
+def _apply_warning_filters(profile: str | None) -> bool:
+    enabled, suppress_all = _resolve_warning_suppression(profile)
+    if not enabled:
+        return False
+    if suppress_all:
+        warnings.simplefilter("ignore")
+    else:
+        for pattern in _NOISY_WARNING_PATTERNS:
+            warnings.filterwarnings("ignore", message=pattern)
+    return True
+
+
+@contextmanager
+def _suppress_noisy_warnings(profile: str | None) -> Iterator[None]:
+    enabled, _suppress_all = _resolve_warning_suppression(profile)
+    if not enabled:
+        yield
+        return
+    with warnings.catch_warnings():
+        _apply_warning_filters(profile)
+        yield
+
+
+def _format_kv_line(label: str, value: str, *, width: int = KV_LABEL_WIDTH) -> str:
+    return f" {label:<{width}}: {value}"
+
+
+def _device_resolution_note(target_device: str, resolved_device: str) -> str:
+    target_norm = str(target_device or "").strip().lower()
+    resolved_norm = str(resolved_device or "").strip().lower()
+    if not target_norm or target_norm == "auto":
+        return "auto-resolved"
+    if target_norm == resolved_norm:
+        return "requested"
+    return f"resolved from {target_device}"
+
+
+def _format_guard_chain(guards: list[Any]) -> str:
+    names = [str(getattr(guard, "name", "unknown")) for guard in guards]
+    seen: set[str] = set()
+    deduped: list[str] = []
+    for name in names:
+        if name in seen:
+            continue
+        seen.add(name)
+        deduped.append(name)
+    return " → ".join(deduped)
 
 
 # Common dataset split aliases we probe in order when not explicitly set
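The hunk above introduces opt-in suppression of noisy third-party warnings (`_resolve_warning_suppression`, `_apply_warning_filters`, `_suppress_noisy_warnings`). Below is a standalone sketch of the same scoped-filter pattern using only the standard library; names here are illustrative, not the packaged helpers.

import warnings
from collections.abc import Iterator
from contextlib import contextmanager

NOISY_PATTERNS = (r".*`torch_dtype` is deprecated.*",)

@contextmanager
def suppress_noisy(enabled: bool) -> Iterator[None]:
    # When disabled, warnings pass through unchanged.
    if not enabled:
        yield
        return
    # catch_warnings() snapshots the filter list and restores it on exit,
    # so the message-pattern filters stay scoped to this block.
    with warnings.catch_warnings():
        for pattern in NOISY_PATTERNS:
            warnings.filterwarnings("ignore", message=pattern)
        yield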
@@ -108,6 +222,64 @@ def _coerce_mapping(obj: object) -> dict[str, Any]:
     return {}
 
 
+def _prune_none_values(value: Any) -> Any:
+    """Recursively drop keys/items whose value is None.
+
+    Used when serializing dataclass-style config sections that define many optional
+    fields defaulting to None; those should behave as "unset" rather than explicit
+    policy overrides.
+    """
+
+    if isinstance(value, dict):
+        return {
+            key: _prune_none_values(val)
+            for key, val in value.items()
+            if val is not None
+        }
+    if isinstance(value, list):
+        return [_prune_none_values(item) for item in value if item is not None]
+    if isinstance(value, tuple):
+        return tuple(_prune_none_values(item) for item in value if item is not None)
+    return value
+
+
+def _to_serialisable_dict(section: object) -> dict[str, Any]:
+    """Coerce config fragments to plain dicts.
+
+    Handles InvarLockConfig sections (which wrap dicts in a private `_Obj` with
+    `_data`) so downstream components (core.runner) see canonical mappings,
+    e.g. `eval.bootstrap.replicates`.
+    """
+
+    # Prefer native dump methods
+    if hasattr(section, "model_dump"):
+        return section.model_dump()  # type: ignore[return-value]
+    if hasattr(section, "dict"):
+        try:
+            return section.dict()  # type: ignore[return-value]
+        except Exception:
+            pass
+    # Unwrap CLI _Obj wrapper used by InvarLockConfig for attribute access
+    try:
+        raw = getattr(section, "_data", None)
+        if isinstance(raw, dict):
+            return raw
+    except Exception:
+        pass
+    # Already a mapping
+    if isinstance(section, dict):
+        return section
+    # Best-effort attribute dump (prune None so "unset" does not override tier defaults)
+    try:
+        data = vars(section)
+        # Common case: {'_data': {...}}
+        if isinstance(data, dict) and isinstance(data.get("_data"), dict):
+            return data["_data"]
+        return _prune_none_values(data)  # type: ignore[return-value]
+    except TypeError:
+        return {}
+
+
 def _resolve_pm_acceptance_range(
     cfg: InvarLockConfig | dict[str, Any] | None,
 ) -> dict[str, float]:
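For context on the `_prune_none_values` helper added above: it treats None-valued optional fields as "unset" so they do not override tier defaults downstream. A minimal standalone sketch of that behaviour (illustrative name, not the packaged function):

from typing import Any

def prune_none(value: Any) -> Any:
    # Drop None entries recursively; leave every other value untouched.
    if isinstance(value, dict):
        return {k: prune_none(v) for k, v in value.items() if v is not None}
    if isinstance(value, (list, tuple)):
        pruned = [prune_none(v) for v in value if v is not None]
        return type(value)(pruned)
    return value

# {"min": None, "max": 1.05, "tags": [None, "ci"]} -> {"max": 1.05, "tags": ["ci"]}
print(prune_none({"min": None, "max": 1.05, "tags": [None, "ci"]}))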
@@ -183,6 +355,89 @@ def _resolve_pm_acceptance_range(
     return {"min": float(min_val), "max": float(max_val)}
 
 
+def _resolve_pm_drift_band(
+    cfg: InvarLockConfig | dict[str, Any] | None,
+) -> dict[str, float]:
+    """Resolve preview→final drift band from config/env with safe defaults.
+
+    The drift band governs the Preview Final Drift Acceptable gate. By default,
+    certificates enforce 0.95–1.05 unless an explicit band is provided.
+    """
+
+    base_min = 0.95
+    base_max = 1.05
+
+    cfg_min = None
+    cfg_max = None
+    try:
+        cfg_map = _coerce_mapping(cfg) if cfg is not None else {}
+        pm_section = cfg_map.get("primary_metric") if isinstance(cfg_map, dict) else {}
+        pm_map = _coerce_mapping(pm_section)
+        drift_band = pm_map.get("drift_band") if isinstance(pm_map, dict) else None
+        if isinstance(drift_band, dict):
+            if drift_band.get("min") is not None:
+                try:
+                    cfg_min = float(drift_band["min"])
+                except (TypeError, ValueError):
+                    cfg_min = None
+            if drift_band.get("max") is not None:
+                try:
+                    cfg_max = float(drift_band["max"])
+                except (TypeError, ValueError):
+                    cfg_max = None
+        elif isinstance(drift_band, list | tuple) and len(drift_band) == 2:
+            try:
+                cfg_min = float(drift_band[0])
+                cfg_max = float(drift_band[1])
+            except (TypeError, ValueError):
+                cfg_min = None
+                cfg_max = None
+    except Exception:
+        cfg_min = None
+        cfg_max = None
+
+    def _parse_env(name: str) -> float | None:
+        try:
+            raw = os.environ.get(name, "")
+            if raw is None or str(raw).strip() == "":
+                return None
+            return float(raw)
+        except Exception:
+            return None
+
+    env_min = _parse_env("INVARLOCK_PM_DRIFT_MIN")
+    env_max = _parse_env("INVARLOCK_PM_DRIFT_MAX")
+
+    has_explicit = any(v is not None for v in (cfg_min, cfg_max, env_min, env_max))
+    if not has_explicit:
+        return {}
+
+    min_val = (
+        env_min if env_min is not None else cfg_min if cfg_min is not None else base_min
+    )
+    max_val = (
+        env_max if env_max is not None else cfg_max if cfg_max is not None else base_max
+    )
+
+    try:
+        if min_val is not None and min_val <= 0:
+            min_val = base_min
+    except Exception:
+        min_val = base_min
+    try:
+        if max_val is not None and max_val <= 0:
+            max_val = base_max
+    except Exception:
+        max_val = base_max
+    try:
+        if min_val is not None and max_val is not None and min_val >= max_val:
+            min_val, max_val = base_min, base_max
+    except Exception:
+        min_val, max_val = base_min, base_max
+
+    return {"min": float(min_val), "max": float(max_val)}
+
+
 def _free_model_memory(model: object | None) -> None:
     """Best-effort cleanup to release GPU memory for a model object."""
     if model is None:
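The new `_resolve_pm_drift_band` above resolves the preview→final drift band with per-value precedence: environment variable, then config, then the 0.95–1.05 default, rejecting non-positive or inverted bands. A condensed standalone sketch of that precedence, assuming only the env var names shown in the hunk:

import os

def resolve_band(cfg_min: float | None, cfg_max: float | None) -> dict[str, float]:
    def env(name: str) -> float | None:
        raw = os.environ.get(name, "").strip()
        try:
            return float(raw) if raw else None
        except ValueError:
            return None

    env_min, env_max = env("INVARLOCK_PM_DRIFT_MIN"), env("INVARLOCK_PM_DRIFT_MAX")
    if all(v is None for v in (cfg_min, cfg_max, env_min, env_max)):
        return {}  # nothing explicit -> caller keeps its own defaults
    lo = env_min if env_min is not None else cfg_min if cfg_min is not None else 0.95
    hi = env_max if env_max is not None else cfg_max if cfg_max is not None else 1.05
    if lo <= 0 or hi <= 0 or lo >= hi:
        lo, hi = 0.95, 1.05  # fall back on degenerate bands
    return {"min": lo, "max": hi}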
@@ -296,7 +551,7 @@ def _resolve_exit_code(exc: Exception, *, profile: str | None) -> int:
     return 1
 
 
-## NOTE: Deprecated
+## NOTE: Deprecated helper `_check_pairability_or_abort` was removed.
 ## Provider parity and pairing guarantees are enforced via guard digests and
 ## invariant checks during run execution.
 
@@ -696,38 +951,60 @@ def _prepare_config_for_run(
         resolve_edit_kind as _resolve_edit_kind,
     )
 
-
+    _event(
+        console,
+        "INIT",
+        f"Loading configuration: {config_path}",
+        emoji="📋",
+        profile=profile,
+    )
     cfg = _load_config(config_path)
 
     # Apply profile if specified (dev is a no-op)
-    if profile and str(profile).lower() in {"
-
+    if profile and str(profile).lower() not in {"dev"}:
+        _event(
+            console, "INIT", f"Applying profile: {profile}", emoji="🎯", profile=profile
+        )
     try:
         cfg = _apply_profile(cfg, profile)
     except Exception as exc:
-        console
+        _event(console, "FAIL", str(exc), emoji="❌", profile=profile)
         raise typer.Exit(1) from exc
 
     # Apply edit override
     if edit:
         try:
             edit_name = _resolve_edit_kind(edit)
-
+            _event(
+                console,
+                "EXEC",
+                f"Edit override: {edit} → {edit_name}",
+                emoji="✂️",
+                profile=profile,
+            )
             cfg = _apply_edit_override(cfg, edit)
         except ValueError as e:
-            console
+            _event(console, "FAIL", str(e), emoji="❌", profile=profile)
            raise typer.Exit(1) from e
 
     # Apply CLI overrides for auto configuration
     if tier or probes is not None:
         if tier and tier not in ["conservative", "balanced", "aggressive", "none"]:
-
-
+            _event(
+                console,
+                "FAIL",
+                f"Invalid tier '{tier}'. Valid options: conservative, balanced, aggressive, none",
+                emoji="❌",
+                profile=profile,
             )
             raise typer.Exit(1)
         if probes is not None and (probes < 0 or probes > 10):
-
-
+            _event(
+                console,
+                "FAIL",
+                f"Invalid probes '{probes}'. Must be between 0 and 10",
+                emoji="❌",
+                profile=profile,
            )
            raise typer.Exit(1)
 
@@ -738,10 +1015,22 @@ def _prepare_config_for_run(
         cfg_dict["auto"] = auto_section
         if tier:
             auto_section["tier"] = tier
-
+            _event(
+                console,
+                "INIT",
+                f"Auto tier override: {tier}",
+                emoji="🎛️",
+                profile=profile,
+            )
         if probes is not None:
             auto_section["probes"] = probes
-
+            _event(
+                console,
+                "INIT",
+                f"Auto probes override: {probes}",
+                emoji="🔬",
+                profile=profile,
+            )
         cfg = InvarLockConfig(cfg_dict)
 
     # Resolve adapter:auto to a concrete built-in adapter if requested
@@ -774,7 +1063,7 @@ def _maybe_plan_release_windows(
 
 
 def _print_pipeline_start(console: Console) -> None:
-    console
+    _event(console, "INIT", "Starting InvarLock pipeline...", emoji="🚀")
 
 
 def _emit_run_artifacts(
@@ -783,7 +1072,7 @@ def _emit_run_artifacts(
     """Save run report and return emitted artifact paths."""
     from invarlock.reporting.report import save_report as _save_report
 
-    console
+    _event(console, "DATA", "Saving run report...", emoji="💾")
     return _save_report(
         report, out_dir, formats=["json"], filename_prefix=filename_prefix
     )
@@ -806,25 +1095,21 @@ def _resolve_device_and_output(
         cfg_device = None
     target_device = device or cfg_device or "auto"
     resolved_device = _resolve_device(target_device)
-
-
-    )
+    resolution_note = _device_resolution_note(target_device, resolved_device)
+    console.print(_format_kv_line("Device", f"{resolved_device} ({resolution_note})"))
     is_valid, error_msg = _validate(resolved_device)
     if not is_valid:
-        console
+        _event(console, "FAIL", f"Device validation failed: {error_msg}", emoji="❌")
         raise typer.Exit(1)
 
-    # Determine output directory
+    # Determine output directory
     if out:
         output_dir = Path(out)
     else:
         try:
             output_dir = Path(cfg.output.dir)
         except Exception:
-
-                output_dir = Path(cfg.out.dir)  # type: ignore[attr-defined]
-            except Exception:
-                output_dir = Path("runs")
+            output_dir = Path("runs")
     output_dir.mkdir(parents=True, exist_ok=True)
     return str(resolved_device), output_dir
 
@@ -837,6 +1122,7 @@ def _resolve_provider_and_split(
     provider_kwargs: dict[str, Any] | None = None,
     console: Console,
     resolved_device: str | None = None,
+    emit: Callable[[str, str, str | None], None] | None = None,
 ) -> tuple[Any, str, bool]:
     """Resolve dataset provider and split, returning (provider, split, used_fallback)."""
     provider_name = None
@@ -863,7 +1149,10 @@
     # Pass device hint only to providers that understand it (currently WikiText-2)
     if resolved_device and provider_name == "wikitext2":
         provider_kwargs.setdefault("device_hint", resolved_device)
-
+    if emit is not None and provider_name == "wikitext2":
+        data_provider = get_provider_fn(provider_name, emit=emit, **provider_kwargs)
+    else:
+        data_provider = get_provider_fn(provider_name, **provider_kwargs)
 
     requested_split = None
     try:
@@ -917,7 +1206,13 @@ def _extract_model_load_kwargs(cfg: InvarLockConfig) -> dict[str, Any]:
     return extra
 
 
-def _load_model_with_cfg(
+def _load_model_with_cfg(
+    adapter: Any,
+    cfg: InvarLockConfig,
+    device: str,
+    *,
+    profile: str | None = None,
+) -> Any:
     """Load a model with config-provided kwargs, filtering for strict adapters."""
     try:
         model_id = cfg.model.id
@@ -930,20 +1225,21 @@ def _load_model_with_cfg(adapter: Any, cfg: InvarLockConfig, device: str) -> Any
         raise ValueError("Missing model.id in config")
 
     extra = _extract_model_load_kwargs(cfg)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    with _suppress_noisy_warnings(profile):
+        try:
+            sig = inspect.signature(adapter.load_model)
+            accepts_var_kw = any(
+                p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()
+            )
+            if accepts_var_kw:
+                return adapter.load_model(model_id, device=device, **extra)
+            allowed = {k: v for k, v in extra.items() if k in sig.parameters}
+            if allowed:
+                return adapter.load_model(model_id, device=device, **allowed)
+        except Exception:
+            # Fall back to the strictest call shape.
+            pass
+        return adapter.load_model(model_id, device=device)
 
 
 def _run_bare_control(
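`_load_model_with_cfg` now inspects the adapter's `load_model` signature and forwards config kwargs only when the callee can accept them. A standalone sketch of that signature-filtering pattern; the loader below is a made-up stand-in, not an InvarLock adapter:

import inspect
from typing import Any

def call_with_supported_kwargs(fn: Any, *args: Any, **extra: Any) -> Any:
    # Forward extras only if the callee takes **kwargs or names them explicitly.
    sig = inspect.signature(fn)
    if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()):
        return fn(*args, **extra)
    allowed = {k: v for k, v in extra.items() if k in sig.parameters}
    return fn(*args, **allowed)

def strict_loader(model_id: str, device: str = "cpu") -> str:
    return f"{model_id}@{device}"

# Unknown kwargs such as trust_remote_code are silently dropped for strict callees.
print(call_with_supported_kwargs(strict_loader, "demo-model", device="cpu", trust_remote_code=True))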
@@ -963,14 +1259,20 @@ def _run_bare_control(
     restore_fn: Any | None,
     console: Console,
     resolved_loss_type: str,
-    profile_normalized: str | None,
+    profile_normalized: str | None = None,
     snapshot_provenance: dict[str, bool] | None = None,
     skip_model_load: bool = False,
 ) -> dict[str, Any] | None:
     """Execute the bare-control run for overhead estimation and return payload."""
     from invarlock.core.runner import CoreRunner as _CoreRunner
 
-
+    _event(
+        console,
+        "EXEC",
+        "Running bare control (guards disabled) for overhead check",
+        emoji="🧪",
+        profile=profile_normalized,
+    )
     set_seed(seed_bundle["python"])  # type: ignore[arg-type]
 
     bare_runner = _CoreRunner()
@@ -979,6 +1281,12 @@ def _run_bare_control(
     bare_context = copy.deepcopy(run_config.context)
     bare_context.setdefault("validation", {})["guard_overhead_mode"] = "bare"
     bare_config.context = bare_context
+    runtime_edit_config = dict(edit_config or {})
+    runtime_edit_config.setdefault("console", console)
+    runtime_edit_config.setdefault(
+        "output_style", _style_from_console(console, profile=profile_normalized)
+    )
+    runtime_edit_config.setdefault("emit", True)
 
     private_model_loaded = False
     bare_target_model = None
@@ -992,7 +1300,9 @@
     elif skip_model_load:
         bare_target_model = model or SimpleNamespace(name="bare_stub_model")
     else:
-        bare_target_model = _load_model_with_cfg(
+        bare_target_model = _load_model_with_cfg(
+            adapter, cfg, resolved_device, profile=profile_normalized
+        )
         private_model_loaded = True
         if snapshot_provenance is not None:
             snapshot_provenance["reload_path_used"] = True
@@ -1005,7 +1315,7 @@
         config=bare_config,
         calibration_data=calibration_data,
         auto_config=auto_config,
-        edit_config=
+        edit_config=runtime_edit_config,
         preview_n=preview_count,
         final_n=final_count,
     )
@@ -1029,8 +1339,12 @@
             return False
 
     if not (_finite(bare_ppl_preview) and _finite(bare_ppl_final)):
-
-
+        _event(
+            console,
+            "WARN",
+            "Primary metric non-finite during bare control; continuing with diagnostics.",
+            emoji="⚠️",
+            profile=profile_normalized,
         )
 
     payload: dict[str, Any] = {
@@ -1082,6 +1396,7 @@ def _execute_guarded_run(
     final_count: int,
     restore_fn: Any | None,
     resolved_device: str,
+    profile_normalized: str | None = None,
     console: Console,
     snapshot_provenance: dict[str, bool] | None = None,
     skip_model_load: bool = False,
@@ -1095,11 +1410,26 @@
     elif skip_model_load:
         model = model or SimpleNamespace(name="guarded_stub_model")
     else:
-
-
+        _event(
+            console,
+            "INIT",
+            f"Loading model: {cfg.model.id} (attempt 1)",
+            emoji="🔧",
+            profile=profile_normalized,
+        )
+        model = _load_model_with_cfg(
+            adapter, cfg, resolved_device, profile=profile_normalized
+        )
         if snapshot_provenance is not None:
             snapshot_provenance["reload_path_used"] = True
 
+    runtime_edit_config = dict(edit_config or {})
+    runtime_edit_config.setdefault("console", console)
+    runtime_edit_config.setdefault(
+        "output_style", _style_from_console(console, profile=profile_normalized)
+    )
+    runtime_edit_config.setdefault("emit", True)
+
     core_report = runner.execute(
         model=model,
         adapter=adapter,
@@ -1108,7 +1438,7 @@
         config=run_config,
         calibration_data=calibration_data,
         auto_config=auto_config,
-        edit_config=
+        edit_config=runtime_edit_config,
         preview_n=preview_count,
         final_n=final_count,
     )
@@ -1145,10 +1475,10 @@ def _postprocess_and_summarize(
     saved_files = _emit_run_artifacts(
         report=report, out_dir=run_dir, filename_prefix="report", console=console
     )
-    console
-    console
+    _event(console, "PASS", "Run completed successfully!", emoji="✅")
+    _event(console, "DATA", f"Report: {saved_files['json']}", emoji="📄")
     if run_config.event_path:
-        console
+        _event(console, "DATA", f"Events: {run_config.event_path}", emoji="📝")
     return saved_files
 
 
@@ -1238,9 +1568,14 @@ def _validate_and_harvest_baseline_schedule(
         message = f"PAIRING-EVIDENCE-MISSING: {path}: {reason}"
         if prof in {"ci", "release"}:
             raise InvarlockError(code="E001", message=message)
-
-
-
+        if console is not None:
+            _event(
+                console,
+                "FAIL",
+                f"Baseline pairing schedule '{path}' is incompatible: {reason}",
+                emoji="❌",
+                profile=prof,
+            )
         raise typer.Exit(1)
 
     baseline_meta = (
@@ -1297,7 +1632,7 @@ def _validate_and_harvest_baseline_schedule(
             _fail_schedule(f"{label} input_ids empty at index {idx}")
         seqs.append(seq_ints)
 
-    # attention_masks are required for pairing, but
+    # attention_masks are required for pairing, but some baselines may omit them.
     # When absent, default to all-ones masks (cannot infer padding reliably).
     masks_rows: list[list[int]] = []
     masks_missing = masks is None or masks == []
@@ -1395,9 +1730,14 @@ def _validate_and_harvest_baseline_schedule(
             prof = (profile or "dev").strip().lower()
             if prof in {"ci", "release"}:
                 _fail_schedule("preview_hash mismatch vs baseline report data")
-
-
-
+            if console is not None:
+                _event(
+                    console,
+                    "WARN",
+                    "Baseline preview_hash mismatch; continuing in dev profile.",
+                    emoji="⚠️",
+                    profile=prof,
+                )
         if (
             isinstance(baseline_final_hash, str)
             and baseline_final_hash
@@ -1406,9 +1746,14 @@
             prof = (profile or "dev").strip().lower()
             if prof in {"ci", "release"}:
                 _fail_schedule("final_hash mismatch vs baseline report data")
-
-
-
+            if console is not None:
+                _event(
+                    console,
+                    "WARN",
+                    "Baseline final_hash mismatch; continuing in dev profile.",
+                    emoji="⚠️",
+                    profile=prof,
+                )
         if (
             isinstance(baseline_dataset_hash, str)
             and baseline_dataset_hash
@@ -1417,9 +1762,14 @@
             prof = (profile or "dev").strip().lower()
             if prof in {"ci", "release"}:
                 _fail_schedule("dataset_hash mismatch vs baseline report data")
-
-
-
+            if console is not None:
+                _event(
+                    console,
+                    "WARN",
+                    "Baseline dataset_hash mismatch; continuing in dev profile.",
+                    emoji="⚠️",
+                    profile=prof,
+                )
     except InvarlockError:
         raise
     except typer.Exit:
@@ -1441,10 +1791,14 @@
         and baseline_final is not None
         and baseline_final != cfg_final
     ):
-
-
-
-
+        if console is not None:
+            _event(
+                console,
+                "WARN",
+                f"Adjusting evaluation window counts to match baseline schedule ({baseline_preview}/{baseline_final}).",
+                emoji="⚠️",
+                profile=profile,
+            )
 
         effective_preview = int(baseline_preview)
         effective_final = int(baseline_final)
@@ -1607,10 +1961,11 @@ def _resolve_metric_and_provider(
     model_profile: Any,
     *,
     resolved_loss_type: str | None = None,
+    metric_kind_override: str | None = None,
 ) -> tuple[str, str, dict[str, float]]:
     """Resolve metric kind, provider kind, and metric options from config with precedence.
 
-    Precedence: CLI args (not handled here) → config → ModelProfile defaults →
+    Precedence: CLI args (not handled here) → config → ModelProfile defaults → fallback.
     Primary metric (metric‑v1) is canonical in dev‑phase; no env flag toggles.
     """
     # Provider kind
@@ -1646,9 +2001,13 @@
         metric_cfg = None
 
     metric_kind = None
+    if isinstance(metric_kind_override, str) and metric_kind_override.strip():
+        mk_override = metric_kind_override.strip().lower()
+        if mk_override != "auto":
+            metric_kind = mk_override
     reps = None
     ci_level = None
-    if metric_cfg is not None:
+    if metric_kind is None and metric_cfg is not None:
         try:
             metric_kind = (
                 metric_cfg.get("kind")
@@ -1684,11 +2043,11 @@
     else:
         metric_kind = None
 
-    # Fallback to model profile default or
+    # Fallback to model profile default or loss-type mapping
     if not metric_kind and hasattr(model_profile, "default_metric"):
         metric_kind = model_profile.default_metric
     if not metric_kind:
-        #
+        # Map from loss kind
         lk = (resolved_loss_type or "causal").lower()
         if lk == "mlm":
             metric_kind = "ppl_mlm"
@@ -1770,18 +2129,25 @@ def _plan_release_windows(
     candidate_msg = f", candidate_unique={int(candidate_unique)}" + (
         f"/{int(candidate_limit)}" if candidate_limit is not None else ""
     )
-
-
+    _event(
+        console,
+        "METRIC",
+        "Release window capacity:"
         f" unique={available_unique}, reserve={reserve_windows} "
         f"(calib {calibration_windows}, buffer {buffer_windows}), "
         f"usable={available_for_eval}, "
         f"per-arm raw={actual_per_arm_raw} → selected {actual_per_arm} "
-        f"(target {target_per_arm}{candidate_msg})"
+        f"(target {target_per_arm}{candidate_msg})",
+        emoji="📏",
+        profile="release",
     )
     if actual_per_arm < target_per_arm:
-
-
-
+        _event(
+            console,
+            "WARN",
+            f"Adjusted per-arm windows down from {target_per_arm} to {actual_per_arm} based on capacity.",
+            emoji="⚠️",
+            profile="release",
         )
 
     plan = {
@@ -1832,15 +2198,30 @@ def run_command(
         None, "--device", help="Device override (auto|cuda|mps|cpu)"
     ),
     profile: str | None = typer.Option(
-        None,
+        None,
+        "--profile",
+        help="Profile to apply (e.g. ci, release, ci_cpu; dev is a no-op)",
     ),
     out: str | None = typer.Option(None, "--out", help="Output directory override"),
     edit: str | None = typer.Option(None, "--edit", help="Edit kind (quant|mixed)"),
+    edit_label: str | None = typer.Option(
+        None,
+        "--edit-label",
+        help=(
+            "Edit algorithm label for BYOE models. Use 'noop' for baseline, "
+            "'quant_rtn' etc. for built-in edits, 'custom' for pre-edited models."
+        ),
+    ),
     tier: str | None = typer.Option(
         None,
         "--tier",
         help="Auto-tuning tier override (conservative|balanced|aggressive)",
     ),
+    metric_kind: str | None = typer.Option(
+        None,
+        "--metric-kind",
+        help="Primary metric kind override (ppl_causal|ppl_mlm|accuracy|etc.)",
+    ),
     probes: int | None = typer.Option(
         None, "--probes", help="Number of micro-probes (0=deterministic, >0=adaptive)"
     ),
@@ -1861,6 +2242,19 @@ def run_command(
     no_cleanup: bool = typer.Option(
         False, "--no-cleanup", help="Skip cleanup of temporary artifacts"
     ),
+    style: str | None = typer.Option(
+        None, "--style", help="Output style (audit|friendly)"
+    ),
+    progress: bool = typer.Option(
+        False, "--progress", help="Show progress done messages"
+    ),
+    timing: bool = typer.Option(False, "--timing", help="Show timing summary"),
+    telemetry: bool = typer.Option(
+        False, "--telemetry", help="Write telemetry JSON alongside the report"
+    ),
+    no_color: bool = typer.Option(
+        False, "--no-color", help="Disable ANSI colors (respects NO_COLOR=1)"
+    ),
 ):
     """
     Run InvarLock pipeline with the given configuration.
@@ -1879,24 +2273,57 @@ def run_command(
     config = _coerce_option(config)
     device = _coerce_option(device)
     profile = _coerce_option(profile)
+    profile_normalized = (str(profile or "")).strip().lower()
     out = _coerce_option(out)
     edit = _coerce_option(edit)
+    edit_label = _coerce_option(edit_label)
     tier = _coerce_option(tier)
+    metric_kind = _coerce_option(metric_kind)
     probes = _coerce_option(probes)
     until_pass = bool(_coerce_option(until_pass, False))
     max_attempts = int(_coerce_option(max_attempts, 3))
     timeout = _coerce_option(timeout)
     baseline = _coerce_option(baseline)
     no_cleanup = bool(_coerce_option(no_cleanup, False))
+    style = _coerce_option(style)
+    progress = bool(_coerce_option(progress, False))
+    timing = bool(_coerce_option(timing, False))
+    telemetry = bool(_coerce_option(telemetry, False))
+    no_color = bool(_coerce_option(no_color, False))
+
+    output_style = resolve_output_style(
+        style=str(style) if style is not None else None,
+        profile=profile_normalized,
+        progress=progress,
+        timing=timing,
+        no_color=no_color,
+    )
+    console._invarlock_output_style = output_style
+    if not output_style.color:
+        console.no_color = True
+    timings: dict[str, float] = {}
+    collect_timings = bool(output_style.timing or telemetry)
+    total_start: float | None = perf_counter() if collect_timings else None
+
+    _apply_warning_filters(profile_normalized)
 
     # Use shared CLI coercers from invarlock.cli.utils
     report_path_out: str | None = None
 
     def _fail_run(message: str) -> None:
-        console
+        _event(console, "FAIL", message, emoji="❌", profile=profile_normalized)
         # Generic failure path → exit 1 (InvarlockError paths handle code 3 separately)
         raise typer.Exit(1)
 
+    def _provider_event(tag: str, message: str, emoji: str | None = None) -> None:
+        _event(
+            console,
+            tag,
+            message,
+            emoji=emoji,
+            profile=profile_normalized,
+        )
+
     # Fail fast when torch is missing so users see a clear extras hint instead of
     # a raw ModuleNotFoundError from deeper imports.
     try:
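The `--timing` and `--telemetry` flags above gate a `timings` dict that the command fills via `perf_counter` and `timed_step` from `invarlock.cli.output`. A minimal standalone sketch of the same timing-accumulation pattern (illustrative helper, not the packaged one):

import time
from collections.abc import Iterator
from contextlib import contextmanager

@contextmanager
def timed_step(timings: dict[str, float], key: str) -> Iterator[None]:
    # Record the wall-clock duration of one pipeline step under `key`.
    start = time.perf_counter()
    try:
        yield
    finally:
        timings[key] = max(0.0, time.perf_counter() - start)

timings: dict[str, float] = {}
with timed_step(timings, "load_model"):
    time.sleep(0.01)  # stand-in for the real model load
print(timings)  # e.g. {'load_model': 0.0101...}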
@@ -1904,12 +2331,14 @@ def run_command(
 
         _ = _torch  # pragma: no cover
     except (ImportError, ModuleNotFoundError) as e:
-
-
+        _event(
+            console,
+            "FAIL",
+            "Torch is required for this command. "
            'Install extras with: pip install "invarlock[hf]" '
            'or "invarlock[adapters]".',
-
-
+            emoji="❌",
+            profile=profile_normalized,
         )
         raise typer.Exit(1) from e
 
@@ -1987,7 +2416,7 @@
         seed_value = 42
     set_seed(seed_value)
     # Enforce deterministic algorithms in CI/Release profiles when torch is available
-    profile_label =
+    profile_label = profile_normalized or None
     if torch is not None and profile_label in {"ci", "release"}:
         try:  # pragma: no cover - behavior depends on torch availability
             if hasattr(torch, "use_deterministic_algorithms"):
@@ -2016,10 +2445,14 @@
         "numpy": int(numpy_seed),
         "torch": int(torch_seed) if torch_seed is not None else None,
     }
-
-
+    _event(
+        console,
+        "INIT",
+        "Deterministic seeds → "
        f"python={seed_bundle['python']}, numpy={seed_bundle['numpy']}, "
-        f"torch={seed_bundle['torch'] if seed_bundle['torch'] is not None else 'N/A'}"
+        f"torch={seed_bundle['torch'] if seed_bundle['torch'] is not None else 'N/A'}",
+        emoji="🎲",
+        profile=profile_normalized,
     )
 
     # Resolve device and output directory
@@ -2054,8 +2487,8 @@
 
     run_id = f"{output_dir.name}-{timestamp}" if output_dir.name else timestamp
 
-    console.print(
-    console.print(
+    console.print(_format_kv_line("Output", str(run_dir)))
+    console.print(_format_kv_line("Run ID", run_id))
 
     # Initialize retry controller if --until-pass mode enabled
     retry_controller = _init_retry_controller(
@@ -2070,7 +2503,6 @@
     pairing_schedule: dict[str, Any] | None = None
     if baseline:
         baseline_path = Path(baseline)
-        profile_normalized = (profile or "").strip().lower()
         strict_baseline = profile_normalized in {"ci", "release"}
         if not baseline_path.exists():
             msg = (
@@ -2079,8 +2511,12 @@
             )
             if strict_baseline:
                 raise InvarlockError(code="E001", message=msg)
-
-
+            _event(
+                console,
+                "WARN",
+                f"{msg}. Falling back to dataset schedule.",
+                emoji="⚠️",
+                profile=profile_normalized,
             )
         else:
             try:
@@ -2090,8 +2526,12 @@
                 msg = f"PAIRING-EVIDENCE-MISSING: baseline report JSON parse failed ({exc})"
                 if strict_baseline:
                     raise InvarlockError(code="E001", message=msg) from exc
-
-
+                _event(
+                    console,
+                    "WARN",
+                    f"{msg}. Falling back to dataset schedule.",
+                    emoji="⚠️",
+                    profile=profile_normalized,
                 )
                 baseline_report_data = None
             if isinstance(baseline_report_data, dict):
@@ -2099,11 +2539,28 @@
                 if pairing_schedule:
                     # Normalize baseline report in-memory so downstream digest/parity
                     # computations see a consistent window_id + mask shape even for
-                    #
+                    # baselines missing some fields.
                     try:
-                        baseline_report_data
-
-
+                        ew = baseline_report_data.get("evaluation_windows")
+                        if not isinstance(ew, dict):
+                            ew = {}
+                        baseline_report_data["evaluation_windows"] = ew
+                        # Merge the sanitized pairing schedule into existing
+                        # evaluation_windows without discarding logloss/token_counts.
+                        for arm in ("preview", "final"):
+                            src = (
+                                pairing_schedule.get(arm)
+                                if isinstance(pairing_schedule, dict)
+                                else None
+                            )
+                            if not isinstance(src, dict):
+                                continue
+                            dst = ew.get(arm)
+                            if not isinstance(dst, dict):
+                                ew[arm] = dict(src)
+                                continue
+                            for key, value in src.items():
+                                dst[key] = value
                     except Exception:
                         pass
                     # Harvest tokenizer hash provenance from baseline when present.
@@ -2132,8 +2589,12 @@
                             tokenizer_hash = tok
                     except Exception:
                         pass
-
-
+                    _event(
+                        console,
+                        "DATA",
+                        "Loaded baseline evaluation schedule for pairing",
+                        emoji="🧬",
+                        profile=profile_normalized,
                     )
                 else:
                     msg = (
@@ -2142,8 +2603,12 @@
                     )
                     if strict_baseline:
                         raise InvarlockError(code="E001", message=msg)
-
-
+                    _event(
+                        console,
+                        "WARN",
+                        f"{msg}. Falling back to dataset schedule.",
+                        emoji="⚠️",
+                        profile=profile_normalized,
                     )
                     baseline_report_data = None
                     pairing_schedule = None
@@ -2169,15 +2634,23 @@
     adapter = registry.get_adapter(cfg.model.adapter)
     edit_name = getattr(getattr(cfg, "edit", None), "name", None)
     if not isinstance(edit_name, str) or not edit_name.strip():
-
-
+        _event(
+            console,
+            "FAIL",
+            "Edit configuration must specify a non-empty `edit.name`.",
+            emoji="❌",
+            profile=profile_normalized,
         )
         raise typer.Exit(1)
     try:
         edit_op = registry.get_edit(edit_name.strip())
     except Exception:
-
-
+        _event(
+            console,
+            "WARN",
+            f"Unknown edit '{edit_name.strip()}'. Using pass-through shim.",
+            emoji="⚠️",
+            profile=profile_normalized,
         )
         edit_op = SimpleNamespace(name=edit_name.strip())
 
@@ -2213,8 +2686,12 @@
                 registry.get_plugin_metadata(guard_name, "guards")
             )
         except KeyError:
-
-
+            _event(
+                console,
+                "WARN",
+                f"Guard '{guard_name}' not found, skipping",
+                emoji="⚠️",
+                profile=profile_normalized,
             )
     plugin_provenance = {
         "adapter": adapter_meta,
@@ -2222,54 +2699,22 @@
         "guards": guard_metadata,
     }
     pm_acceptance_range = _resolve_pm_acceptance_range(cfg)
-
-
+    pm_drift_band = _resolve_pm_drift_band(cfg)
+
+    _event(
+        console,
+        "DATA",
+        f"Adapter: {adapter.name}",
+        emoji="🔌",
+        profile=profile_normalized,
+    )
 
     # Create run configuration
-    def _to_serialisable_dict(section: object) -> dict[str, Any]:
-        """Coerce config fragments to plain dicts.
-
-        Handles InvarLockConfig sections (which wrap dicts in a private `_Obj` with
-        `_data`) so downstream components (core.runner) see canonical mappings,
-        e.g. `eval.bootstrap.replicates`.
-        """
-        # Prefer native dump methods
-        if hasattr(section, "model_dump"):
-            return section.model_dump()  # type: ignore[return-value]
-        if hasattr(section, "dict"):
-            try:
-                return section.dict()  # type: ignore[return-value]
-            except Exception:
-                pass
-        # Unwrap CLI _Obj wrapper used by InvarLockConfig for attribute access
-        try:
-            raw = getattr(section, "_data", None)
-            if isinstance(raw, dict):
-                return raw
-        except Exception:
-            pass
-        # Already a mapping
-        if isinstance(section, dict):
-            return section
-        # Best-effort attribute dump
-        try:
-            data = vars(section)
-            # Common case: {'_data': {...}}
-            if isinstance(data, dict) and isinstance(data.get("_data"), dict):
-                return data["_data"]
-            return data  # type: ignore[return-value]
-        except TypeError:
-            return {}
-
-    def _dump_guard(section: object) -> dict[str, Any]:
-        data = _to_serialisable_dict(section)
-        return data if isinstance(data, dict) else {}
-
     guard_overrides = {
-        "spectral":
-        "rmt":
-        "variance":
-        "invariants":
+        "spectral": _to_serialisable_dict(getattr(cfg.guards, "spectral", {})),
+        "rmt": _to_serialisable_dict(getattr(cfg.guards, "rmt", {})),
+        "variance": _to_serialisable_dict(getattr(cfg.guards, "variance", {})),
+        "invariants": _to_serialisable_dict(getattr(cfg.guards, "invariants", {})),
     }
 
     if model_profile.invariants:
@@ -2297,10 +2742,38 @@ def run_command(
         "plugins": plugin_provenance,
         "run_id": run_id,
     }
+    # Provide baseline per-window logloss to the CoreRunner for paired tail
+    # evidence and (optionally) fail/rollback enforcement.
+    try:
+        if isinstance(baseline_report_data, dict):
+            ew = baseline_report_data.get("evaluation_windows")
+            if isinstance(ew, dict):
+                final = ew.get("final")
+                if (
+                    isinstance(final, dict)
+                    and isinstance(final.get("window_ids"), list)
+                    and isinstance(final.get("logloss"), list)
+                ):
+                    base_eval: dict[str, Any] = {
+                        "final": {
+                            "window_ids": list(final.get("window_ids") or []),
+                            "logloss": list(final.get("logloss") or []),
+                        }
+                    }
+                    if isinstance(final.get("token_counts"), list):
+                        base_eval["final"]["token_counts"] = list(
+                            final.get("token_counts") or []
+                        )
+                    run_context["baseline_eval_windows"] = base_eval
+    except Exception:
+        pass
     run_context.setdefault("primary_metric", {})["acceptance_range"] = (
         pm_acceptance_range
     )
     run_context["pm_acceptance_range"] = pm_acceptance_range
+    if pm_drift_band:
+        run_context.setdefault("primary_metric", {})["drift_band"] = pm_drift_band
+        run_context["pm_drift_band"] = pm_drift_band
     run_context["model_profile"] = {
         "family": model_profile.family,
         "default_loss": model_profile.default_loss,
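The block above copies per-window pairing evidence (`window_ids`, `logloss`, optional `token_counts`) from the baseline report's final arm into the run context. A standalone sketch of that defensive harvesting, mirroring the shape shown in the hunk (illustrative function name):

from typing import Any

def harvest_final_windows(report: dict[str, Any]) -> dict[str, Any] | None:
    windows = report.get("evaluation_windows")
    final = windows.get("final") if isinstance(windows, dict) else None
    if not isinstance(final, dict):
        return None
    ids, logloss = final.get("window_ids"), final.get("logloss")
    if not (isinstance(ids, list) and isinstance(logloss, list)):
        return None  # evidence incomplete -> skip pairing enforcement
    harvested: dict[str, Any] = {"final": {"window_ids": list(ids), "logloss": list(logloss)}}
    if isinstance(final.get("token_counts"), list):
        harvested["final"]["token_counts"] = list(final["token_counts"])
    return harvested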
@@ -2331,6 +2804,7 @@ def run_command(
     dataset_meta: dict[str, Any] = {}
     baseline_meta: dict[str, Any] = {}
     window_plan: dict[str, Any] | None = None
+    dataset_timing_start: float | None = perf_counter() if collect_timings else None
     if pairing_schedule:
         harvested = _validate_and_harvest_baseline_schedule(
             cfg,
@@ -2353,7 +2827,7 @@
         try:
             tokenizer, tokenizer_hash = resolve_tokenizer(model_profile)
         except Exception as exc:
-            console
+            _event(console, "FAIL", str(exc), emoji="❌", profile=profile)
             raise typer.Exit(1) from exc
         preview_window_ids = pairing_schedule["preview"].get("window_ids")
         preview_labels = pairing_schedule["preview"].get("labels")
@@ -2575,7 +3049,13 @@ def run_command(
         if capacity_meta and "window_capacity" not in dataset_meta:
             dataset_meta["window_capacity"] = capacity_meta
     elif cfg.dataset.provider:
-
+        _event(
+            console,
+            "DATA",
+            f"Loading dataset: {cfg.dataset.provider}",
+            emoji="📊",
+            profile=profile_normalized,
+        )
         # Pass through provider-specific kwargs when available
         provider_kwargs = {}
         for key in (
@@ -2635,6 +3115,7 @@
                 provider_kwargs=provider_kwargs,
                 console=console,
                 resolved_device=resolved_device,
+                emit=_provider_event,
             )
         )
 
@@ -2642,7 +3123,7 @@
         try:
             tokenizer, tokenizer_hash = resolve_tokenizer(model_profile)
         except Exception as exc:
-            console
+            _event(console, "FAIL", str(exc), emoji="❌", profile=profile)
             raise typer.Exit(1) from exc
 
         dataset_stride = getattr(
@@ -2676,7 +3157,7 @@
                     console=console,
                 )
             except RuntimeError as err:
-                console
+                _event(console, "FAIL", str(err), emoji="❌", profile=profile)
                 raise typer.Exit(1) from err
 
             actual_per_arm = int(window_plan["actual_preview"])
@@ -2688,9 +3169,12 @@
                 cfg.dataset, "stride", getattr(cfg.dataset, "seq_len", 0)
             )
         else:
-
-
-            "
+            _event(
+                console,
+                "WARN",
+                "Release profile requested but dataset provider does not expose capacity estimation; using configured window counts.",
+                emoji="⚠️",
+                profile=profile_normalized,
             )
 
         preview_records: list[tuple[list[int], list[int]]] = []
@@ -2894,8 +3378,12 @@ def run_command(
                     raise RuntimeError(
                         "Unable to construct non-overlapping windows within minimum window floor."
                     )
-
-
+                _event(
+                    console,
+                    "WARN",
+                    f"Detected {deficit} duplicate windows; reducing per-arm windows to {proposed_per_arm} and retrying stratification.",
+                    emoji="⚠️",
+                    profile=profile_normalized,
                 )
 
                 effective_preview = proposed_per_arm
@@ -3037,6 +3525,10 @@
     run_context["dataset_meta"] = dataset_meta
     if window_plan:
         run_context["window_plan"] = window_plan
+    if dataset_timing_start is not None:
+        timings["load_dataset"] = max(
+            0.0, float(perf_counter() - dataset_timing_start)
+        )
 
     if os.environ.get("INVARLOCK_DEBUG_TRACE"):
         console.print(
@@ -3060,7 +3552,13 @@
         )
 
     # Execute the real pipeline using CoreRunner
-
+    _event(
+        console,
+        "EXEC",
+        f"Executing pipeline with {len(guards)} guards...",
+        emoji="⚙️",
+        profile=profile_normalized,
+    )
     runner = CoreRunner()
 
     # Prepare auto configuration for tier resolution
@@ -3125,8 +3623,8 @@ def run_command(
|
|
|
3125
3623
|
for key, values in model_profile.module_selectors.items()
|
|
3126
3624
|
}
|
|
3127
3625
|
|
|
3128
|
-
console.print(
|
|
3129
|
-
console.print(
|
|
3626
|
+
console.print(_format_kv_line("Edit", str(edit_op.name)))
|
|
3627
|
+
console.print(_format_kv_line("Guards", _format_guard_chain(guards)))
|
|
3130
3628
|
|
|
3131
3629
|
# Model load/snapshot strategy
|
|
3132
3630
|
model = None
|
|
@@ -3140,8 +3638,25 @@ def run_command(
 # Try single-load with snapshot/restore if adapter supports it; fallback to reload per attempt
 try:
 # Load once
-
-
+_event(
+console,
+"INIT",
+f"Loading model once: {cfg.model.id}",
+emoji="🔧",
+profile=profile_normalized,
+)
+with timed_step(
+console=console,
+style=_style_from_console(console, profile=profile_normalized),
+timings=timings,
+key="load_model",
+tag="INIT",
+message="Load model",
+emoji="🔧",
+):
+model = _load_model_with_cfg(
+adapter, cfg, resolved_device, profile=profile_normalized
+)

 # No edit-specific bootstrap logic

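`timed_step` wraps a pipeline stage, records its wall-clock duration into the shared `timings` dict, and emits start/finish output. It is defined in `invarlock/cli/output.py` and is not shown in this diff; the sketch below only illustrates the shape implied by the call sites here (all names and defaults are assumptions):

    # Illustrative sketch only; the shipped context manager may differ.
    from contextlib import contextmanager
    from time import perf_counter

    @contextmanager
    def timed_step(*, console, style=None, timings, key, tag, message, emoji=""):
        # "style" is accepted for parity with the call sites but unused in this sketch.
        start = perf_counter()
        console.print(f"{emoji} [{tag}] {message}...")
        try:
            yield
        finally:
            # Record elapsed seconds under the requested key, never negative.
            timings[key] = max(0.0, perf_counter() - start)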
@@ -3297,9 +3812,13 @@ def run_command(
 return "reload"

 mode = _choose_snapshot_mode()
-
-
-
+enabled = mode in {"bytes", "chunked"}
+_event(
+console,
+"INIT",
+f"Snapshot mode: {'enabled' if enabled else 'disabled'}",
+emoji="💾",
+profile=profile_normalized,
 )
 if mode == "chunked":
 snapshot_tmpdir = adapter.snapshot_chunked(model) # type: ignore[attr-defined]
@@ -3342,13 +3861,16 @@ def run_command(

 # RETRY LOOP - All report processing inside loop
 attempt = 1
-profile_normalized = (profile or "").lower()
 measure_guard_overhead, skip_overhead = _should_measure_overhead(
 profile_normalized
 )
 if skip_overhead and profile_normalized in {"ci", "release"}:
-
-
+_event(
+console,
+"WARN",
+"Overhead check skipped via INVARLOCK_SKIP_OVERHEAD_CHECK",
+emoji="⚠️",
+profile=profile_normalized,
 )

 while True:
@@ -3356,12 +3878,32 @@ def run_command(
 set_seed(seed_bundle["python"])

 if retry_controller:
-console.print(
+console.print("\n")
+_event(
+console,
+"EXEC",
+f"Attempt {attempt}/{max_attempts}",
+emoji="🚀",
+profile=profile_normalized,
+)
 if attempt > 1:
-
+_event(
+console,
+"EXEC",
+f"Retry attempt {attempt}/{max_attempts}",
+emoji="🔄",
+profile=profile_normalized,
+)
 else:
 if attempt > 1:
-console.print(
+console.print("\n")
+_event(
+console,
+"EXEC",
+f"Attempt {attempt}",
+emoji="🚀",
+profile=profile_normalized,
+)

 # Adjust parameters for retry attempts
 if retry_controller and attempt > 1:
@@ -3390,6 +3932,8 @@ def run_command(
 "checks": {},
 }
 elif measure_guard_overhead:
+bare_edit_config = dict(edit_config or {})
+bare_edit_config["emit"] = False
 guard_overhead_payload = _run_bare_control(
 adapter=adapter,
 edit_op=edit_op,
@@ -3398,7 +3942,7 @@ def run_command(
 run_config=run_config,
 calibration_data=calibration_data,
 auto_config=auto_config,
-edit_config=
+edit_config=bare_edit_config,
 preview_count=preview_count,
 final_count=final_count,
 seed_bundle=seed_bundle,
@@ -3412,34 +3956,53 @@ def run_command(
 )

 # Ensure clean state for guarded run
-
-runner=runner,
-adapter=adapter,
-model=model,
-cfg=cfg,
-edit_op=edit_op,
-run_config=run_config,
-guards=guards,
-calibration_data=calibration_data,
-auto_config=auto_config,
-edit_config=edit_config,
-preview_count=preview_count,
-final_count=final_count,
-restore_fn=restore_fn,
-resolved_device=resolved_device,
+with timed_step(
 console=console,
-
-
-
+style=_style_from_console(console, profile=profile_normalized),
+timings=timings,
+key="execute",
+tag="EXEC",
+message="Execute pipeline",
+emoji="⚙️",
+):
+core_report, model = _execute_guarded_run(
+runner=runner,
+adapter=adapter,
+model=model,
+cfg=cfg,
+edit_op=edit_op,
+run_config=run_config,
+guards=guards,
+calibration_data=calibration_data,
+auto_config=auto_config,
+edit_config=edit_config,
+preview_count=preview_count,
+final_count=final_count,
+restore_fn=restore_fn,
+resolved_device=resolved_device,
+profile_normalized=profile_normalized,
+console=console,
+snapshot_provenance=snapshot_provenance,
+skip_model_load=skip_model_load,
+)
 except _SnapshotRestoreFailed as exc:
 snapshot_provenance["restore_failed"] = True
 _free_model_memory(model)
 model = None
 restore_fn = None
-
-
+_event(
+console,
+"WARN",
+"Snapshot restore failed; switching to reload-per-attempt.",
+emoji="⚠️",
+profile=profile_normalized,
+)
+_event(
+console,
+"WARN",
+f"↳ {exc}",
+profile=profile_normalized,
 )
-console.print(f"[yellow]↳ {exc}[/yellow]")
 if retry_controller:
 retry_controller.record_attempt(
 attempt,
@@ -3461,6 +4024,16 @@ def run_command(
 # Convert CoreRunner report to evaluation report
 report = create_empty_report()

+# Persist minimal run context for certificate/report provenance.
+try:
+report["context"] = {
+"profile": profile_normalized,
+"auto": dict(auto_config),
+"assurance": dict(run_context.get("assurance") or {}),
+}
+except Exception:
+pass
+
 # Code provenance: commit hash and InvarLock version
 commit_value = (
 getattr(cfg.meta, "commit", "") if hasattr(cfg, "meta") else ""
@@ -3561,6 +4134,8 @@ def run_command(
 report["meta"].update(meta_payload)
 if pm_acceptance_range:
 report["meta"]["pm_acceptance_range"] = pm_acceptance_range
+if pm_drift_band:
+report["meta"]["pm_drift_band"] = pm_drift_band
 report["meta"]["model_profile"] = {
 "family": model_profile.family,
 "default_loss": model_profile.default_loss,
@@ -3644,6 +4219,14 @@ def run_command(
 }
 )

+if edit_label:
+report.setdefault("edit", {})
+report["edit"]["name"] = edit_label
+report["edit"]["algorithm"] = edit_label
+if isinstance(core_report.context, dict):
+core_report.context.setdefault("edit", {})
+core_report.context["edit"]["name"] = edit_label
+
 mask_artifact_path = _persist_ref_masks(core_report, run_dir)
 if mask_artifact_path:
 report.setdefault("artifacts", {})
@@ -3651,6 +4234,22 @@ def run_command(

 # Transfer metrics (PM-only: do not write ppl_* fields)
 if hasattr(core_report, "metrics") and core_report.metrics:
+if isinstance(core_report.metrics, dict):
+core_timings = core_report.metrics.get("timings")
+if isinstance(core_timings, dict):
+for key in (
+"prepare",
+"prepare_guards",
+"edit",
+"guards",
+"eval",
+"finalize",
+):
+if key in core_timings:
+try:
+timings[key] = float(core_timings[key])
+except Exception:
+timings[key] = core_timings[key]
 metrics_payload = {
 "latency_ms_per_tok": core_report.metrics.get(
 "latency_ms_per_tok", 0.0
@@ -3696,11 +4295,17 @@ def run_command(
 "window_pairing_final",
 "paired_windows",
 "paired_delta_summary",
+"primary_metric_tail",
 "preview_total_tokens",
 "final_total_tokens",
 "masked_tokens_total",
 "masked_tokens_preview",
 "masked_tokens_final",
+"timings",
+"guard_timings",
+"memory_snapshots",
+"gpu_memory_mb_peak",
+"gpu_memory_reserved_mb_peak",
 "reduction",
 ]
 for key in optional_keys:
@@ -3864,8 +4469,12 @@ def run_command(
 },
 }
 elif had_baseline and (profile or "").lower() in {"ci", "release"}:
-
-
+_event(
+console,
+"FAIL",
+"[INVARLOCK:E001] PAIRING-SCHEDULE-MISMATCH: baseline pairing requested but evaluation windows were not produced. Check capacity/pairing config.",
+emoji="❌",
+profile=profile_normalized,
 )
 raise typer.Exit(3)
 else:
@@ -4076,12 +4685,20 @@ def run_command(
 if ok:
 report["artifacts"]["checkpoint_path"] = str(export_dir)
 else:
-
-
+_event(
+console,
+"WARN",
+"Model export requested but adapter did not save a HF directory.",
+emoji="⚠️",
+profile=profile_normalized,
 )
 except Exception:
-
-
+_event(
+console,
+"WARN",
+"Model export requested but failed due to an unexpected error.",
+emoji="⚠️",
+profile=profile_normalized,
 )

 # Set flags
@@ -4302,7 +4919,10 @@ def run_command(
 try:
 metric_kind_resolved, _provider_kind, metric_opts = (
 _resolve_metric_and_provider(
-cfg,
+cfg,
+model_profile,
+resolved_loss_type=resolved_loss_type,
+metric_kind_override=metric_kind,
 )
 )
 if metric_kind_resolved:
@@ -4313,6 +4933,12 @@ def run_command(
 pm = compute_primary_metric_from_report(
 report, kind=metric_kind_resolved, baseline=baseline_report_data
 )
+core_primary_metric = None
+if hasattr(core_report, "metrics") and isinstance(
+core_report.metrics, dict
+):
+core_primary_metric = core_report.metrics.get("primary_metric")
+pm = _merge_primary_metric_health(pm, core_primary_metric)
 report.setdefault("metrics", {})["primary_metric"] = pm
 # Attach configured reps/ci_level when provided
 if metric_opts:
@@ -4327,7 +4953,7 @@ def run_command(
 ) # type: ignore[index]
 except Exception:
 pass
-# Shadow parity check against
+# Shadow parity check against ppl_* fields (best-effort)
 try:
 pm_blk = report.get("metrics", {}).get("primary_metric", {})
 ppl_final_v1 = float(pm_blk.get("final"))
@@ -4375,6 +5001,13 @@ def run_command(
 except Exception:
 pass

+telemetry_path: Path | None = None
+if telemetry:
+telemetry_path = run_dir / "telemetry.json"
+report.setdefault("artifacts", {})["telemetry_path"] = str(
+telemetry_path
+)
+
 saved_files = _postprocess_and_summarize(
 report=report,
 run_dir=run_dir,
@@ -4391,6 +5024,31 @@ def run_command(
 except Exception:
 pass

+if telemetry and telemetry_path is not None:
+try:
+from invarlock.reporting.telemetry import save_telemetry_report
+
+saved_path = save_telemetry_report(
+report, run_dir, filename=telemetry_path.name
+)
+if isinstance(saved_files, dict):
+saved_files["telemetry"] = str(saved_path)
+_event(
+console,
+"DATA",
+f"Telemetry: {saved_path}",
+emoji="📈",
+profile=profile_normalized,
+)
+except Exception as exc: # pragma: no cover - best-effort
+_event(
+console,
+"WARN",
+f"Telemetry export failed: {exc}",
+emoji="⚠️",
+profile=profile_normalized,
+)
+
 # Metrics display
 pm_obj = None
 try:
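The telemetry hook above writes a `telemetry.json` artifact next to the run report via `save_telemetry_report` from the new `invarlock/reporting/telemetry.py` module. A hedged usage sketch based only on the call shape visible in this hunk (the paths and the minimal report dict are placeholders and may not satisfy the real function's expectations):

    # Sketch of calling the exporter outside the run loop; values are illustrative.
    from pathlib import Path
    from invarlock.reporting.telemetry import save_telemetry_report

    run_dir = Path("runs/example-run")          # hypothetical run directory
    report = {"metrics": {"timings": {"eval": 1.23}}}  # minimal report-like dict
    saved = save_telemetry_report(report, run_dir, filename="telemetry.json")
    print(f"telemetry written to {saved}")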
@@ -4405,15 +5063,23 @@ def run_command(
 if isinstance(pm_prev, (int | float)) and isinstance(
 pm_fin, (int | float)
 ):
-
-
+_event(
+console,
+"METRIC",
+f"Primary Metric [{pm_kind}] — preview: {pm_prev:.3f}, final: {pm_fin:.3f}",
+emoji="📌",
+profile=profile_normalized,
 )
 ratio_vs_base = pm_obj.get("ratio_vs_baseline")
 if isinstance(ratio_vs_base, (int | float)) and math.isfinite(
 ratio_vs_base
 ):
-
-
+_event(
+console,
+"METRIC",
+f"Ratio vs baseline [{pm_kind}]: {ratio_vs_base:.3f}",
+emoji="🔗",
+profile=profile_normalized,
 )
 except Exception:
 pass
@@ -4425,8 +5091,12 @@ def run_command(
 console, guard_overhead_info
 )
 if not guard_overhead_info.get("passed", True):
-
-
+_event(
+console,
+"FAIL",
+"Guard overhead gate FAILED: Guards add more than the permitted budget",
+emoji="⚠️",
+profile=profile_normalized,
 )
 # Only fail hard when the overhead check was actually evaluated
 # (e.g., for causal LMs with available bare/guarded PM). For
@@ -4467,7 +5137,13 @@ def run_command(
 if baseline_report is None:
 raise FileNotFoundError("Baseline report unavailable")

-
+_event(
+console,
+"EXEC",
+"Generating evaluation certificate...",
+emoji="📜",
+profile=profile_normalized,
+)
 certificate = make_certificate(report, baseline_report)

 validation = certificate.get("validation", {})
@@ -4484,11 +5160,21 @@ def run_command(
 )

 if certificate_passed:
-
+_event(
+console,
+"PASS",
+"Certificate PASSED all gates!",
+emoji="✅",
+profile=profile_normalized,
+)
 break
 else:
-
-
+_event(
+console,
+"FAIL",
+f"Certificate FAILED gates: {', '.join(failed_gates)}",
+emoji="⚠️",
+profile=profile_normalized,
 )

 # Auto-tune mask-only heads (binary search on keep count)
@@ -4533,8 +5219,12 @@ def run_command(
 }
 )
 head_section["global_k"] = next_keep
-
-
+_event(
+console,
+"INIT",
+f"Auto-tune adjust: global_k → {next_keep} (bounds {keep_low}-{keep_high})",
+emoji="🔧",
+profile=profile_normalized,
 )
 except Exception:
 pass
@@ -4543,14 +5233,22 @@ def run_command(
 attempt += 1
 continue
 else:
-
-
+_event(
+console,
+"FAIL",
+f"Exhausted retry budget after {attempt} attempts",
+emoji="❌",
+profile=profile_normalized,
 )
 break

 except Exception as cert_error:
-
-
+_event(
+console,
+"WARN",
+f"Certificate validation failed: {cert_error}",
+emoji="⚠️",
+profile=profile_normalized,
 )
 if retry_controller:
 retry_controller.record_attempt(
@@ -4579,11 +5277,82 @@ def run_command(
 # (moved) Cleanup printing occurs after loop to guarantee execution
 pass

+if output_style.timing:
+total_duration = (
+max(0.0, float(perf_counter() - total_start))
+if total_start is not None
+else None
+)
+timings_for_summary: dict[str, float] = {}
+for key, value in timings.items():
+if isinstance(value, (int | float)):
+timings_for_summary[key] = float(value)
+if total_duration is not None:
+timings_for_summary["total"] = total_duration
+
+has_breakdown = any(
+key in timings_for_summary
+for key in (
+"prepare",
+"prepare_guards",
+"edit",
+"guards",
+"eval",
+"finalize",
+)
+)
+
+order: list[tuple[str, str]] = []
+
+def _add(label: str, key: str) -> None:
+if key in timings_for_summary:
+order.append((label, key))
+
+_add("Load model", "load_model")
+_add("Load data", "load_dataset")
+if has_breakdown:
+_add("Prepare", "prepare")
+_add("Prep guards", "prepare_guards")
+_add("Edit", "edit")
+_add("Guards", "guards")
+_add("Eval", "eval")
+_add("Finalize", "finalize")
+else:
+_add("Execute", "execute")
+_add("Total", "total")
+
+extra_lines: list[str] = []
+metrics_section = (
+report.get("metrics", {}) if isinstance(report, dict) else {}
+)
+if isinstance(metrics_section, dict):
+mem_peak = metrics_section.get("memory_mb_peak")
+gpu_peak = metrics_section.get("gpu_memory_mb_peak")
+if isinstance(mem_peak, (int | float)):
+extra_lines.append(f" Peak Memory : {float(mem_peak):.2f} MB")
+if isinstance(gpu_peak, (int | float)):
+extra_lines.append(f" Peak GPU Mem: {float(gpu_peak):.2f} MB")
+
+if timings_for_summary and order:
+print_timing_summary(
+console,
+timings_for_summary,
+style=output_style,
+order=order,
+extra_lines=extra_lines,
+)
+
 # Normal path falls through; cleanup handled below in finally
 return report_path_out

 except FileNotFoundError as e:
-
+_event(
+console,
+"FAIL",
+f"Configuration file not found: {e}",
+emoji="❌",
+profile=profile_normalized,
+)
 raise typer.Exit(1) from e
 except InvarlockError as ce:
 # InvarlockError → code 3 only in CI/Release; dev → 1
@@ -4599,12 +5368,22 @@ def run_command(
 traceback.print_exc()
 # Emit a clearer message for schema failures (exit 2)
 if isinstance(e, ValueError) and "Invalid RunReport" in str(e):
-
-
+_event(
+console,
+"FAIL",
+"Schema invalid: run report structure failed validation",
+emoji="❌",
+profile=profile_normalized,
 )
 code = 2
 else:
-
+_event(
+console,
+"FAIL",
+f"Pipeline execution failed: {e}",
+emoji="❌",
+profile=profile_normalized,
+)
 code = _resolve_exit_code(e, profile=profile)
 raise typer.Exit(code) from e
 finally:
@@ -4618,20 +5397,53 @@ def run_command(
 except Exception:
 pass
 finally:
-
+_event(
+console,
+"INFO",
+"Cleanup: removed",
+emoji="🧹",
+profile=profile_normalized,
+)
 else:
-
+_event(
+console,
+"INFO",
+"Cleanup: skipped",
+emoji="🧹",
+profile=profile_normalized,
+)
 except Exception:
 # Best-effort cleanup printing; never raise from finally
 pass


+def _merge_primary_metric_health(
+primary_metric: dict[str, Any] | None,
+core_primary_metric: dict[str, Any] | None,
+) -> dict[str, Any]:
+if not isinstance(primary_metric, dict):
+return {}
+merged = dict(primary_metric)
+if not isinstance(core_primary_metric, dict):
+return merged
+if core_primary_metric.get("invalid") is True:
+merged["invalid"] = True
+merged["degraded"] = True
+if core_primary_metric.get("degraded") is True:
+merged["degraded"] = True
+core_reason = core_primary_metric.get("degraded_reason")
+if isinstance(core_reason, str) and core_reason:
+merged["degraded_reason"] = core_reason
+merged["degraded"] = True
+return merged
+
+
 def _format_debug_metric_diffs(
 pm: dict[str, float] | None,
 metrics: dict[str, float] | None,
 baseline_report_data: dict | None,
 ) -> str:
-"""Build a compact DEBUG_METRIC_DIFFS line comparing current snapshot vs
+"""Build a compact DEBUG_METRIC_DIFFS line comparing current snapshot vs ppl_*.

 Returns a semicolon-separated string of deltas like
 "final: v1-v1 = +0.000000000; Δlog(final): +0.000000000; ...". Safe to call with
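The new `_merge_primary_metric_health` helper above only copies health flags (`invalid`, `degraded`, `degraded_reason`) from the CoreRunner's primary-metric block onto the report-level block. A small worked example of the behaviour implied by that function body (the field values here are illustrative, not taken from a real run):

    # Worked example of the merge semantics shown in the diff above.
    pm = {"kind": "ppl_causal", "final": 12.3, "preview": 12.1}
    core_pm = {"degraded": True, "degraded_reason": "eval windows truncated"}

    merged = _merge_primary_metric_health(pm, core_pm)
    # merged == {"kind": "ppl_causal", "final": 12.3, "preview": 12.1,
    #            "degraded": True, "degraded_reason": "eval windows truncated"}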
@@ -4746,11 +5558,9 @@ def _print_guard_overhead_summary(
 """Print a concise guard-overhead console summary. Returns threshold fraction used."""
 evaluated = bool(guard_overhead_info.get("evaluated", True))
 if not evaluated:
-console
+_event(console, "METRIC", "Guard Overhead: not evaluated", emoji="🛡️")
 return GUARD_OVERHEAD_THRESHOLD
-overhead_status = (
-"✅ PASS" if guard_overhead_info.get("passed", True) else "❌ FAIL"
-)
+overhead_status = "PASS" if guard_overhead_info.get("passed", True) else "FAIL"
 overhead_percent = guard_overhead_info.get("overhead_percent")
 if isinstance(overhead_percent, (int | float)) and math.isfinite(
 float(overhead_percent)
@@ -4769,8 +5579,11 @@ def _print_guard_overhead_summary(
 except (TypeError, ValueError):
 threshold_fraction = GUARD_OVERHEAD_THRESHOLD
 threshold_display = f"≤ +{threshold_fraction * 100:.1f}%"
-
-
+_event(
+console,
+"METRIC",
+f"Guard Overhead: {overhead_status} {overhead_display} ({threshold_display})",
+emoji="🛡️",
 )
 return threshold_fraction

@@ -4780,8 +5593,12 @@ def _print_retry_summary(console: Console, retry_controller: Any | None) -> None
 try:
 if retry_controller and getattr(retry_controller, "attempt_history", None):
 summary = retry_controller.get_attempt_summary()
-console.print(
-
+console.print("\n")
+_event(
+console,
+"METRIC",
+f"Retry Summary: {summary['total_attempts']} attempts in {summary['elapsed_time']:.1f}s",
+emoji="📊",
 )
 except Exception:
 # Never break the run for summary printing
@@ -4804,10 +5621,15 @@ def _init_retry_controller(
 retry_controller = RetryController(
 max_attempts=max_attempts, timeout=timeout, verbose=True
 )
-
+_event(
+console,
+"INIT",
+f"Retry mode enabled: max {max_attempts} attempts",
+emoji="🔄",
+)
 if baseline:
-console
+_event(console, "DATA", f"Using baseline: {baseline}", emoji="📋")
 else:
 if baseline:
-console
+_event(console, "DATA", f"Using baseline: {baseline}", emoji="📋")
 return retry_controller