invarlock-0.3.6-py3-none-any.whl → invarlock-0.3.7-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- invarlock/__init__.py +2 -2
- invarlock/adapters/__init__.py +10 -14
- invarlock/adapters/auto.py +35 -40
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_mixin.py +25 -4
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/cli/adapter_auto.py +31 -21
- invarlock/cli/app.py +73 -2
- invarlock/cli/commands/certify.py +600 -59
- invarlock/cli/commands/doctor.py +8 -10
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +233 -69
- invarlock/cli/commands/run.py +907 -183
- invarlock/cli/commands/verify.py +76 -11
- invarlock/cli/config.py +1 -1
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +1 -1
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/registry.py +9 -11
- invarlock/core/runner.py +111 -25
- invarlock/edits/quant_rtn.py +65 -37
- invarlock/eval/bench.py +3 -3
- invarlock/eval/data.py +68 -23
- invarlock/eval/metrics.py +59 -1
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/rmt.py +2 -2
- invarlock/guards/variance.py +2 -2
- invarlock/model_profile.py +48 -27
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/reporting/certificate.py +159 -9
- invarlock/reporting/certificate_schema.py +1 -1
- invarlock/reporting/guards_analysis.py +154 -4
- invarlock/reporting/html.py +55 -5
- invarlock/reporting/normalizer.py +7 -0
- invarlock/reporting/render.py +791 -431
- invarlock/reporting/report.py +39 -3
- invarlock/reporting/report_types.py +6 -1
- invarlock/reporting/telemetry.py +86 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/METADATA +23 -9
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/RECORD +53 -48
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
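
The hunk-level diff below covers `invarlock/cli/commands/certify.py` (+600 −59), the largest change in this release.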
```diff
@@ -14,14 +14,20 @@ Steps:
 
 from __future__ import annotations
 
+import inspect
+import io
 import json
 import math
+from collections.abc import Iterator
+from contextlib import contextmanager
 from pathlib import Path
-from typing import Any
+from typing import Any, NoReturn
 
 import typer
 from rich.console import Console
 
+from invarlock import __version__ as INVARLOCK_VERSION
+
 from ...core.exceptions import MetricsError
 from ..adapter_auto import resolve_auto_adapter
 from ..config import _deep_merge as _merge  # reuse helper
@@ -32,9 +38,142 @@ from .run import _resolve_exit_code as _resolve_exit_code
 
 _LAZY_RUN_IMPORT = True
 
+PHASE_BAR_WIDTH = 67
+VERBOSITY_QUIET = 0
+VERBOSITY_DEFAULT = 1
+VERBOSITY_VERBOSE = 2
+
 console = Console()
 
 
+def _render_banner_lines(title: str, context: str) -> list[str]:
+    width = max(len(title), len(context))
+    border = "─" * (width + 2)
+    return [
+        f"┌{border}┐",
+        f"│ {title.ljust(width)} │",
+        f"│ {context.ljust(width)} │",
+        f"└{border}┘",
+    ]
+
+
+def _print_header_banner(
+    console: Console, *, version: str, profile: str, tier: str, adapter: str
+) -> None:
+    title = f"INVARLOCK v{version} · Certification Pipeline"
+    context = f"Profile: {profile} · Tier: {tier} · Adapter: {adapter}"
+    for line in _render_banner_lines(title, context):
+        console.print(line)
+
+
+def _phase_title(index: int, total: int, title: str) -> str:
+    return f"PHASE {index}/{total} · {title}"
+
+
+def _print_phase_header(console: Console, title: str) -> None:
+    bar_width = max(PHASE_BAR_WIDTH, len(title))
+    bar = "═" * bar_width
+    console.print(bar)
+    console.print(title)
+    console.print(bar)
+
+
+def _format_ratio(value: Any) -> str:
+    try:
+        val = float(value)
+    except (TypeError, ValueError):
+        return "N/A"
+    if not math.isfinite(val):
+        return "N/A"
+    return f"{val:.3f}"
+
+
+def _resolve_verbosity(quiet: bool, verbose: bool) -> int:
+    if quiet and verbose:
+        console.print("--quiet and --verbose are mutually exclusive")
+        raise typer.Exit(2)
+    if quiet:
+        return VERBOSITY_QUIET
+    if verbose:
+        return VERBOSITY_VERBOSE
+    return VERBOSITY_DEFAULT
+
+
+@contextmanager
+def _override_console(module: Any, new_console: Console) -> Iterator[None]:
+    original_console = getattr(module, "console", None)
+    module.console = new_console
+    try:
+        yield
+    finally:
+        module.console = original_console
+
+
+@contextmanager
+def _suppress_child_output(enabled: bool) -> Iterator[io.StringIO | None]:
+    if not enabled:
+        yield None
+        return
+    from . import report as report_mod
+    from . import run as run_mod
+
+    buffer = io.StringIO()
+    quiet_console = Console(file=buffer, force_terminal=False, color_system=None)
+    with (
+        _override_console(run_mod, quiet_console),
+        _override_console(report_mod, quiet_console),
+    ):
+        yield buffer
+
+
+def _print_quiet_summary(
+    *,
+    cert_out: Path,
+    source: str,
+    edited: str,
+    profile: str,
+) -> None:
+    cert_path = cert_out / "evaluation.cert.json"
+    console.print(f"INVARLOCK v{INVARLOCK_VERSION} · CERTIFY")
+    console.print(f"Baseline: {source} -> Subject: {edited} · Profile: {profile}")
+    if not cert_path.exists():
+        console.print(f"Output: {cert_out}")
+        return
+    try:
+        with cert_path.open("r", encoding="utf-8") as fh:
+            certificate = json.load(fh)
+    except Exception:
+        console.print(f"Output: {cert_path}")
+        return
+    if not isinstance(certificate, dict):
+        console.print(f"Output: {cert_path}")
+        return
+    try:
+        from invarlock.reporting.render import (
+            compute_console_validation_block as _console_block,
+        )
+
+        block = _console_block(certificate)
+        rows = block.get("rows", [])
+        total = len(rows) if isinstance(rows, list) else 0
+        passed = (
+            sum(1 for row in rows if row.get("ok")) if isinstance(rows, list) else 0
+        )
+        status = "PASS" if block.get("overall_pass") else "FAIL"
+    except Exception:
+        total = 0
+        passed = 0
+        status = "UNKNOWN"
+    pm_ratio = _format_ratio(
+        (certificate.get("primary_metric") or {}).get("ratio_vs_baseline")
+    )
+    gate_summary = f"{passed}/{total} passed" if total else "N/A"
+    console.print(f"Status: {status} · Gates: {gate_summary}")
+    if pm_ratio != "N/A":
+        console.print(f"Primary metric ratio: {pm_ratio}")
+    console.print(f"Output: {cert_path}")
+
+
 def _latest_run_report(run_root: Path) -> Path | None:
     if not run_root.exists():
         return None
```
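
The `_override_console`/`_suppress_child_output` pair above implements `--quiet` by temporarily swapping the `console` globals of the child `run`/`report` modules for one that writes into an in-memory buffer. A minimal standalone sketch of that pattern, using a hypothetical `demo_module` stand-in rather than the real invarlock modules:

```python
import io
from contextlib import contextmanager
from types import SimpleNamespace

from rich.console import Console

# Hypothetical stand-in for a sibling CLI module that exposes a `console` global.
demo_module = SimpleNamespace(console=Console())


def noisy_step() -> None:
    demo_module.console.print("detail output that --quiet should hide")


@contextmanager
def override_console(module, new_console):
    # Swap the module-level console, always restoring it on exit.
    original = module.console
    module.console = new_console
    try:
        yield
    finally:
        module.console = original


buffer = io.StringIO()
with override_console(demo_module, Console(file=buffer, force_terminal=False)):
    noisy_step()  # Rich output now lands in `buffer` instead of the terminal

print("captured:", buffer.getvalue().strip())
```

Keeping the buffer around lets the caller replay the captured output when a child step fails, which is exactly what the `except` blocks later in this diff do.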
```diff
@@ -90,6 +229,14 @@ def certify_command(
     edited: str = typer.Option(
         ..., "--edited", "--subject", help="Subject model dir or Hub ID"
     ),
+    baseline_report: str | None = typer.Option(
+        None,
+        "--baseline-report",
+        help=(
+            "Reuse an existing baseline run report.json (skips baseline evaluation). "
+            "Must include stored evaluation windows (e.g., set INVARLOCK_STORE_EVAL_WINDOWS=1)."
+        ),
+    ),
     adapter: str = typer.Option(
         "auto", "--adapter", help="Adapter name or 'auto' to resolve"
     ),
```
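
A hedged usage sketch for the new `--baseline-report` flag: reusing a stored no-op baseline report so `certify` evaluates only the subject. The paths and model IDs are illustrative, and the call leans on the command's support for programmatic invocation, with the remaining options left at their defaults:

```python
# Hypothetical programmatic call; paths and model IDs are made up.
from invarlock.cli.commands.certify import certify_command

certify_command(
    source="gpt2",                                      # baseline checkpoint
    edited="./models/gpt2-quant-rtn",                   # subject checkpoint
    baseline_report="runs/source/latest/report.json",   # reused no-op report
    edit_label="custom",                                # label for a pre-edited model
)
```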
```diff
@@ -117,6 +264,31 @@ def certify_command(
     edit_config: str | None = typer.Option(
         None, "--edit-config", help="Edit preset to apply a demo edit (quant_rtn)"
     ),
+    edit_label: str | None = typer.Option(
+        None,
+        "--edit-label",
+        help=(
+            "Edit algorithm label for BYOE models. Use 'noop' for baseline, "
+            "'quant_rtn' etc. for built-in edits, 'custom' for pre-edited models."
+        ),
+    ),
+    quiet: bool = typer.Option(
+        False, "--quiet", "-q", help="Minimal output (suppress run/report detail)"
+    ),
+    verbose: bool = typer.Option(
+        False, "--verbose", "-v", help="Verbose output (include debug details)"
+    ),
+    banner: bool = typer.Option(
+        True, "--banner/--no-banner", help="Show header banner"
+    ),
+    style: str = typer.Option("audit", "--style", help="Output style (audit|friendly)"),
+    timing: bool = typer.Option(False, "--timing", help="Show timing summary"),
+    progress: bool = typer.Option(
+        True, "--progress/--no-progress", help="Show progress done messages"
+    ),
+    no_color: bool = typer.Option(
+        False, "--no-color", help="Disable ANSI colors (respects NO_COLOR=1)"
+    ),
 ):
     """Certify two checkpoints (baseline vs subject) with pinned windows."""
     # Support programmatic calls and Typer-invoked calls uniformly
@@ -132,6 +304,7 @@ def certify_command(
 
     source = _coerce_option(source)
     edited = _coerce_option(edited)
+    baseline_report = _coerce_option(baseline_report)
     adapter = _coerce_option(adapter, "auto")
     device = _coerce_option(device)
     profile = _coerce_option(profile, "ci")
@@ -140,22 +313,88 @@ def certify_command(
     out = _coerce_option(out, "runs")
     cert_out = _coerce_option(cert_out, "reports/cert")
     edit_config = _coerce_option(edit_config)
+    edit_label = _coerce_option(edit_label)
+    quiet = _coerce_option(quiet, False)
+    verbose = _coerce_option(verbose, False)
+    banner = _coerce_option(banner, True)
+    style = _coerce_option(style, "audit")
+    timing = bool(_coerce_option(timing, False))
+    progress = bool(_coerce_option(progress, True))
+    no_color = bool(_coerce_option(no_color, False))
+
+    verbosity = _resolve_verbosity(bool(quiet), bool(verbose))
+
+    if verbosity == VERBOSITY_QUIET:
+        progress = False
+        timing = False
+
+    from invarlock.cli.output import (
+        make_console,
+        perf_counter,
+        print_event,
+        print_timing_summary,
+        resolve_output_style,
+        timed_step,
+    )
+
+    output_style = resolve_output_style(
+        style=str(style),
+        profile=str(profile),
+        progress=bool(progress),
+        timing=bool(timing),
+        no_color=bool(no_color),
+    )
+    console = make_console(no_color=not output_style.color)
+    timings: dict[str, float] = {}
+    total_start: float | None = perf_counter() if output_style.timing else None
+
+    def _info(message: str, *, tag: str = "INFO", emoji: str | None = None) -> None:
+        if verbosity >= VERBOSITY_DEFAULT:
+            print_event(console, tag, message, style=output_style, emoji=emoji)
+
+    def _debug(msg: str) -> None:
+        if verbosity >= VERBOSITY_VERBOSE:
+            console.print(msg, markup=False)
+
+    def _fail(message: str, *, exit_code: int = 2) -> NoReturn:
+        print_event(console, "FAIL", message, style=output_style, emoji="❌")
+        raise typer.Exit(exit_code)
+
+    def _phase(index: int, total: int, title: str) -> None:
+        if verbosity >= VERBOSITY_DEFAULT:
+            console.print("")
+            _print_phase_header(console, _phase_title(index, total, title))
 
     src_id = str(source)
     edt_id = str(edited)
 
     # Resolve adapter when requested
     eff_adapter = adapter
-
+    adapter_auto = False
+    if str(adapter).strip().lower() in {"auto", "auto_hf"}:
         eff_adapter = resolve_auto_adapter(src_id)
-
+        adapter_auto = True
+
+    show_banner = bool(banner) and verbosity >= VERBOSITY_DEFAULT
+    if show_banner:
+        _print_header_banner(
+            console,
+            version=INVARLOCK_VERSION,
+            profile=profile,
+            tier=tier,
+            adapter=str(eff_adapter),
+        )
+        console.print("")
+
+    if adapter_auto:
+        _debug(f"Adapter:auto -> {eff_adapter}")
 
     # Choose preset. If none provided and repo preset is missing (pip install
     # scenario), fall back to a minimal built-in universal preset so the
     # flag-only quick start works without cloning the repo.
     default_universal = (
         Path("configs/presets/masked_lm/wikitext2_128.yaml")
-        if eff_adapter == "hf_bert"
+        if eff_adapter == "hf_mlm"
         else Path("configs/presets/causal_lm/wikitext2_512.yaml")
     )
     preset_path = Path(preset) if preset is not None else default_universal
```
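
The verbosity resolution added above collapses the two flags into one integer level; an illustrative truth table, assuming the constants and helper from this diff are in scope:

```python
# Quiet/verbose resolution sketch (not part of the diff).
assert _resolve_verbosity(False, False) == VERBOSITY_DEFAULT  # 1: normal output
assert _resolve_verbosity(True, False) == VERBOSITY_QUIET     # 0: also forces progress/timing off
assert _resolve_verbosity(False, True) == VERBOSITY_VERBOSE   # 2: enables _debug lines
# _resolve_verbosity(True, True) prints an error and raises typer.Exit(2)
```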
```diff
@@ -176,7 +415,13 @@ def certify_command(
         }
     else:
         if not preset_path.exists():
-
+            print_event(
+                console,
+                "FAIL",
+                f"Preset not found: {preset_path}",
+                style=output_style,
+                emoji="❌",
+            )
             raise typer.Exit(1)
         preset_data = _load_yaml(preset_path)
         # Do not hard-code device from presets in auto-generated certify configs;
@@ -201,6 +446,108 @@ def certify_command(
     if guards_order is None:
         guards_order = list(default_guards_order)
 
+    def _load_and_validate_baseline_report(
+        report_path: Path,
+        *,
+        expected_profile: str,
+        expected_tier: str,
+        expected_adapter: str,
+    ) -> Path:
+        candidate = Path(report_path).expanduser()
+        if not candidate.exists():
+            _fail(f"Baseline report not found: {candidate}")
+        resolved_report: Path | None = None
+        if candidate.is_dir():
+            direct = candidate / "report.json"
+            if direct.is_file():
+                resolved_report = direct
+            else:
+                resolved_report = _latest_run_report(candidate)
+        elif candidate.is_file():
+            resolved_report = candidate
+        if resolved_report is None or not resolved_report.is_file():
+            _fail(f"Baseline report not found: {candidate}")
+        resolved_report = resolved_report.resolve()
+        try:
+            with resolved_report.open("r", encoding="utf-8") as fh:
+                payload = json.load(fh)
+        except Exception as exc:  # noqa: BLE001
+            _fail(f"Baseline report is not valid JSON: {resolved_report} ({exc})")
+        if not isinstance(payload, dict):
+            _fail(f"Baseline report must be a JSON object: {resolved_report}")
+
+        edit_block = payload.get("edit")
+        edit_name = edit_block.get("name") if isinstance(edit_block, dict) else None
+        if edit_name != "noop":
+            _fail(
+                "Baseline report must be a no-op run (edit.name == 'noop'). "
+                f"Got edit.name={edit_name!r} in {resolved_report}"
+            )
+
+        meta = payload.get("meta")
+        if isinstance(meta, dict):
+            baseline_adapter = meta.get("adapter")
+            if (
+                isinstance(baseline_adapter, str)
+                and baseline_adapter != expected_adapter
+            ):
+                _fail(
+                    "Baseline report adapter mismatch. "
+                    f"Expected {expected_adapter!r}, got {baseline_adapter!r} in {resolved_report}"
+                )
+
+        context = payload.get("context")
+        if isinstance(context, dict):
+            baseline_profile = context.get("profile")
+            if (
+                isinstance(baseline_profile, str)
+                and baseline_profile.strip().lower() != expected_profile.strip().lower()
+            ):
+                _fail(
+                    "Baseline report profile mismatch. "
+                    f"Expected {expected_profile!r}, got {baseline_profile!r} in {resolved_report}"
+                )
+            auto_ctx = context.get("auto")
+            if isinstance(auto_ctx, dict):
+                baseline_tier = auto_ctx.get("tier")
+                if isinstance(baseline_tier, str) and baseline_tier != expected_tier:
+                    _fail(
+                        "Baseline report tier mismatch. "
+                        f"Expected {expected_tier!r}, got {baseline_tier!r} in {resolved_report}"
+                    )
+
+        eval_windows = payload.get("evaluation_windows")
+        if not isinstance(eval_windows, dict):
+            _fail(
+                "Baseline report missing evaluation window payloads. "
+                "Re-run baseline with INVARLOCK_STORE_EVAL_WINDOWS=1."
+            )
+
+        for phase_name in ("preview", "final"):
+            phase = eval_windows.get(phase_name)
+            if not isinstance(phase, dict):
+                _fail(
+                    f"Baseline report missing evaluation_windows.{phase_name} payloads. "
+                    "Re-run baseline with INVARLOCK_STORE_EVAL_WINDOWS=1."
+                )
+            window_ids = phase.get("window_ids")
+            input_ids = phase.get("input_ids")
+            if not isinstance(window_ids, list) or not window_ids:
+                _fail(
+                    f"Baseline report missing evaluation_windows.{phase_name}.window_ids."
+                )
+            if not isinstance(input_ids, list) or not input_ids:
+                _fail(
+                    f"Baseline report missing evaluation_windows.{phase_name}.input_ids."
+                )
+            if len(input_ids) != len(window_ids):
+                _fail(
+                    "Baseline report has inconsistent evaluation window payloads "
+                    f"for {phase_name}: input_ids={len(input_ids)} window_ids={len(window_ids)}."
+                )
+
+        return resolved_report
+
     # Create temp baseline config (no-op edit)
     # Normalize possible "hf:" prefixes for HF adapters
     norm_src_id = _normalize_model_id(src_id, eff_adapter)
```
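
Per the validator above, a reused baseline report must be a no-op run whose adapter, profile, and tier match the current invocation, and it must carry stored evaluation windows. A minimal illustrative payload that would pass these checks (all field values invented; real reports contain far more):

```python
# Invented example of the minimal report.json shape accepted by
# _load_and_validate_baseline_report; values are illustrative only.
minimal_baseline_report = {
    "edit": {"name": "noop"},                          # must be a no-op run
    "meta": {"adapter": "hf_causal"},                  # must match the resolved adapter
    "context": {"profile": "ci", "auto": {"tier": "balanced"}},
    "evaluation_windows": {
        "preview": {"window_ids": [0, 1], "input_ids": [[101, 2023], [101, 2054]]},
        "final": {"window_ids": [2, 3], "input_ids": [[101, 3899], [101, 4937]]},
    },
}
```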
```diff
@@ -221,39 +568,98 @@ def certify_command(
         },
     )
 
+    baseline_label = "noop"
+    subject_label: str | None = None
+    if edit_label:
+        subject_label = edit_label
+    elif not edit_config:
+        subject_label = "custom" if norm_src_id != norm_edt_id else "noop"
+
     tmp_dir = Path(".certify_tmp")
     tmp_dir.mkdir(parents=True, exist_ok=True)
-    baseline_yaml = tmp_dir / "baseline_noop.yaml"
-    _dump_yaml(baseline_yaml, baseline_cfg)
-
-    console.print("🏁 Running baseline (no-op edit)")
-    from .run import run_command as _run
-
-    _run(
-        config=str(baseline_yaml),
-        profile=profile,
-        out=str(Path(out) / "source"),
-        tier=tier,
-        device=device,
-    )
 
-
-    if
-
-
+    baseline_report_path: Path
+    if baseline_report:
+        _info(
+            "Using provided baseline report (skipping baseline evaluation)",
+            tag="EXEC",
+            emoji="♻️",
+        )
+        baseline_report_path = _load_and_validate_baseline_report(
+            Path(baseline_report),
+            expected_profile=profile,
+            expected_tier=tier,
+            expected_adapter=str(eff_adapter),
+        )
+        _debug(f"Baseline report: {baseline_report_path}")
+    else:
+        baseline_yaml = tmp_dir / "baseline_noop.yaml"
+        _dump_yaml(baseline_yaml, baseline_cfg)
+
+        _phase(1, 3, "BASELINE EVALUATION")
+        _info("Running baseline (no-op edit)", tag="EXEC", emoji="🏁")
+        _debug(f"Baseline config: {baseline_yaml}")
+        from .run import run_command as _run
+
+        with _suppress_child_output(verbosity == VERBOSITY_QUIET) as quiet_buffer:
+            try:
+                with timed_step(
+                    console=console,
+                    style=output_style,
+                    timings=timings,
+                    key="baseline",
+                    tag="EXEC",
+                    message="Baseline",
+                    emoji="🏁",
+                ):
+                    _run(
+                        config=str(baseline_yaml),
+                        profile=profile,
+                        out=str(Path(out) / "source"),
+                        tier=tier,
+                        device=device,
+                        edit_label=baseline_label,
+                        style=output_style.name,
+                        progress=progress,
+                        timing=False,
+                        no_color=no_color,
+                    )
+            except Exception:
+                if quiet_buffer is not None:
+                    console.print(quiet_buffer.getvalue(), markup=False)
+                raise
+
+        baseline_report_path_candidate = _latest_run_report(Path(out) / "source")
+        if not baseline_report_path_candidate:
+            _fail("Could not locate baseline report after run", exit_code=1)
+        baseline_report_path = baseline_report_path_candidate
+        _debug(f"Baseline report: {baseline_report_path}")
 
     # Edited run: either no-op (Compare & Certify) or provided edit_config (demo edit)
+    _phase(2, 3, "SUBJECT EVALUATION")
     if edit_config:
         edited_yaml = Path(edit_config)
         if not edited_yaml.exists():
-
+            print_event(
+                console,
+                "FAIL",
+                f"Edit config not found: {edited_yaml}",
+                style=output_style,
+                emoji="❌",
+            )
             raise typer.Exit(1)
-
+        _info("Running edited (demo edit via --edit-config)", tag="EXEC", emoji="✂️")
         # Overlay subject model id/adapter and output/context onto the provided edit config
         try:
             cfg_loaded: dict[str, Any] = _load_yaml(edited_yaml)
         except Exception as exc:  # noqa: BLE001
-
+            print_event(
+                console,
+                "FAIL",
+                f"Failed to load edit config: {exc}",
+                style=output_style,
+                emoji="❌",
+            )
             raise typer.Exit(1) from exc
 
         # Ensure model.id/adapter point to the requested subject
@@ -282,23 +688,58 @@ def certify_command(
                 "context": {"profile": profile, "tier": tier},
             },
         )
+        # Ensure the edited run always has a guard chain. Presets/edit configs
+        # often omit it, but `invarlock run` expects guards.order.
+        guards_block = merged_edited_cfg.get("guards")
+        guards_order_cfg = (
+            guards_block.get("order") if isinstance(guards_block, dict) else None
+        )
+        if not (
+            isinstance(guards_order_cfg, list)
+            and guards_order_cfg
+            and all(isinstance(item, str) for item in guards_order_cfg)
+        ):
+            merged_edited_cfg = _merge(
+                merged_edited_cfg, {"guards": {"order": guards_order}}
+            )
 
         # Persist a temporary merged config for traceability
         tmp_dir = Path(".certify_tmp")
         tmp_dir.mkdir(parents=True, exist_ok=True)
         edited_merged_yaml = tmp_dir / "edited_merged.yaml"
         _dump_yaml(edited_merged_yaml, merged_edited_cfg)
+        _debug(f"Edited config (merged): {edited_merged_yaml}")
 
         from .run import run_command as _run
 
-
-
-
-
-
-
-
-
+        with _suppress_child_output(verbosity == VERBOSITY_QUIET) as quiet_buffer:
+            try:
+                with timed_step(
+                    console=console,
+                    style=output_style,
+                    timings=timings,
+                    key="subject",
+                    tag="EXEC",
+                    message="Subject",
+                    emoji="✂️",
+                ):
+                    _run(
+                        config=str(edited_merged_yaml),
+                        profile=profile,
+                        out=str(Path(out) / "edited"),
+                        tier=tier,
+                        baseline=str(baseline_report_path),
+                        device=device,
+                        edit_label=subject_label if edit_label else None,
+                        style=output_style.name,
+                        progress=progress,
+                        timing=False,
+                        no_color=no_color,
+                    )
+            except Exception:
+                if quiet_buffer is not None:
+                    console.print(quiet_buffer.getvalue(), markup=False)
+                raise
     else:
         edited_cfg = _merge(
             preset_data,
@@ -313,22 +754,96 @@ def certify_command(
         )
         edited_yaml = tmp_dir / "edited_noop.yaml"
         _dump_yaml(edited_yaml, edited_cfg)
-
+        _info("Running edited (no-op, Compare & Certify)", tag="EXEC", emoji="🧪")
+        _debug(f"Edited config: {edited_yaml}")
        from .run import run_command as _run
 
-
-
-
-
-
-
-
-
+        with _suppress_child_output(verbosity == VERBOSITY_QUIET) as quiet_buffer:
+            try:
+                with timed_step(
+                    console=console,
+                    style=output_style,
+                    timings=timings,
+                    key="subject",
+                    tag="EXEC",
+                    message="Subject",
+                    emoji="🧪",
+                ):
+                    _run(
+                        config=str(edited_yaml),
+                        profile=profile,
+                        out=str(Path(out) / "edited"),
+                        tier=tier,
+                        baseline=str(baseline_report_path),
+                        device=device,
+                        edit_label=subject_label,
+                        style=output_style.name,
+                        progress=progress,
+                        timing=False,
+                        no_color=no_color,
+                    )
+            except Exception:
+                if quiet_buffer is not None:
+                    console.print(quiet_buffer.getvalue(), markup=False)
+                raise
 
     edited_report = _latest_run_report(Path(out) / "edited")
     if not edited_report:
-
+        print_event(
+            console,
+            "FAIL",
+            "Could not locate edited report after run",
+            style=output_style,
+            emoji="❌",
+        )
         raise typer.Exit(1)
+    _debug(f"Edited report: {edited_report}")
+
+    _phase(3, 3, "CERTIFICATE GENERATION")
+
+    def _emit_certificate() -> None:
+        _info("Emitting certificate", tag="EXEC", emoji="📜")
+        with _suppress_child_output(verbosity == VERBOSITY_QUIET) as quiet_buffer:
+            try:
+                with timed_step(
+                    console=console,
+                    style=output_style,
+                    timings=timings,
+                    key="certificate",
+                    tag="EXEC",
+                    message="Certificate",
+                    emoji="📜",
+                ):
+                    report_kwargs = {
+                        "run": str(edited_report),
+                        "format": "cert",
+                        "baseline": str(baseline_report_path),
+                        "output": cert_out,
+                        "style": output_style.name,
+                        "no_color": no_color,
+                    }
+                    try:
+                        sig = inspect.signature(_report)
+                    except (TypeError, ValueError):
+                        _report(**report_kwargs)
+                    else:
+                        if any(
+                            param.kind == inspect.Parameter.VAR_KEYWORD
+                            for param in sig.parameters.values()
+                        ):
+                            _report(**report_kwargs)
+                        else:
+                            _report(
+                                **{
+                                    key: value
+                                    for key, value in report_kwargs.items()
+                                    if key in sig.parameters
+                                }
+                            )
+            except Exception:
+                if quiet_buffer is not None:
+                    console.print(quiet_buffer.getvalue(), markup=False)
+                raise
 
     # CI/Release hard‑abort: fail fast when primary metric is not computable.
     try:
```
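
`_emit_certificate` probes the report command with `inspect.signature` so newer keyword arguments degrade gracefully against older `report` signatures. A standalone sketch of that kwargs-filtering pattern, with a hypothetical `report` callee:

```python
# Sketch of the signature-filtering pattern used by _emit_certificate: pass
# everything through when the callee accepts **kwargs, otherwise drop the
# keys it does not declare. `report` is a stand-in, not the invarlock command.
import inspect


def report(run: str, format: str = "cert") -> None:
    print(f"report(run={run!r}, format={format!r})")


kwargs = {"run": "runs/edited/report.json", "format": "cert", "style": "audit"}
try:
    sig = inspect.signature(report)
except (TypeError, ValueError):  # some C callables expose no signature
    report(**kwargs)
else:
    if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()):
        report(**kwargs)
    else:
        report(**{k: v for k, v in kwargs.items() if k in sig.parameters})
```

Callables without retrievable signatures raise `TypeError` or `ValueError` from `inspect.signature`, hence the pass-through fallback.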
```diff
@@ -340,7 +855,13 @@ def certify_command(
         with Path(edited_report).open("r", encoding="utf-8") as fh:
             edited_payload = json.load(fh)
     except Exception as exc:  # noqa: BLE001
-
+        print_event(
+            console,
+            "FAIL",
+            f"Failed to read edited report: {exc}",
+            style=output_style,
+            emoji="❌",
+        )
         raise typer.Exit(1) from exc
 
     def _finite(x: Any) -> bool:
@@ -382,8 +903,12 @@ def certify_command(
             if (not _finite(pm_prev) or not _finite(pm_final))
             else "primary_metric_degraded"
         )
-
-
+        print_event(
+            console,
+            "WARN",
+            "Primary metric degraded or non-finite; emitting certificate and marking task degraded. Primary metric computation failed.",
+            style=output_style,
+            emoji="⚠️",
         )
         pm["degraded"] = True
         pm["invalid"] = pm.get("invalid") or True
@@ -395,12 +920,7 @@ def certify_command(
         edited_payload.setdefault("metrics", {}).update(metrics)
 
         # Emit the certificate for inspection, then exit with a CI-visible error.
-        _report(
-            run=str(edited_report),
-            format="cert",
-            baseline=str(baseline_report),
-            output=cert_out,
-        )
+        _emit_certificate()
         err = MetricsError(
             code="E111",
             message=f"Primary metric degraded or non-finite ({degraded_reason}).",
@@ -413,10 +933,31 @@ def certify_command(
         )
         raise typer.Exit(_resolve_exit_code(err, profile=profile))
 
-
-
-
-
-
-
-
+    _emit_certificate()
+    if timing:
+        if total_start is not None:
+            timings["total"] = max(0.0, float(perf_counter() - total_start))
+        else:
+            timings["total"] = (
+                float(timings.get("baseline", 0.0))
+                + float(timings.get("subject", 0.0))
+                + float(timings.get("certificate", 0.0))
+            )
+        print_timing_summary(
+            console,
+            timings,
+            style=output_style,
+            order=[
+                ("Baseline", "baseline"),
+                ("Subject", "subject"),
+                ("Certificate", "certificate"),
+                ("Total", "total"),
+            ],
+        )
+    if verbosity == VERBOSITY_QUIET:
+        _print_quiet_summary(
+            cert_out=Path(cert_out),
+            source=src_id,
+            edited=edt_id,
+            profile=profile,
+        )
```