invarlock 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +2 -2
- invarlock/adapters/__init__.py +10 -14
- invarlock/adapters/auto.py +35 -40
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_mixin.py +25 -4
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/cli/adapter_auto.py +31 -21
- invarlock/cli/app.py +73 -2
- invarlock/cli/commands/certify.py +600 -59
- invarlock/cli/commands/doctor.py +8 -10
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +233 -69
- invarlock/cli/commands/run.py +907 -183
- invarlock/cli/commands/verify.py +76 -11
- invarlock/cli/config.py +1 -1
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +1 -1
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/registry.py +9 -11
- invarlock/core/runner.py +111 -25
- invarlock/edits/quant_rtn.py +65 -37
- invarlock/eval/bench.py +3 -3
- invarlock/eval/data.py +68 -23
- invarlock/eval/metrics.py +59 -1
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/rmt.py +2 -2
- invarlock/guards/variance.py +2 -2
- invarlock/model_profile.py +48 -27
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/reporting/certificate.py +159 -9
- invarlock/reporting/certificate_schema.py +1 -1
- invarlock/reporting/guards_analysis.py +154 -4
- invarlock/reporting/html.py +55 -5
- invarlock/reporting/normalizer.py +7 -0
- invarlock/reporting/render.py +791 -431
- invarlock/reporting/report.py +39 -3
- invarlock/reporting/report_types.py +6 -1
- invarlock/reporting/telemetry.py +86 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/METADATA +23 -9
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/RECORD +53 -48
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
invarlock/cli/commands/verify.py
CHANGED

@@ -2,7 +2,7 @@
 invarlock verify command
 ====================
 
-Validates generated
+Validates generated evaluation certificates for internal consistency. The command
 ensures schema compliance, checks that the primary metric ratio agrees with the
 baseline reference, and enforces paired-window guarantees (match=1.0,
 overlap=0.0).

@@ -66,9 +66,25 @@ def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
         errors.append("Certificate missing primary_metric block.")
         return errors
 
+    def _is_finite_number(value: Any) -> bool:
+        return isinstance(value, (int, float)) and math.isfinite(float(value))
+
+    def _declares_invalid_primary_metric(metric: dict[str, Any]) -> bool:
+        if bool(metric.get("invalid")):
+            return True
+        reason = metric.get("degraded_reason")
+        if isinstance(reason, str):
+            r = reason.strip().lower()
+            return r.startswith("non_finite") or r in {
+                "primary_metric_invalid",
+                "evaluation_error",
+            }
+        return False
+
     kind = str(pm.get("kind", "")).lower()
     ratio_vs_baseline = pm.get("ratio_vs_baseline")
     final = pm.get("final")
+    pm_invalid = _declares_invalid_primary_metric(pm)
 
     if kind.startswith("ppl"):
         baseline_ref = certificate.get("baseline_ref", {}) or {}

@@ -82,16 +98,14 @@ def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
             bv = baseline_pm.get("final")
             if isinstance(bv, (int | float)):
                 baseline_final = float(bv)
-        if
-            if baseline_final <= 0.0:
+        if _is_finite_number(final) and _is_finite_number(baseline_final):
+            if float(baseline_final) <= 0.0:
                 errors.append(
                     f"Baseline final must be > 0.0 to compute ratio (found {baseline_final})."
                 )
             else:
                 expected_ratio = float(final) / float(baseline_final)
-                if not
-                    float(ratio_vs_baseline)
-                ):
+                if not _is_finite_number(ratio_vs_baseline):
                     errors.append(
                         "Certificate is missing a finite primary_metric.ratio_vs_baseline value."
                     )

@@ -102,7 +116,18 @@ def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
                         "Primary metric ratio mismatch: "
                         f"recorded={float(ratio_vs_baseline):.12f}, expected={expected_ratio:.12f}"
                     )
+        else:
+            # If the primary metric is non-finite, it must be explicitly marked invalid.
+            # This is expected for structural error-injection runs (NaN/Inf weights).
+            if (isinstance(final, (int | float)) and not _is_finite_number(final)) and (
+                not pm_invalid
+            ):
+                errors.append(
+                    "Primary metric final is non-finite but primary_metric.invalid is not set."
+                )
     else:
+        if pm_invalid:
+            return errors
         if ratio_vs_baseline is None or not isinstance(ratio_vs_baseline, int | float):
             errors.append(
                 "Certificate missing primary_metric.ratio_vs_baseline for non-ppl metric."

@@ -194,14 +219,29 @@ def _validate_counts(certificate: dict[str, Any]) -> list[str]:
 
 
 def _validate_drift_band(certificate: dict[str, Any]) -> list[str]:
-    """Validate preview→final drift stays within the configured band
+    """Validate preview→final drift stays within the configured band.
+
+    Defaults to 0.95–1.05 unless the certificate provides `primary_metric.drift_band`.
+    """
     errors: list[str] = []
     pm = certificate.get("primary_metric", {}) or {}
+    if not isinstance(pm, dict) or not pm:
+        errors.append("Certificate missing primary_metric block.")
+        return errors
+    if bool(pm.get("invalid")):
+        # Drift is undefined when the primary metric is invalid (e.g., NaN/Inf weights).
+        return errors
     drift_ratio = None
     try:
         prev = pm.get("preview")
         fin = pm.get("final")
-        if
+        if (
+            isinstance(prev, int | float)
+            and isinstance(fin, int | float)
+            and math.isfinite(float(prev))
+            and math.isfinite(float(fin))
+            and prev > 0
+        ):
             drift_ratio = float(fin) / float(prev)
     except Exception:
         drift_ratio = None

@@ -210,9 +250,33 @@ def _validate_drift_band(certificate: dict[str, Any]) -> list[str]:
         errors.append("Certificate missing preview/final to compute drift ratio.")
         return errors
 
-
+    drift_min = 0.95
+    drift_max = 1.05
+    band = pm.get("drift_band")
+    try:
+        if isinstance(band, dict):
+            lo = band.get("min")
+            hi = band.get("max")
+            if isinstance(lo, int | float) and isinstance(hi, int | float):
+                lo_f = float(lo)
+                hi_f = float(hi)
+                if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
+                    drift_min = lo_f
+                    drift_max = hi_f
+        elif isinstance(band, list | tuple) and len(band) == 2:
+            lo_raw, hi_raw = band[0], band[1]
+            if isinstance(lo_raw, int | float) and isinstance(hi_raw, int | float):
+                lo_f = float(lo_raw)
+                hi_f = float(hi_raw)
+                if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
+                    drift_min = lo_f
+                    drift_max = hi_f
+    except Exception:
+        pass
+
+    if not drift_min <= float(drift_ratio) <= drift_max:
         errors.append(
-            f"Preview→final drift ratio out of band (
+            f"Preview→final drift ratio out of band ({drift_min:.2f}–{drift_max:.2f}): observed {drift_ratio:.6f}."
         )
 
     return errors

@@ -406,7 +470,8 @@ def _validate_certificate_payload(
         )
     except Exception:
         prof = "dev"
-    #
+    # Drift band is a CI/Release enforcement check; dev profile should not
+    # fail verification due to preview→final drift.
     if prof in {"ci", "release"}:
         errors.extend(_validate_drift_band(certificate))
     errors.extend(_apply_profile_lints(certificate))
invarlock/cli/config.py
CHANGED

@@ -415,7 +415,7 @@ def _deep_merge_dicts(a: dict, b: dict) -> dict:  # pragma: no cover - trivial a
 
 def create_example_config() -> InvarLockConfig:  # pragma: no cover - test helper
     return InvarLockConfig(
-        model={"id": "gpt2", "adapter": "
+        model={"id": "gpt2", "adapter": "hf_causal", "device": "auto"},
         edit={"name": "quant_rtn", "plan": {}},
         dataset={"provider": "wikitext2", "seq_len": 512, "stride": 512},
         output={"dir": "runs"},
invarlock/cli/doctor_helpers.py
CHANGED

@@ -8,7 +8,7 @@ from typing import Any
 def get_adapter_rows() -> list[dict[str, Any]]:
     """Build adapter rows similar to doctor output for testing.
 
-    Applies optional-extra detection for
+    Applies optional-extra detection for hf_causal_onnx (optimum/onnxruntime) even if
     registered as a core adapter, so missing extras are surfaced.
     """
     from invarlock.core.registry import get_registry

@@ -29,13 +29,12 @@ def get_adapter_rows() -> list[dict[str, Any]]:
         module = str(info.get("module") or "")
         support = (
             "auto"
-            if module.startswith("invarlock.adapters")
-            and name in {"hf_causal_auto", "hf_mlm_auto"}
+            if module.startswith("invarlock.adapters") and name in {"hf_auto"}
             else ("core" if module.startswith("invarlock.adapters") else "optional")
         )
         backend, status, enable = None, "ready", ""
 
-        if name in {"
+        if name in {"hf_causal", "hf_mlm", "hf_seq2seq", "hf_auto"}:
             backend = "transformers"
         elif name == "hf_gptq":
             backend = "auto-gptq"

@@ -49,7 +48,7 @@ def get_adapter_rows() -> list[dict[str, Any]]:
             backend = "bitsandbytes"
             if not has_cuda:
                 status, enable = "unsupported", "Requires CUDA"
-        elif name == "
+        elif name == "hf_causal_onnx":
             backend = "onnxruntime"
             present = (
                 importlib.util.find_spec("optimum.onnxruntime") is not None
invarlock/cli/output.py
ADDED

@@ -0,0 +1,193 @@
+from __future__ import annotations
+
+import os
+import time
+from collections.abc import Iterator
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import TextIO
+
+from rich.console import Console
+
+_STYLE_AUDIT = "audit"
+_STYLE_FRIENDLY = "friendly"
+_VALID_STYLES = {_STYLE_AUDIT, _STYLE_FRIENDLY}
+
+
+def _safe_console_print(console: Console, *args: object, **kwargs: object) -> None:
+    try:
+        console.print(*args, **kwargs)
+    except TypeError:
+        console.print(*args)
+
+
+def env_no_color() -> bool:
+    """Return True when NO_COLOR is set (value-agnostic)."""
+    return bool(str(os.environ.get("NO_COLOR", "")).strip())
+
+
+def perf_counter() -> float:
+    return time.perf_counter()
+
+
+@dataclass(frozen=True, slots=True)
+class OutputStyle:
+    name: str
+    progress: bool = False
+    timing: bool = False
+    color: bool = True
+
+    @property
+    def emojis(self) -> bool:
+        return self.name != _STYLE_AUDIT
+
+    @property
+    def audit(self) -> bool:
+        return self.name == _STYLE_AUDIT
+
+
+def normalize_style(style: str | None) -> str | None:
+    if style is None:
+        return None
+    value = str(style).strip().lower()
+    if not value:
+        return None
+    return value if value in _VALID_STYLES else None
+
+
+def resolve_style_name(style: str | None, profile: str | None) -> str:
+    normalized = normalize_style(style)
+    if normalized is not None:
+        return normalized
+    profile_norm = str(profile or "").strip().lower()
+    if profile_norm in {"ci", "ci_cpu", "release"}:
+        return _STYLE_AUDIT
+    return _STYLE_FRIENDLY
+
+
+def resolve_output_style(
+    *,
+    style: str | None,
+    profile: str | None,
+    progress: bool = False,
+    timing: bool = False,
+    no_color: bool = False,
+) -> OutputStyle:
+    name = resolve_style_name(style, profile)
+    color_enabled = not (bool(no_color) or env_no_color())
+    return OutputStyle(
+        name=name,
+        progress=bool(progress),
+        timing=bool(timing),
+        color=color_enabled,
+    )
+
+
+def make_console(
+    *,
+    file: TextIO | None = None,
+    force_terminal: bool | None = None,
+    no_color: bool | None = None,
+) -> Console:
+    if no_color is None:
+        no_color = env_no_color()
+    if no_color:
+        color_system = None
+    else:
+        color_system = "standard" if force_terminal else "auto"
+    return Console(
+        file=file,
+        force_terminal=force_terminal,
+        no_color=bool(no_color),
+        color_system=color_system,
+    )
+
+
+def format_event_line(
+    tag: str,
+    message: str,
+    *,
+    style: OutputStyle,
+    emoji: str | None = None,
+) -> str:
+    tag_norm = str(tag or "").strip().upper() or "INFO"
+    if style.emojis and emoji:
+        prefix = emoji
+    else:
+        prefix = f"[{tag_norm}]"
+    msg = str(message or "").rstrip()
+    return f"{prefix} {msg}".rstrip()
+
+
+def print_event(
+    console: Console,
+    tag: str,
+    message: str,
+    *,
+    style: OutputStyle,
+    emoji: str | None = None,
+    console_style: str | None = None,
+) -> None:
+    line = format_event_line(tag, message, style=style, emoji=emoji)
+    if console_style is None and style.color:
+        tag_norm = str(tag or "").strip().upper()
+        if tag_norm in {"PASS"}:
+            console_style = "green"
+        elif tag_norm in {"FAIL", "ERROR"}:
+            console_style = "red"
+        elif tag_norm in {"WARN", "WARNING"}:
+            console_style = "yellow"
+        elif tag_norm in {"METRIC"}:
+            console_style = "cyan"
+    _safe_console_print(console, line, style=console_style, markup=False)
+
+
+@contextmanager
+def timed_step(
+    *,
+    console: Console,
+    style: OutputStyle,
+    timings: dict[str, float] | None,
+    key: str,
+    tag: str,
+    message: str,
+    emoji: str | None = None,
+) -> Iterator[None]:
+    start = perf_counter()
+    try:
+        yield
+    finally:
+        elapsed = max(0.0, float(perf_counter() - start))
+        if timings is not None:
+            timings[key] = elapsed
+        if style.progress:
+            print_event(
+                console,
+                tag,
+                f"{message} done ({elapsed:.2f}s)",
+                style=style,
+                emoji=emoji,
+            )
+
+
+def print_timing_summary(
+    console: Console,
+    timings: dict[str, float],
+    *,
+    style: OutputStyle,
+    order: list[tuple[str, str]],
+    extra_lines: list[str] | None = None,
+) -> None:
+    if not style.timing:
+        return
+    _safe_console_print(console, "", markup=False)
+    _safe_console_print(console, "TIMING SUMMARY", markup=False)
+    for label, key in order:
+        if key not in timings:
+            continue
+        _safe_console_print(
+            console, f"  {label:<11}: {timings[key]:.2f}s", markup=False
+        )
+    if extra_lines:
+        for line in extra_lines:
+            _safe_console_print(console, line, markup=False)
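
Note: a usage sketch built only from the signatures in the added file above. With `profile="ci"` the style resolves to audit mode (plain `[TAG]` prefixes, no emojis), and `make_console` honors `NO_COLOR` automatically.

from invarlock.cli.output import (
    make_console,
    print_event,
    print_timing_summary,
    resolve_output_style,
    timed_step,
)

style = resolve_output_style(style=None, profile="ci", progress=True, timing=True)
console = make_console()

timings: dict[str, float] = {}
with timed_step(
    console=console,
    style=style,
    timings=timings,
    key="load",
    tag="INFO",
    message="Loading model",
):
    ...  # timed work; on exit prints "[INFO] Loading model done (0.00s)"

print_event(console, "PASS", "All invariants held", style=style)  # green when color is on
print_timing_summary(console, timings, style=style, order=[("Load", "load")])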
invarlock/cli/provenance.py
CHANGED

@@ -31,7 +31,7 @@ _FAMILY_MAP: dict[str, tuple[str, str, list[str]]] = {
     "hf_awq": ("awq", "autoawq", []),
     "hf_bnb": ("bnb", "bitsandbytes", []),
     # ONNX stack (requires extras: invarlock[onnx])
-    "
+    "hf_causal_onnx": ("onnx", "onnxruntime", []),
 }
 
 
invarlock/core/bootstrap.py
CHANGED

@@ -6,7 +6,7 @@ Numerically stable bootstrap helpers for evaluation metrics.
 
 This module provides bias-corrected and accelerated (BCa) confidence
 intervals tailored for paired log-loss statistics used by the runner
-and
+and evaluation certificate reports.
 """
 
 from __future__ import annotations
invarlock/core/registry.py
CHANGED

@@ -140,23 +140,21 @@ class CoreRegistry:
         )
 
         # Register built-in adapters
-        _fallback(self._adapters, "hf_gpt2", "invarlock.adapters", "HF_GPT2_Adapter")
-        _fallback(self._adapters, "hf_bert", "invarlock.adapters", "HF_BERT_Adapter")
-        _fallback(self._adapters, "hf_llama", "invarlock.adapters", "HF_LLaMA_Adapter")
-        _fallback(self._adapters, "hf_t5", "invarlock.adapters", "HF_T5_Adapter")
         _fallback(
-            self._adapters, "
+            self._adapters, "hf_causal", "invarlock.adapters", "HF_Causal_Adapter"
         )
-
+        _fallback(self._adapters, "hf_mlm", "invarlock.adapters", "HF_MLM_Adapter")
         _fallback(
-            self._adapters,
-            "hf_causal_auto",
-            "invarlock.adapters",
-            "HF_Causal_Auto_Adapter",
+            self._adapters, "hf_seq2seq", "invarlock.adapters", "HF_Seq2Seq_Adapter"
         )
         _fallback(
-            self._adapters,
+            self._adapters,
+            "hf_causal_onnx",
+            "invarlock.adapters",
+            "HF_Causal_ONNX_Adapter",
+            required_deps=["optimum"],
         )
+        _fallback(self._adapters, "hf_auto", "invarlock.adapters", "HF_Auto_Adapter")
         # Optional plugin adapters (verify runtime dependencies)
         _fallback(
             self._adapters,
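
Note: the registrations above, together with the file renames in the summary (hf_gpt2.py and hf_llama.py removed, hf_causal.py added, hf_bert.py → hf_mlm.py, hf_t5.py → hf_seq2seq.py, hf_onnx.py → hf_causal_onnx.py), imply the adapter-name migration sketched below. The mapping and helper are illustrative, not shipped with the package; the hf_gpt2/hf_llama → hf_causal entries are inferred from the removed model-specific adapters.

# Illustrative 0.3.6 -> 0.3.7 adapter-name mapping (not part of invarlock).
_ADAPTER_RENAMES = {
    "hf_gpt2": "hf_causal",    # inferred: model-specific adapters folded into hf_causal
    "hf_llama": "hf_causal",   # inferred
    "hf_bert": "hf_mlm",
    "hf_t5": "hf_seq2seq",
    "hf_onnx": "hf_causal_onnx",
    "hf_causal_auto": "hf_auto",
}

def migrate_adapter_name(name: str) -> str:
    """Map an old adapter key to its 0.3.7 equivalent; unknown names pass through."""
    return _ADAPTER_RENAMES.get(name, name)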
invarlock/core/runner.py
CHANGED

@@ -19,6 +19,11 @@ from typing import Any
 import numpy as np
 
 from invarlock.eval.tail_stats import evaluate_metric_tail
+from invarlock.observability.metrics import (
+    capture_memory_snapshot,
+    reset_peak_memory_stats,
+    summarize_memory_snapshots,
+)
 
 from .api import (
     EditLike,

@@ -190,6 +195,18 @@ class CoreRunner:
                 pass
 
         report.status = RunStatus.RUNNING.value
+        timings: dict[str, float] = {}
+        guard_timings: dict[str, float] = {}
+        memory_snapshots: list[dict[str, Any]] = []
+        total_start = time.perf_counter()
+
+        def _record_timing(key: str, start: float) -> None:
+            timings[key] = max(0.0, float(time.perf_counter() - start))
+
+        def _capture_memory(phase: str) -> None:
+            snapshot = capture_memory_snapshot(phase)
+            if snapshot:
+                memory_snapshots.append(snapshot)
 
         try:
             # Log start

@@ -205,40 +222,78 @@
             )
 
             # Phase 1: Prepare (describe model, create checkpoint)
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                model_desc = self._prepare_phase(model, adapter, report)
+            finally:
+                _record_timing("prepare", phase_start)
+                _capture_memory("prepare")
 
             # Phase 2: Prepare guards (must happen before edit)
-
-
-
-
-
-
-
-
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                self._prepare_guards_phase(
+                    model,
+                    adapter,
+                    guards,
+                    calibration_data,
+                    report,
+                    auto_config,
+                    config,
+                )
+            finally:
+                _record_timing("prepare_guards", phase_start)
+                _capture_memory("prepare_guards")
 
             # Phase 3: Apply edit
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                self._edit_phase(model, adapter, edit, model_desc, report, edit_config)
+            finally:
+                _record_timing("edit", phase_start)
+                _capture_memory("edit")
 
             # Phase 4: Run guards
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                guard_results = self._guard_phase(
+                    model, adapter, guards, report, guard_timings=guard_timings
+                )
+            finally:
+                _record_timing("guards", phase_start)
+                _capture_memory("guards")
 
             # Phase 5: Evaluate final metrics
-
-
-
-
-
-
-
-
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                metrics = self._eval_phase(
+                    model,
+                    adapter,
+                    calibration_data,
+                    report,
+                    preview_n,
+                    final_n,
+                    config,
+                )
+            finally:
+                _record_timing("eval", phase_start)
+                _capture_memory("eval")
 
             # Phase 6: Finalize or rollback
-
-
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                final_status = self._finalize_phase(
+                    model, adapter, guard_results, metrics, config, report
+                )
+            finally:
+                _record_timing("finalize", phase_start)
+                _capture_memory("finalize")
 
             report.status = final_status
             report.meta["end_time"] = time.time()

@@ -260,6 +315,25 @@
             return report
 
         finally:
+            _record_timing("total", total_start)
+            if not isinstance(report.metrics, dict):
+                report.metrics = {}
+            if timings:
+                report.metrics.setdefault("timings", {}).update(timings)
+            if guard_timings:
+                report.metrics["guard_timings"] = guard_timings
+            if memory_snapshots:
+                report.metrics["memory_snapshots"] = memory_snapshots
+                summary = summarize_memory_snapshots(memory_snapshots)
+                if summary:
+                    mem_peak = summary.get("memory_mb_peak")
+                    if isinstance(mem_peak, (int | float)):
+                        existing = report.metrics.get("memory_mb_peak")
+                        if isinstance(existing, (int | float)):
+                            summary["memory_mb_peak"] = max(
+                                float(existing), float(mem_peak)
+                            )
+                    report.metrics.update(summary)
             self._active_model = None
             self._active_adapter = None
             self._cleanup_services()

@@ -455,7 +529,13 @@
         )
 
     def _guard_phase(
-        self,
+        self,
+        model: Any,
+        adapter: ModelAdapter,
+        guards: list[Guard],
+        report: RunReport,
+        *,
+        guard_timings: dict[str, float] | None = None,
     ) -> dict[str, dict[str, Any]]:
         """Phase 4: Run safety guards."""
         self._log_event("guards", "start", LogLevel.INFO, {"count": len(guards)})

@@ -464,6 +544,7 @@
 
         for guard in guards:
             self._log_event("guard", "start", LogLevel.INFO, {"guard": guard.name})
+            guard_start = time.perf_counter()
 
             if isinstance(guard, GuardWithContext):
                 try:

@@ -497,6 +578,11 @@
                         LogLevel.ERROR,
                         {"guard": guard.name, "error": str(e)},
                     )
+            finally:
+                if guard_timings is not None:
+                    guard_timings[guard.name] = max(
+                        0.0, float(time.perf_counter() - guard_start)
+                    )
 
         report.guards = guard_results
 
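
Note: a condensed sketch of the per-phase instrumentation pattern the runner now applies to all six phases. The observability helpers are the ones imported above; the `run_phases` driver itself is illustrative, not the actual CoreRunner code.

import time
from collections.abc import Callable
from typing import Any

from invarlock.observability.metrics import (
    capture_memory_snapshot,
    reset_peak_memory_stats,
    summarize_memory_snapshots,
)

def run_phases(phases: dict[str, Callable[[], Any]]) -> dict[str, Any]:
    timings: dict[str, float] = {}
    snapshots: list[dict[str, Any]] = []
    for name, fn in phases.items():
        reset_peak_memory_stats()  # scope peak-memory stats to this phase
        start = time.perf_counter()
        try:
            fn()
        finally:
            # Timings and snapshots are recorded even when a phase raises.
            timings[name] = max(0.0, float(time.perf_counter() - start))
            snap = capture_memory_snapshot(name)
            if snap:
                snapshots.append(snap)
    metrics: dict[str, Any] = {"timings": timings}
    if snapshots:
        metrics["memory_snapshots"] = snapshots
        metrics.update(summarize_memory_snapshots(snapshots) or {})
    return metrics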