invarlock 0.3.6-py3-none-any.whl → 0.3.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +4 -4
- invarlock/adapters/__init__.py +10 -14
- invarlock/adapters/auto.py +37 -50
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_loading.py +7 -7
- invarlock/adapters/hf_mixin.py +53 -9
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/assurance/__init__.py +15 -23
- invarlock/cli/adapter_auto.py +32 -26
- invarlock/cli/app.py +128 -27
- invarlock/cli/commands/__init__.py +2 -2
- invarlock/cli/commands/calibrate.py +48 -4
- invarlock/cli/commands/doctor.py +8 -10
- invarlock/cli/commands/evaluate.py +986 -0
- invarlock/cli/commands/explain_gates.py +25 -17
- invarlock/cli/commands/export_html.py +11 -9
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +326 -92
- invarlock/cli/commands/run.py +1160 -228
- invarlock/cli/commands/verify.py +157 -97
- invarlock/cli/config.py +1 -1
- invarlock/cli/determinism.py +1 -1
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +4 -4
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/registry.py +9 -11
- invarlock/core/retry.py +14 -14
- invarlock/core/runner.py +112 -26
- invarlock/edits/noop.py +2 -2
- invarlock/edits/quant_rtn.py +67 -39
- invarlock/eval/__init__.py +1 -1
- invarlock/eval/bench.py +14 -10
- invarlock/eval/data.py +68 -23
- invarlock/eval/metrics.py +59 -1
- invarlock/eval/primary_metric.py +1 -1
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/rmt.py +2 -2
- invarlock/guards/spectral.py +1 -1
- invarlock/guards/variance.py +2 -2
- invarlock/model_profile.py +64 -62
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/plugins/hf_bnb_adapter.py +32 -21
- invarlock/reporting/__init__.py +18 -4
- invarlock/reporting/guards_analysis.py +154 -4
- invarlock/reporting/html.py +61 -11
- invarlock/reporting/normalizer.py +9 -2
- invarlock/reporting/policy_utils.py +1 -1
- invarlock/reporting/primary_metric_utils.py +11 -11
- invarlock/reporting/render.py +876 -510
- invarlock/reporting/report.py +72 -30
- invarlock/reporting/{certificate.py → report_builder.py} +252 -99
- invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
- invarlock/reporting/report_types.py +6 -1
- invarlock/reporting/telemetry.py +86 -0
- invarlock-0.3.8.dist-info/METADATA +283 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/RECORD +69 -64
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/WHEEL +1 -1
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- invarlock/cli/commands/certify.py +0 -422
- invarlock-0.3.6.dist-info/METADATA +0 -588
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/top_level.txt +0 -0
invarlock/cli/doctor_helpers.py
CHANGED
```diff
@@ -8,7 +8,7 @@ from typing import Any
 def get_adapter_rows() -> list[dict[str, Any]]:
     """Build adapter rows similar to doctor output for testing.
 
-    Applies optional-extra detection for
+    Applies optional-extra detection for hf_causal_onnx (optimum/onnxruntime) even if
     registered as a core adapter, so missing extras are surfaced.
     """
     from invarlock.core.registry import get_registry
@@ -29,13 +29,12 @@ def get_adapter_rows() -> list[dict[str, Any]]:
         module = str(info.get("module") or "")
         support = (
             "auto"
-            if module.startswith("invarlock.adapters")
-            and name in {"hf_causal_auto", "hf_mlm_auto"}
+            if module.startswith("invarlock.adapters") and name in {"hf_auto"}
             else ("core" if module.startswith("invarlock.adapters") else "optional")
         )
         backend, status, enable = None, "ready", ""
 
-        if name in {"
+        if name in {"hf_causal", "hf_mlm", "hf_seq2seq", "hf_auto"}:
             backend = "transformers"
         elif name == "hf_gptq":
             backend = "auto-gptq"
@@ -49,7 +48,7 @@ def get_adapter_rows() -> list[dict[str, Any]]:
             backend = "bitsandbytes"
             if not has_cuda:
                 status, enable = "unsupported", "Requires CUDA"
-        elif name == "
+        elif name == "hf_causal_onnx":
             backend = "onnxruntime"
             present = (
                 importlib.util.find_spec("optimum.onnxruntime") is not None
```
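The readiness check for `hf_causal_onnx` hinges on probing the optional ONNX extras without importing them eagerly. A minimal standalone sketch of the same `find_spec` probe; the helper name and the `"missing"` status string are illustrative, not the package's API:

```python
import importlib.util


def onnx_extras_present() -> bool:
    """Probe the optional ONNX stack without importing it eagerly."""
    try:
        # find_spec imports parent packages, so a missing top-level
        # `optimum` raises instead of returning None; guard for both.
        return (
            importlib.util.find_spec("optimum.onnxruntime") is not None
            and importlib.util.find_spec("onnxruntime") is not None
        )
    except ModuleNotFoundError:
        return False


# Illustrative wiring, mirroring the doctor row fields above.
status, enable = (
    ("ready", "")
    if onnx_extras_present()
    else ("missing", "pip install 'invarlock[onnx]'")
)
```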
invarlock/cli/output.py
ADDED
```python
from __future__ import annotations

import os
import time
from collections.abc import Iterator
from contextlib import contextmanager
from dataclasses import dataclass
from typing import TextIO

from rich.console import Console

_STYLE_AUDIT = "audit"
_STYLE_FRIENDLY = "friendly"
_VALID_STYLES = {_STYLE_AUDIT, _STYLE_FRIENDLY}


def _safe_console_print(console: Console, *args: object, **kwargs: object) -> None:
    try:
        console.print(*args, **kwargs)
    except TypeError:
        console.print(*args)


def env_no_color() -> bool:
    """Return True when NO_COLOR is set (value-agnostic)."""
    return bool(str(os.environ.get("NO_COLOR", "")).strip())


def perf_counter() -> float:
    return time.perf_counter()


@dataclass(frozen=True, slots=True)
class OutputStyle:
    name: str
    progress: bool = False
    timing: bool = False
    color: bool = True

    @property
    def emojis(self) -> bool:
        return self.name != _STYLE_AUDIT

    @property
    def audit(self) -> bool:
        return self.name == _STYLE_AUDIT


def normalize_style(style: str | None) -> str | None:
    if style is None:
        return None
    value = str(style).strip().lower()
    if not value:
        return None
    return value if value in _VALID_STYLES else None


def resolve_style_name(style: str | None, profile: str | None) -> str:
    normalized = normalize_style(style)
    if normalized is not None:
        return normalized
    profile_norm = str(profile or "").strip().lower()
    if profile_norm in {"ci", "ci_cpu", "release"}:
        return _STYLE_AUDIT
    return _STYLE_FRIENDLY


def resolve_output_style(
    *,
    style: str | None,
    profile: str | None,
    progress: bool = False,
    timing: bool = False,
    no_color: bool = False,
) -> OutputStyle:
    name = resolve_style_name(style, profile)
    color_enabled = not (bool(no_color) or env_no_color())
    return OutputStyle(
        name=name,
        progress=bool(progress),
        timing=bool(timing),
        color=color_enabled,
    )


def make_console(
    *,
    file: TextIO | None = None,
    force_terminal: bool | None = None,
    no_color: bool | None = None,
) -> Console:
    if no_color is None:
        no_color = env_no_color()
    if no_color:
        color_system = None
    else:
        color_system = "standard" if force_terminal else "auto"
    return Console(
        file=file,
        force_terminal=force_terminal,
        no_color=bool(no_color),
        color_system=color_system,
    )


def format_event_line(
    tag: str,
    message: str,
    *,
    style: OutputStyle,
    emoji: str | None = None,
) -> str:
    tag_norm = str(tag or "").strip().upper() or "INFO"
    if style.emojis and emoji:
        prefix = emoji
    else:
        prefix = f"[{tag_norm}]"
    msg = str(message or "").rstrip()
    return f"{prefix} {msg}".rstrip()


def print_event(
    console: Console,
    tag: str,
    message: str,
    *,
    style: OutputStyle,
    emoji: str | None = None,
    console_style: str | None = None,
) -> None:
    line = format_event_line(tag, message, style=style, emoji=emoji)
    if console_style is None and style.color:
        tag_norm = str(tag or "").strip().upper()
        if tag_norm in {"PASS"}:
            console_style = "green"
        elif tag_norm in {"FAIL", "ERROR"}:
            console_style = "red"
        elif tag_norm in {"WARN", "WARNING"}:
            console_style = "yellow"
        elif tag_norm in {"METRIC"}:
            console_style = "cyan"
    _safe_console_print(console, line, style=console_style, markup=False)


@contextmanager
def timed_step(
    *,
    console: Console,
    style: OutputStyle,
    timings: dict[str, float] | None,
    key: str,
    tag: str,
    message: str,
    emoji: str | None = None,
) -> Iterator[None]:
    start = perf_counter()
    try:
        yield
    finally:
        elapsed = max(0.0, float(perf_counter() - start))
        if timings is not None:
            timings[key] = elapsed
        if style.progress:
            print_event(
                console,
                tag,
                f"{message} done ({elapsed:.2f}s)",
                style=style,
                emoji=emoji,
            )


def print_timing_summary(
    console: Console,
    timings: dict[str, float],
    *,
    style: OutputStyle,
    order: list[tuple[str, str]],
    extra_lines: list[str] | None = None,
) -> None:
    if not style.timing:
        return
    _safe_console_print(console, "", markup=False)
    _safe_console_print(console, "TIMING SUMMARY", markup=False)
    for label, key in order:
        if key not in timings:
            continue
        _safe_console_print(
            console, f" {label:<11}: {timings[key]:.2f}s", markup=False
        )
    if extra_lines:
        for line in extra_lines:
            _safe_console_print(console, line, markup=False)
```
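The new module composes into a typical CLI flow like this; a usage sketch, where `run_evaluation` is a stand-in for real work and the `order` label is illustrative:

```python
import time

from invarlock.cli.output import (
    make_console,
    print_event,
    print_timing_summary,
    resolve_output_style,
    timed_step,
)


def run_evaluation() -> None:
    """Stand-in for real work."""
    time.sleep(0.1)


# "ci"/"ci_cpu"/"release" profiles resolve to the plain "audit" style;
# anything else defaults to "friendly" unless a style is passed explicitly.
style = resolve_output_style(style=None, profile="ci", progress=True, timing=True)
console = make_console()
timings: dict[str, float] = {}

with timed_step(
    console=console, style=style, timings=timings,
    key="eval", tag="STEP", message="evaluating",
):
    run_evaluation()

print_event(console, "PASS", "all gates passed", style=style)  # green when color is on
print_timing_summary(console, timings, style=style, order=[("Evaluate", "eval")])
```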
invarlock/cli/provenance.py
CHANGED
```diff
@@ -2,7 +2,7 @@
 
 Provides a tiny, versioned schema describing the adapter family and the
 underlying library versions. This does not perform any edits; it only reads
-environment and import metadata to annotate
+environment and import metadata to annotate evaluation artifacts.
 """
 
 from __future__ import annotations
@@ -31,7 +31,7 @@ _FAMILY_MAP: dict[str, tuple[str, str, list[str]]] = {
     "hf_awq": ("awq", "autoawq", []),
     "hf_bnb": ("bnb", "bitsandbytes", []),
     # ONNX stack (requires extras: invarlock[onnx])
-    "
+    "hf_causal_onnx": ("onnx", "onnxruntime", []),
 }
 
 
@@ -46,12 +46,12 @@ def extract_adapter_provenance(adapter_name: str) -> AdapterProvenance:
         msg = (
             None
             if supported
-            else f"Use Compare &
+            else f"Use Compare & Evaluate (BYOE); {library} version unsupported (tested: {tested})"
         )
     except Exception: # Package not installed or version unknown
         ver = None
         supported = False
-        msg = f"{library} not available; prefer Compare &
+        msg = f"{library} not available; prefer Compare & Evaluate (BYOE) or install extras."
 
     return AdapterProvenance(
         family=family,
```
invarlock/core/bootstrap.py
CHANGED
```diff
@@ -6,7 +6,7 @@ Numerically stable bootstrap helpers for evaluation metrics.
 
 This module provides bias-corrected and accelerated (BCa) confidence
 intervals tailored for paired log-loss statistics used by the runner
-and
+and evaluation reports.
 """
 
 from __future__ import annotations
```
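For context on the method the docstring names, here is a generic BCa confidence-interval sketch in NumPy/SciPy. It illustrates the standard algorithm only, not InvarLock's tailored paired-log-loss implementation:

```python
import numpy as np
from scipy.stats import norm


def bca_interval(
    x: np.ndarray,
    stat=np.mean,
    n_boot: int = 2000,
    alpha: float = 0.05,
    rng: np.random.Generator | None = None,
) -> tuple[float, float]:
    """Generic bias-corrected and accelerated (BCa) bootstrap CI."""
    rng = rng or np.random.default_rng(0)
    n = x.size
    theta = stat(x)
    boots = np.array([stat(rng.choice(x, n, replace=True)) for _ in range(n_boot)])
    # Bias correction from the fraction of bootstrap stats below the estimate
    # (smoothed to avoid infinite z0 at the extremes).
    z0 = norm.ppf((np.sum(boots < theta) + 0.5) / (n_boot + 1))
    # Acceleration from jackknife skewness.
    jack = np.array([stat(np.delete(x, i)) for i in range(n)])
    d = jack.mean() - jack
    a = (d**3).sum() / (6.0 * ((d**2).sum() ** 1.5 + 1e-12))
    # Adjust the nominal percentiles and read them off the bootstrap samples.
    z = norm.ppf([alpha / 2, 1 - alpha / 2])
    adj = norm.cdf(z0 + (z0 + z) / (1 - a * (z0 + z)))
    lo, hi = np.quantile(boots, adj)
    return float(lo), float(hi)
```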
invarlock/core/registry.py
CHANGED
```diff
@@ -140,23 +140,21 @@ class CoreRegistry:
         )
 
         # Register built-in adapters
-        _fallback(self._adapters, "hf_gpt2", "invarlock.adapters", "HF_GPT2_Adapter")
-        _fallback(self._adapters, "hf_bert", "invarlock.adapters", "HF_BERT_Adapter")
-        _fallback(self._adapters, "hf_llama", "invarlock.adapters", "HF_LLaMA_Adapter")
-        _fallback(self._adapters, "hf_t5", "invarlock.adapters", "HF_T5_Adapter")
         _fallback(
-            self._adapters, "
+            self._adapters, "hf_causal", "invarlock.adapters", "HF_Causal_Adapter"
         )
-
+        _fallback(self._adapters, "hf_mlm", "invarlock.adapters", "HF_MLM_Adapter")
         _fallback(
-            self._adapters,
-            "hf_causal_auto",
-            "invarlock.adapters",
-            "HF_Causal_Auto_Adapter",
+            self._adapters, "hf_seq2seq", "invarlock.adapters", "HF_Seq2Seq_Adapter"
         )
         _fallback(
-            self._adapters,
+            self._adapters,
+            "hf_causal_onnx",
+            "invarlock.adapters",
+            "HF_Causal_ONNX_Adapter",
+            required_deps=["optimum"],
         )
+        _fallback(self._adapters, "hf_auto", "invarlock.adapters", "HF_Auto_Adapter")
         # Optional plugin adapters (verify runtime dependencies)
         _fallback(
             self._adapters,
```
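For configs written against 0.3.6, these registrations imply a rename from architecture-specific adapters to family-level ones. A hedged migration helper, inferred from the file renames and registrations in this diff rather than from an official alias table:

```python
# Old (0.3.6) adapter name -> new (0.3.8) family-level adapter name.
# Inferred mapping; verify against your installed version before relying on it.
ADAPTER_RENAMES: dict[str, str] = {
    "hf_gpt2": "hf_causal",       # causal LMs  (HF_Causal_Adapter)
    "hf_llama": "hf_causal",
    "hf_bert": "hf_mlm",          # masked LMs  (HF_MLM_Adapter)
    "hf_t5": "hf_seq2seq",        # enc-dec LMs (HF_Seq2Seq_Adapter)
    "hf_causal_auto": "hf_auto",  # auto-detect (HF_Auto_Adapter)
}


def migrate_adapter_name(name: str) -> str:
    """Map a 0.3.6 adapter name to its 0.3.8 equivalent, if renamed."""
    return ADAPTER_RENAMES.get(name, name)
```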
invarlock/core/retry.py
CHANGED
```diff
@@ -2,11 +2,11 @@
 InvarLock Retry Controller
 =====================
 
-Manages retry logic for automated
+Manages retry logic for automated evaluation workflows with:
 - Attempt budgets (max 3 attempts default)
 - Time budgets (optional timeout)
 - Parameter adjustment strategies per edit type
--
+- Gate-driven retry decisions
 """
 
 from __future__ import annotations
@@ -19,7 +19,7 @@ __all__ = ["RetryController", "adjust_edit_params"]
 
 class RetryController:
     """
-    Controls retry logic for
+    Controls retry logic for evaluation-report-driven automation.
 
     Features:
     - Attempt budget enforcement (default 3 max)
@@ -45,18 +45,18 @@ class RetryController:
         self.start_time = time.time()
         self.attempt_history: list[dict[str, Any]] = []
 
-    def should_retry(self,
+    def should_retry(self, report_passed: bool) -> bool:
         """
         Determine if retry should be attempted.
 
         Args:
-
+            report_passed: Whether evaluation report gates passed
 
         Returns:
             True if retry should be attempted, False otherwise
         """
-        # If
-        if
+        # If report passed, no retry needed
+        if report_passed:
             return False
 
         # Check attempt budget (attempt count equals history length)
@@ -81,21 +81,21 @@ class RetryController:
     def record_attempt(
         self,
         attempt_num: int,
-
+        report_result: dict[str, Any],
         edit_params: dict[str, Any],
     ) -> None:
         """Record details of an attempt for tracking."""
-
+        report_result = report_result or {}
        edit_params = edit_params or {}
 
         self.attempt_history.append(
             {
                 "attempt": attempt_num,
                 "timestamp": time.time(),
-                "
+                "report_passed": report_result.get("passed", False),
                 "edit_params": edit_params.copy(),
-                "failures":
-                "validation":
+                "failures": report_result.get("failures", []),
+                "validation": report_result.get("validation", {}),
             }
         )
 
@@ -114,7 +114,7 @@ def adjust_edit_params(
     edit_name: str,
     edit_params: dict[str, Any],
     attempt: int,
-
+    report_result: dict[str, Any] | None = None,
 ) -> dict[str, Any]:
     """
     Adjust edit parameters for retry attempt based on edit type and failure mode.
@@ -126,7 +126,7 @@ def adjust_edit_params(
         edit_name: Name of the edit operation
         edit_params: Current edit parameters
         attempt: Attempt number (1-indexed)
-
+        report_result: Optional evaluation report result for failure analysis
 
     Returns:
         Adjusted parameters for next attempt
```
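The renamed `report_*` API composes into a driver loop roughly like this. A sketch only: `RetryController`'s constructor arguments are not shown in this diff, and the edit/eval call is a stand-in:

```python
from typing import Any

from invarlock.core.retry import RetryController, adjust_edit_params


def run_edit_and_evaluate(edit: str, params: dict[str, Any]) -> dict[str, Any]:
    """Stand-in: apply the edit and return an evaluation-report dict
    with the keys record_attempt expects (passed/failures/validation)."""
    return {"passed": True, "failures": [], "validation": {}}


controller = RetryController()  # assumed default construction; args not shown here
params: dict[str, Any] = {"bits": 4}  # illustrative quantization knob
attempt = 1
while True:
    result = run_edit_and_evaluate("quant_rtn", params)
    controller.record_attempt(attempt, result, params)
    if not controller.should_retry(result.get("passed", False)):
        break
    params = adjust_edit_params("quant_rtn", params, attempt, report_result=result)
    attempt += 1
```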
invarlock/core/runner.py
CHANGED
```diff
@@ -19,6 +19,11 @@ from typing import Any
 import numpy as np
 
 from invarlock.eval.tail_stats import evaluate_metric_tail
+from invarlock.observability.metrics import (
+    capture_memory_snapshot,
+    reset_peak_memory_stats,
+    summarize_memory_snapshots,
+)
 
 from .api import (
     EditLike,
@@ -190,6 +195,18 @@ class CoreRunner:
             pass
 
         report.status = RunStatus.RUNNING.value
+        timings: dict[str, float] = {}
+        guard_timings: dict[str, float] = {}
+        memory_snapshots: list[dict[str, Any]] = []
+        total_start = time.perf_counter()
+
+        def _record_timing(key: str, start: float) -> None:
+            timings[key] = max(0.0, float(time.perf_counter() - start))
+
+        def _capture_memory(phase: str) -> None:
+            snapshot = capture_memory_snapshot(phase)
+            if snapshot:
+                memory_snapshots.append(snapshot)
 
         try:
             # Log start
@@ -205,40 +222,78 @@ class CoreRunner:
             )
 
             # Phase 1: Prepare (describe model, create checkpoint)
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                model_desc = self._prepare_phase(model, adapter, report)
+            finally:
+                _record_timing("prepare", phase_start)
+                _capture_memory("prepare")
 
             # Phase 2: Prepare guards (must happen before edit)
-
-
-
-
-
-
-
-
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                self._prepare_guards_phase(
+                    model,
+                    adapter,
+                    guards,
+                    calibration_data,
+                    report,
+                    auto_config,
+                    config,
+                )
+            finally:
+                _record_timing("prepare_guards", phase_start)
+                _capture_memory("prepare_guards")
 
             # Phase 3: Apply edit
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                self._edit_phase(model, adapter, edit, model_desc, report, edit_config)
+            finally:
+                _record_timing("edit", phase_start)
+                _capture_memory("edit")
 
             # Phase 4: Run guards
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                guard_results = self._guard_phase(
+                    model, adapter, guards, report, guard_timings=guard_timings
+                )
+            finally:
+                _record_timing("guards", phase_start)
+                _capture_memory("guards")
 
             # Phase 5: Evaluate final metrics
-
-
-
-
-
-
-
-
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                metrics = self._eval_phase(
+                    model,
+                    adapter,
+                    calibration_data,
+                    report,
+                    preview_n,
+                    final_n,
+                    config,
+                )
+            finally:
+                _record_timing("eval", phase_start)
+                _capture_memory("eval")
 
             # Phase 6: Finalize or rollback
-
-
-
+            reset_peak_memory_stats()
+            phase_start = time.perf_counter()
+            try:
+                final_status = self._finalize_phase(
+                    model, adapter, guard_results, metrics, config, report
+                )
+            finally:
+                _record_timing("finalize", phase_start)
+                _capture_memory("finalize")
 
             report.status = final_status
             report.meta["end_time"] = time.time()
@@ -260,6 +315,25 @@ class CoreRunner:
                 return report
 
         finally:
+            _record_timing("total", total_start)
+            if not isinstance(report.metrics, dict):
+                report.metrics = {}
+            if timings:
+                report.metrics.setdefault("timings", {}).update(timings)
+            if guard_timings:
+                report.metrics["guard_timings"] = guard_timings
+            if memory_snapshots:
+                report.metrics["memory_snapshots"] = memory_snapshots
+                summary = summarize_memory_snapshots(memory_snapshots)
+                if summary:
+                    mem_peak = summary.get("memory_mb_peak")
+                    if isinstance(mem_peak, (int | float)):
+                        existing = report.metrics.get("memory_mb_peak")
+                        if isinstance(existing, (int | float)):
+                            summary["memory_mb_peak"] = max(
+                                float(existing), float(mem_peak)
+                            )
+                    report.metrics.update(summary)
             self._active_model = None
             self._active_adapter = None
             self._cleanup_services()
@@ -447,7 +521,7 @@ class CoreRunner:
                     f"Guard '{guard.name}' prepare failed: {e}"
                 ) from e
 
-        # Store resolved policies in report for
+        # Store resolved policies in report for evaluation report generation
         report.meta["tier_policies"] = tier_policies
 
         self._log_event(
@@ -455,7 +529,13 @@ class CoreRunner:
         )
 
     def _guard_phase(
-        self,
+        self,
+        model: Any,
+        adapter: ModelAdapter,
+        guards: list[Guard],
+        report: RunReport,
+        *,
+        guard_timings: dict[str, float] | None = None,
    ) -> dict[str, dict[str, Any]]:
         """Phase 4: Run safety guards."""
         self._log_event("guards", "start", LogLevel.INFO, {"count": len(guards)})
@@ -464,6 +544,7 @@ class CoreRunner:
 
         for guard in guards:
             self._log_event("guard", "start", LogLevel.INFO, {"guard": guard.name})
+            guard_start = time.perf_counter()
 
             if isinstance(guard, GuardWithContext):
                 try:
@@ -497,6 +578,11 @@ class CoreRunner:
                     LogLevel.ERROR,
                     {"guard": guard.name, "error": str(e)},
                 )
+            finally:
+                if guard_timings is not None:
+                    guard_timings[guard.name] = max(
+                        0.0, float(time.perf_counter() - guard_start)
+                    )
 
         report.guards = guard_results
 
```
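The same reset/measure/capture sequence repeats for all six phases; factored out, it is essentially this context manager. A sketch of the pattern only, not the runner's API:

```python
import time
from collections.abc import Callable, Iterator
from contextlib import contextmanager
from typing import Any


@contextmanager
def phase(
    name: str,
    timings: dict[str, float],
    snapshots: list[dict[str, Any]],
    capture: Callable[[str], dict[str, Any] | None],
    reset: Callable[[], None],
) -> Iterator[None]:
    """Reset peak-memory stats, run one phase, then record its wall time
    and a post-phase memory snapshot, even when the body raises."""
    reset()  # e.g. reset_peak_memory_stats
    start = time.perf_counter()
    try:
        yield
    finally:
        timings[name] = max(0.0, time.perf_counter() - start)
        snap = capture(name)  # e.g. capture_memory_snapshot
        if snap:
            snapshots.append(snap)


# Usage: each phase body collapses to a single `with` block.
timings: dict[str, float] = {}
snapshots: list[dict[str, Any]] = []
with phase("edit", timings, snapshots,
           capture=lambda p: {"phase": p},  # stand-in for the real probe
           reset=lambda: None):             # stand-in
    pass  # edit work goes here
```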
invarlock/edits/noop.py
CHANGED
```diff
@@ -1,7 +1,7 @@
-"""Built-in no-op edit used for baseline and Compare &
+"""Built-in no-op edit used for baseline and Compare & Evaluate (BYOE).
 
 This edit does not modify the model and reports zero deltas. It exists to
-support baseline runs and Compare &
+support baseline runs and Compare & Evaluate workflows where the subject
 checkpoint is produced outside of InvarLock.
 """
 
```