invarlock-0.3.6-py3-none-any.whl → invarlock-0.3.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. invarlock/__init__.py +2 -2
  2. invarlock/adapters/__init__.py +10 -14
  3. invarlock/adapters/auto.py +35 -40
  4. invarlock/adapters/capabilities.py +2 -2
  5. invarlock/adapters/hf_causal.py +418 -0
  6. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  7. invarlock/adapters/hf_mixin.py +25 -4
  8. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  9. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  10. invarlock/cli/adapter_auto.py +31 -21
  11. invarlock/cli/app.py +73 -2
  12. invarlock/cli/commands/certify.py +600 -59
  13. invarlock/cli/commands/doctor.py +8 -10
  14. invarlock/cli/commands/plugins.py +13 -9
  15. invarlock/cli/commands/report.py +233 -69
  16. invarlock/cli/commands/run.py +907 -183
  17. invarlock/cli/commands/verify.py +76 -11
  18. invarlock/cli/config.py +1 -1
  19. invarlock/cli/doctor_helpers.py +4 -5
  20. invarlock/cli/output.py +193 -0
  21. invarlock/cli/provenance.py +1 -1
  22. invarlock/core/bootstrap.py +1 -1
  23. invarlock/core/registry.py +9 -11
  24. invarlock/core/runner.py +111 -25
  25. invarlock/edits/quant_rtn.py +65 -37
  26. invarlock/eval/bench.py +3 -3
  27. invarlock/eval/data.py +68 -23
  28. invarlock/eval/metrics.py +59 -1
  29. invarlock/eval/tasks/__init__.py +12 -0
  30. invarlock/eval/tasks/classification.py +48 -0
  31. invarlock/eval/tasks/qa.py +36 -0
  32. invarlock/eval/tasks/text_generation.py +102 -0
  33. invarlock/guards/invariants.py +19 -10
  34. invarlock/guards/rmt.py +2 -2
  35. invarlock/guards/variance.py +2 -2
  36. invarlock/model_profile.py +48 -27
  37. invarlock/observability/health.py +6 -6
  38. invarlock/observability/metrics.py +108 -0
  39. invarlock/reporting/certificate.py +159 -9
  40. invarlock/reporting/certificate_schema.py +1 -1
  41. invarlock/reporting/guards_analysis.py +154 -4
  42. invarlock/reporting/html.py +55 -5
  43. invarlock/reporting/normalizer.py +7 -0
  44. invarlock/reporting/render.py +791 -431
  45. invarlock/reporting/report.py +39 -3
  46. invarlock/reporting/report_types.py +6 -1
  47. invarlock/reporting/telemetry.py +86 -0
  48. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/METADATA +23 -9
  49. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/RECORD +53 -48
  50. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
  51. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
  52. invarlock/adapters/hf_gpt2.py +0 -404
  53. invarlock/adapters/hf_llama.py +0 -487
  54. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
  55. {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
invarlock/cli/commands/verify.py CHANGED
@@ -2,7 +2,7 @@
  invarlock verify command
  ====================

- Validates generated safety certificates for internal consistency. The command
+ Validates generated evaluation certificates for internal consistency. The command
  ensures schema compliance, checks that the primary metric ratio agrees with the
  baseline reference, and enforces paired-window guarantees (match=1.0,
  overlap=0.0).
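
For orientation, these checks operate on a certificate whose primary_metric block records the preview and final metric values plus the ratio against the stored baseline. A minimal illustrative fragment (values invented; only the field names come from the validator hunks below):

certificate = {
    "primary_metric": {
        "kind": "ppl",                       # perplexity-family metric
        "preview": 12.30,
        "final": 12.41,
        "ratio_vs_baseline": 12.41 / 12.40,  # must agree with final / baseline final
    },
    "baseline_ref": {
        "primary_metric": {"final": 12.40},
    },
}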
@@ -66,9 +66,25 @@ def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
          errors.append("Certificate missing primary_metric block.")
          return errors

+     def _is_finite_number(value: Any) -> bool:
+         return isinstance(value, (int, float)) and math.isfinite(float(value))
+
+     def _declares_invalid_primary_metric(metric: dict[str, Any]) -> bool:
+         if bool(metric.get("invalid")):
+             return True
+         reason = metric.get("degraded_reason")
+         if isinstance(reason, str):
+             r = reason.strip().lower()
+             return r.startswith("non_finite") or r in {
+                 "primary_metric_invalid",
+                 "evaluation_error",
+             }
+         return False
+
      kind = str(pm.get("kind", "")).lower()
      ratio_vs_baseline = pm.get("ratio_vs_baseline")
      final = pm.get("final")
+     pm_invalid = _declares_invalid_primary_metric(pm)

      if kind.startswith("ppl"):
          baseline_ref = certificate.get("baseline_ref", {}) or {}
@@ -82,16 +98,14 @@ def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
              bv = baseline_pm.get("final")
              if isinstance(bv, (int | float)):
                  baseline_final = float(bv)
-         if isinstance(final, int | float) and isinstance(baseline_final, int | float):
-             if baseline_final <= 0.0:
+         if _is_finite_number(final) and _is_finite_number(baseline_final):
+             if float(baseline_final) <= 0.0:
                  errors.append(
                      f"Baseline final must be > 0.0 to compute ratio (found {baseline_final})."
                  )
              else:
                  expected_ratio = float(final) / float(baseline_final)
-                 if not isinstance(ratio_vs_baseline, int | float) or not math.isfinite(
-                     float(ratio_vs_baseline)
-                 ):
+                 if not _is_finite_number(ratio_vs_baseline):
                      errors.append(
                          "Certificate is missing a finite primary_metric.ratio_vs_baseline value."
                      )
@@ -102,7 +116,18 @@ def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
                          "Primary metric ratio mismatch: "
                          f"recorded={float(ratio_vs_baseline):.12f}, expected={expected_ratio:.12f}"
                      )
+         else:
+             # If the primary metric is non-finite, it must be explicitly marked invalid.
+             # This is expected for structural error-injection runs (NaN/Inf weights).
+             if (isinstance(final, (int | float)) and not _is_finite_number(final)) and (
+                 not pm_invalid
+             ):
+                 errors.append(
+                     "Primary metric final is non-finite but primary_metric.invalid is not set."
+                 )
      else:
+         if pm_invalid:
+             return errors
          if ratio_vs_baseline is None or not isinstance(ratio_vs_baseline, int | float):
              errors.append(
                  "Certificate missing primary_metric.ratio_vs_baseline for non-ppl metric."
@@ -194,14 +219,29 @@ def _validate_counts(certificate: dict[str, Any]) -> list[str]:


  def _validate_drift_band(certificate: dict[str, Any]) -> list[str]:
-     """Validate preview→final drift stays within the configured band (0.95–1.05)."""
+     """Validate preview→final drift stays within the configured band.
+
+     Defaults to 0.95–1.05 unless the certificate provides `primary_metric.drift_band`.
+     """
      errors: list[str] = []
      pm = certificate.get("primary_metric", {}) or {}
+     if not isinstance(pm, dict) or not pm:
+         errors.append("Certificate missing primary_metric block.")
+         return errors
+     if bool(pm.get("invalid")):
+         # Drift is undefined when the primary metric is invalid (e.g., NaN/Inf weights).
+         return errors
      drift_ratio = None
      try:
          prev = pm.get("preview")
          fin = pm.get("final")
-         if isinstance(prev, int | float) and isinstance(fin, int | float) and prev > 0:
+         if (
+             isinstance(prev, int | float)
+             and isinstance(fin, int | float)
+             and math.isfinite(float(prev))
+             and math.isfinite(float(fin))
+             and prev > 0
+         ):
              drift_ratio = float(fin) / float(prev)
      except Exception:
          drift_ratio = None
@@ -210,9 +250,33 @@ def _validate_drift_band(certificate: dict[str, Any]) -> list[str]:
          errors.append("Certificate missing preview/final to compute drift ratio.")
          return errors

-     if not 0.95 <= float(drift_ratio) <= 1.05:
+     drift_min = 0.95
+     drift_max = 1.05
+     band = pm.get("drift_band")
+     try:
+         if isinstance(band, dict):
+             lo = band.get("min")
+             hi = band.get("max")
+             if isinstance(lo, int | float) and isinstance(hi, int | float):
+                 lo_f = float(lo)
+                 hi_f = float(hi)
+                 if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
+                     drift_min = lo_f
+                     drift_max = hi_f
+         elif isinstance(band, list | tuple) and len(band) == 2:
+             lo_raw, hi_raw = band[0], band[1]
+             if isinstance(lo_raw, int | float) and isinstance(hi_raw, int | float):
+                 lo_f = float(lo_raw)
+                 hi_f = float(hi_raw)
+                 if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
+                     drift_min = lo_f
+                     drift_max = hi_f
+     except Exception:
+         pass
+
+     if not drift_min <= float(drift_ratio) <= drift_max:
          errors.append(
-             f"Preview→final drift ratio out of band (0.95–1.05): observed {drift_ratio:.6f}."
+             f"Preview→final drift ratio out of band ({drift_min:.2f}–{drift_max:.2f}): observed {drift_ratio:.6f}."
          )

      return errors
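
Both accepted shapes of primary_metric.drift_band resolve the same way, and malformed bands (non-numeric, non-finite, inverted, or non-positive bounds) fall back to the 0.95–1.05 default. A self-contained sketch of that resolution rule, mirroring the dict and two-element sequence handling above:

import math

def resolve_band(band, default=(0.95, 1.05)):
    # Accepts {"min": lo, "max": hi} or [lo, hi]; anything else keeps the default.
    lo = hi = None
    if isinstance(band, dict):
        lo, hi = band.get("min"), band.get("max")
    elif isinstance(band, (list, tuple)) and len(band) == 2:
        lo, hi = band
    if isinstance(lo, (int, float)) and isinstance(hi, (int, float)):
        lo_f, hi_f = float(lo), float(hi)
        if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
            return lo_f, hi_f
    return default

assert resolve_band({"min": 0.90, "max": 1.10}) == (0.90, 1.10)
assert resolve_band([0.90, 1.10]) == (0.90, 1.10)
assert resolve_band({"min": 1.10, "max": 0.90}) == (0.95, 1.05)  # inverted -> default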
@@ -406,7 +470,8 @@ def _validate_certificate_payload(
          )
      except Exception:
          prof = "dev"
-     # Enforce drift band only for CI/Release; skip in dev profile
+     # Drift band is a CI/Release enforcement check; dev profile should not
+     # fail verification due to preview→final drift.
      if prof in {"ci", "release"}:
          errors.extend(_validate_drift_band(certificate))
      errors.extend(_apply_profile_lints(certificate))
invarlock/cli/config.py CHANGED
@@ -415,7 +415,7 @@ def _deep_merge_dicts(a: dict, b: dict) -> dict: # pragma: no cover - trivial a

  def create_example_config() -> InvarLockConfig:  # pragma: no cover - test helper
      return InvarLockConfig(
-         model={"id": "gpt2", "adapter": "hf_gpt2", "device": "auto"},
+         model={"id": "gpt2", "adapter": "hf_causal", "device": "auto"},
          edit={"name": "quant_rtn", "plan": {}},
          dataset={"provider": "wikitext2", "seq_len": 512, "stride": 512},
          output={"dir": "runs"},
invarlock/cli/doctor_helpers.py CHANGED
@@ -8,7 +8,7 @@ from typing import Any
  def get_adapter_rows() -> list[dict[str, Any]]:
      """Build adapter rows similar to doctor output for testing.

-     Applies optional-extra detection for hf_onnx (optimum/onnxruntime) even if
+     Applies optional-extra detection for hf_causal_onnx (optimum/onnxruntime) even if
      registered as a core adapter, so missing extras are surfaced.
      """
      from invarlock.core.registry import get_registry
@@ -29,13 +29,12 @@ def get_adapter_rows() -> list[dict[str, Any]]:
          module = str(info.get("module") or "")
          support = (
              "auto"
-             if module.startswith("invarlock.adapters")
-             and name in {"hf_causal_auto", "hf_mlm_auto"}
+             if module.startswith("invarlock.adapters") and name in {"hf_auto"}
              else ("core" if module.startswith("invarlock.adapters") else "optional")
          )
          backend, status, enable = None, "ready", ""

-         if name in {"hf_gpt2", "hf_bert", "hf_llama", "hf_causal_auto", "hf_mlm_auto"}:
+         if name in {"hf_causal", "hf_mlm", "hf_seq2seq", "hf_auto"}:
              backend = "transformers"
          elif name == "hf_gptq":
              backend = "auto-gptq"
@@ -49,7 +48,7 @@ def get_adapter_rows() -> list[dict[str, Any]]:
              backend = "bitsandbytes"
              if not has_cuda:
                  status, enable = "unsupported", "Requires CUDA"
-         elif name == "hf_onnx":
+         elif name == "hf_causal_onnx":
              backend = "onnxruntime"
              present = (
                  importlib.util.find_spec("optimum.onnxruntime") is not None
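
Each row get_adapter_rows() yields now reports the renamed adapters. An invented sample row follows; the field names (name, support, backend, status, enable) come from the snippet above, but the status and enable values for the ONNX case are hypothetical:

example_row = {
    "name": "hf_causal_onnx",
    "support": "core",      # "auto" is now reserved for hf_auto
    "backend": "onnxruntime",
    "status": "missing",    # hypothetical: when optimum.onnxruntime is not importable
    "enable": "pip install 'invarlock[onnx]'",  # hypothetical hint; extras name from the _FAMILY_MAP comment
}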
invarlock/cli/output.py ADDED
@@ -0,0 +1,193 @@
+ from __future__ import annotations
+
+ import os
+ import time
+ from collections.abc import Iterator
+ from contextlib import contextmanager
+ from dataclasses import dataclass
+ from typing import TextIO
+
+ from rich.console import Console
+
+ _STYLE_AUDIT = "audit"
+ _STYLE_FRIENDLY = "friendly"
+ _VALID_STYLES = {_STYLE_AUDIT, _STYLE_FRIENDLY}
+
+
+ def _safe_console_print(console: Console, *args: object, **kwargs: object) -> None:
+     try:
+         console.print(*args, **kwargs)
+     except TypeError:
+         console.print(*args)
+
+
+ def env_no_color() -> bool:
+     """Return True when NO_COLOR is set (value-agnostic)."""
+     return bool(str(os.environ.get("NO_COLOR", "")).strip())
+
+
+ def perf_counter() -> float:
+     return time.perf_counter()
+
+
+ @dataclass(frozen=True, slots=True)
+ class OutputStyle:
+     name: str
+     progress: bool = False
+     timing: bool = False
+     color: bool = True
+
+     @property
+     def emojis(self) -> bool:
+         return self.name != _STYLE_AUDIT
+
+     @property
+     def audit(self) -> bool:
+         return self.name == _STYLE_AUDIT
+
+
+ def normalize_style(style: str | None) -> str | None:
+     if style is None:
+         return None
+     value = str(style).strip().lower()
+     if not value:
+         return None
+     return value if value in _VALID_STYLES else None
+
+
+ def resolve_style_name(style: str | None, profile: str | None) -> str:
+     normalized = normalize_style(style)
+     if normalized is not None:
+         return normalized
+     profile_norm = str(profile or "").strip().lower()
+     if profile_norm in {"ci", "ci_cpu", "release"}:
+         return _STYLE_AUDIT
+     return _STYLE_FRIENDLY
+
+
+ def resolve_output_style(
+     *,
+     style: str | None,
+     profile: str | None,
+     progress: bool = False,
+     timing: bool = False,
+     no_color: bool = False,
+ ) -> OutputStyle:
+     name = resolve_style_name(style, profile)
+     color_enabled = not (bool(no_color) or env_no_color())
+     return OutputStyle(
+         name=name,
+         progress=bool(progress),
+         timing=bool(timing),
+         color=color_enabled,
+     )
+
+
+ def make_console(
+     *,
+     file: TextIO | None = None,
+     force_terminal: bool | None = None,
+     no_color: bool | None = None,
+ ) -> Console:
+     if no_color is None:
+         no_color = env_no_color()
+     if no_color:
+         color_system = None
+     else:
+         color_system = "standard" if force_terminal else "auto"
+     return Console(
+         file=file,
+         force_terminal=force_terminal,
+         no_color=bool(no_color),
+         color_system=color_system,
+     )
+
+
+ def format_event_line(
+     tag: str,
+     message: str,
+     *,
+     style: OutputStyle,
+     emoji: str | None = None,
+ ) -> str:
+     tag_norm = str(tag or "").strip().upper() or "INFO"
+     if style.emojis and emoji:
+         prefix = emoji
+     else:
+         prefix = f"[{tag_norm}]"
+     msg = str(message or "").rstrip()
+     return f"{prefix} {msg}".rstrip()
+
+
+ def print_event(
+     console: Console,
+     tag: str,
+     message: str,
+     *,
+     style: OutputStyle,
+     emoji: str | None = None,
+     console_style: str | None = None,
+ ) -> None:
+     line = format_event_line(tag, message, style=style, emoji=emoji)
+     if console_style is None and style.color:
+         tag_norm = str(tag or "").strip().upper()
+         if tag_norm in {"PASS"}:
+             console_style = "green"
+         elif tag_norm in {"FAIL", "ERROR"}:
+             console_style = "red"
+         elif tag_norm in {"WARN", "WARNING"}:
+             console_style = "yellow"
+         elif tag_norm in {"METRIC"}:
+             console_style = "cyan"
+     _safe_console_print(console, line, style=console_style, markup=False)
+
+
+ @contextmanager
+ def timed_step(
+     *,
+     console: Console,
+     style: OutputStyle,
+     timings: dict[str, float] | None,
+     key: str,
+     tag: str,
+     message: str,
+     emoji: str | None = None,
+ ) -> Iterator[None]:
+     start = perf_counter()
+     try:
+         yield
+     finally:
+         elapsed = max(0.0, float(perf_counter() - start))
+         if timings is not None:
+             timings[key] = elapsed
+         if style.progress:
+             print_event(
+                 console,
+                 tag,
+                 f"{message} done ({elapsed:.2f}s)",
+                 style=style,
+                 emoji=emoji,
+             )
+
+
+ def print_timing_summary(
+     console: Console,
+     timings: dict[str, float],
+     *,
+     style: OutputStyle,
+     order: list[tuple[str, str]],
+     extra_lines: list[str] | None = None,
+ ) -> None:
+     if not style.timing:
+         return
+     _safe_console_print(console, "", markup=False)
+     _safe_console_print(console, "TIMING SUMMARY", markup=False)
+     for label, key in order:
+         if key not in timings:
+             continue
+         _safe_console_print(
+             console, f" {label:<11}: {timings[key]:.2f}s", markup=False
+         )
+     if extra_lines:
+         for line in extra_lines:
+             _safe_console_print(console, line, markup=False)
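
Taken together, the new module gives commands a small, testable output layer: style resolution, a console factory that honors NO_COLOR, tagged event lines, and opt-in step timing. A usage sketch built only from the functions defined above (the tag, key, and message strings are illustrative, and the work inside the context manager is a placeholder):

from invarlock.cli.output import (
    make_console,
    print_event,
    print_timing_summary,
    resolve_output_style,
    timed_step,
)

style = resolve_output_style(style=None, profile="ci", progress=True, timing=True)
console = make_console()  # color is disabled automatically when NO_COLOR is set

timings: dict[str, float] = {}
with timed_step(
    console=console, style=style, timings=timings,
    key="eval", tag="STEP", message="evaluating",
):
    pass  # the command's real work would run here

print_event(console, "PASS", "certificate verified", style=style)  # audit style prints "[PASS] ..."
print_timing_summary(console, timings, style=style, order=[("Evaluate", "eval")])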
@@ -31,7 +31,7 @@ _FAMILY_MAP: dict[str, tuple[str, str, list[str]]] = {
      "hf_awq": ("awq", "autoawq", []),
      "hf_bnb": ("bnb", "bitsandbytes", []),
      # ONNX stack (requires extras: invarlock[onnx])
-     "hf_onnx": ("onnx", "onnxruntime", []),
+     "hf_causal_onnx": ("onnx", "onnxruntime", []),
  }


invarlock/core/bootstrap.py CHANGED
@@ -6,7 +6,7 @@ Numerically stable bootstrap helpers for evaluation metrics.

  This module provides bias-corrected and accelerated (BCa) confidence
  intervals tailored for paired log-loss statistics used by the runner
- and safety certificate reports.
+ and evaluation certificate reports.
  """

  from __future__ import annotations
invarlock/core/registry.py CHANGED
@@ -140,23 +140,21 @@ class CoreRegistry:
          )

          # Register built-in adapters
-         _fallback(self._adapters, "hf_gpt2", "invarlock.adapters", "HF_GPT2_Adapter")
-         _fallback(self._adapters, "hf_bert", "invarlock.adapters", "HF_BERT_Adapter")
-         _fallback(self._adapters, "hf_llama", "invarlock.adapters", "HF_LLaMA_Adapter")
-         _fallback(self._adapters, "hf_t5", "invarlock.adapters", "HF_T5_Adapter")
          _fallback(
-             self._adapters, "hf_onnx", "invarlock.adapters", "HF_ORT_CausalLM_Adapter"
+             self._adapters, "hf_causal", "invarlock.adapters", "HF_Causal_Adapter"
          )
-         # Convenience auto adapters (delegate to built-ins)
+         _fallback(self._adapters, "hf_mlm", "invarlock.adapters", "HF_MLM_Adapter")
          _fallback(
-             self._adapters,
-             "hf_causal_auto",
-             "invarlock.adapters",
-             "HF_Causal_Auto_Adapter",
+             self._adapters, "hf_seq2seq", "invarlock.adapters", "HF_Seq2Seq_Adapter"
          )
          _fallback(
-             self._adapters, "hf_mlm_auto", "invarlock.adapters", "HF_MLM_Auto_Adapter"
+             self._adapters,
+             "hf_causal_onnx",
+             "invarlock.adapters",
+             "HF_Causal_ONNX_Adapter",
+             required_deps=["optimum"],
          )
+         _fallback(self._adapters, "hf_auto", "invarlock.adapters", "HF_Auto_Adapter")
          # Optional plugin adapters (verify runtime dependencies)
          _fallback(
              self._adapters,
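
Read alongside the file list (hf_gpt2.py and hf_llama.py deleted; hf_bert.py → hf_mlm.py, hf_t5.py → hf_seq2seq.py, hf_onnx.py → hf_causal_onnx.py), the new registrations imply roughly this rename map for configs that pin an adapter name. This mapping is inferred from the diff, not an official migration table:

# Inferred from the registrations and file renames above.
ADAPTER_RENAMES = {
    "hf_gpt2": "hf_causal",       # decoder-only families consolidate into hf_causal
    "hf_llama": "hf_causal",
    "hf_bert": "hf_mlm",
    "hf_t5": "hf_seq2seq",
    "hf_onnx": "hf_causal_onnx",
    "hf_causal_auto": "hf_auto",  # the two convenience auto adapters collapse into hf_auto
    "hf_mlm_auto": "hf_auto",
}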
invarlock/core/runner.py CHANGED
@@ -19,6 +19,11 @@ from typing import Any
  import numpy as np

  from invarlock.eval.tail_stats import evaluate_metric_tail
+ from invarlock.observability.metrics import (
+     capture_memory_snapshot,
+     reset_peak_memory_stats,
+     summarize_memory_snapshots,
+ )

  from .api import (
      EditLike,
@@ -190,6 +195,18 @@ class CoreRunner:
              pass

          report.status = RunStatus.RUNNING.value
+         timings: dict[str, float] = {}
+         guard_timings: dict[str, float] = {}
+         memory_snapshots: list[dict[str, Any]] = []
+         total_start = time.perf_counter()
+
+         def _record_timing(key: str, start: float) -> None:
+             timings[key] = max(0.0, float(time.perf_counter() - start))
+
+         def _capture_memory(phase: str) -> None:
+             snapshot = capture_memory_snapshot(phase)
+             if snapshot:
+                 memory_snapshots.append(snapshot)

          try:
              # Log start
@@ -205,40 +222,78 @@
              )

              # Phase 1: Prepare (describe model, create checkpoint)
-             model_desc = self._prepare_phase(model, adapter, report)
+             reset_peak_memory_stats()
+             phase_start = time.perf_counter()
+             try:
+                 model_desc = self._prepare_phase(model, adapter, report)
+             finally:
+                 _record_timing("prepare", phase_start)
+                 _capture_memory("prepare")

              # Phase 2: Prepare guards (must happen before edit)
-             self._prepare_guards_phase(
-                 model,
-                 adapter,
-                 guards,
-                 calibration_data,
-                 report,
-                 auto_config,
-                 config,
-             )
+             reset_peak_memory_stats()
+             phase_start = time.perf_counter()
+             try:
+                 self._prepare_guards_phase(
+                     model,
+                     adapter,
+                     guards,
+                     calibration_data,
+                     report,
+                     auto_config,
+                     config,
+                 )
+             finally:
+                 _record_timing("prepare_guards", phase_start)
+                 _capture_memory("prepare_guards")

              # Phase 3: Apply edit
-             self._edit_phase(model, adapter, edit, model_desc, report, edit_config)
+             reset_peak_memory_stats()
+             phase_start = time.perf_counter()
+             try:
+                 self._edit_phase(model, adapter, edit, model_desc, report, edit_config)
+             finally:
+                 _record_timing("edit", phase_start)
+                 _capture_memory("edit")

              # Phase 4: Run guards
-             guard_results = self._guard_phase(model, adapter, guards, report)
+             reset_peak_memory_stats()
+             phase_start = time.perf_counter()
+             try:
+                 guard_results = self._guard_phase(
+                     model, adapter, guards, report, guard_timings=guard_timings
+                 )
+             finally:
+                 _record_timing("guards", phase_start)
+                 _capture_memory("guards")

              # Phase 5: Evaluate final metrics
-             metrics = self._eval_phase(
-                 model,
-                 adapter,
-                 calibration_data,
-                 report,
-                 preview_n,
-                 final_n,
-                 config,
-             )
+             reset_peak_memory_stats()
+             phase_start = time.perf_counter()
+             try:
+                 metrics = self._eval_phase(
+                     model,
+                     adapter,
+                     calibration_data,
+                     report,
+                     preview_n,
+                     final_n,
+                     config,
+                 )
+             finally:
+                 _record_timing("eval", phase_start)
+                 _capture_memory("eval")

              # Phase 6: Finalize or rollback
-             final_status = self._finalize_phase(
-                 model, adapter, guard_results, metrics, config, report
-             )
+             reset_peak_memory_stats()
+             phase_start = time.perf_counter()
+             try:
+                 final_status = self._finalize_phase(
+                     model, adapter, guard_results, metrics, config, report
+                 )
+             finally:
+                 _record_timing("finalize", phase_start)
+                 _capture_memory("finalize")

              report.status = final_status
              report.meta["end_time"] = time.time()
@@ -260,6 +315,25 @@
              return report

          finally:
+             _record_timing("total", total_start)
+             if not isinstance(report.metrics, dict):
+                 report.metrics = {}
+             if timings:
+                 report.metrics.setdefault("timings", {}).update(timings)
+             if guard_timings:
+                 report.metrics["guard_timings"] = guard_timings
+             if memory_snapshots:
+                 report.metrics["memory_snapshots"] = memory_snapshots
+                 summary = summarize_memory_snapshots(memory_snapshots)
+                 if summary:
+                     mem_peak = summary.get("memory_mb_peak")
+                     if isinstance(mem_peak, (int | float)):
+                         existing = report.metrics.get("memory_mb_peak")
+                         if isinstance(existing, (int | float)):
+                             summary["memory_mb_peak"] = max(
+                                 float(existing), float(mem_peak)
+                             )
+                     report.metrics.update(summary)
              self._active_model = None
              self._active_adapter = None
              self._cleanup_services()
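
After the finally block runs, report.metrics carries the new observability payload. An illustrative shape (keys mirror the code above; every value here is invented):

# Illustrative only: keys from the finally block above, values made up.
example_metrics = {
    "timings": {
        "prepare": 0.42, "prepare_guards": 1.10, "edit": 3.75,
        "guards": 2.01, "eval": 8.64, "finalize": 0.12, "total": 16.10,
    },
    "guard_timings": {"invariants": 0.81, "rmt": 1.20},  # keyed by guard.name
    "memory_snapshots": [{"phase": "prepare"}],  # full shape comes from capture_memory_snapshot()
    "memory_mb_peak": 5120.0,  # merged as max(existing, summarized peak)
}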
@@ -455,7 +529,13 @@
          )

      def _guard_phase(
-         self, model: Any, adapter: ModelAdapter, guards: list[Guard], report: RunReport
+         self,
+         model: Any,
+         adapter: ModelAdapter,
+         guards: list[Guard],
+         report: RunReport,
+         *,
+         guard_timings: dict[str, float] | None = None,
      ) -> dict[str, dict[str, Any]]:
          """Phase 4: Run safety guards."""
          self._log_event("guards", "start", LogLevel.INFO, {"count": len(guards)})
@@ -464,6 +544,7 @@

          for guard in guards:
              self._log_event("guard", "start", LogLevel.INFO, {"guard": guard.name})
+             guard_start = time.perf_counter()

              if isinstance(guard, GuardWithContext):
                  try:
@@ -497,6 +578,11 @@
                      LogLevel.ERROR,
                      {"guard": guard.name, "error": str(e)},
                  )
+             finally:
+                 if guard_timings is not None:
+                     guard_timings[guard.name] = max(
+                         0.0, float(time.perf_counter() - guard_start)
+                     )

          report.guards = guard_results