invarlock-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +33 -0
- invarlock/__main__.py +10 -0
- invarlock/_data/runtime/profiles/ci_cpu.yaml +15 -0
- invarlock/_data/runtime/profiles/release.yaml +23 -0
- invarlock/_data/runtime/tiers.yaml +76 -0
- invarlock/adapters/__init__.py +102 -0
- invarlock/adapters/_capabilities.py +45 -0
- invarlock/adapters/auto.py +99 -0
- invarlock/adapters/base.py +530 -0
- invarlock/adapters/base_types.py +85 -0
- invarlock/adapters/hf_bert.py +852 -0
- invarlock/adapters/hf_gpt2.py +403 -0
- invarlock/adapters/hf_llama.py +485 -0
- invarlock/adapters/hf_mixin.py +383 -0
- invarlock/adapters/hf_onnx.py +112 -0
- invarlock/adapters/hf_t5.py +137 -0
- invarlock/adapters/py.typed +1 -0
- invarlock/assurance/__init__.py +43 -0
- invarlock/cli/__init__.py +8 -0
- invarlock/cli/__main__.py +8 -0
- invarlock/cli/_evidence.py +25 -0
- invarlock/cli/_json.py +75 -0
- invarlock/cli/adapter_auto.py +162 -0
- invarlock/cli/app.py +287 -0
- invarlock/cli/commands/__init__.py +26 -0
- invarlock/cli/commands/certify.py +403 -0
- invarlock/cli/commands/doctor.py +1358 -0
- invarlock/cli/commands/explain_gates.py +151 -0
- invarlock/cli/commands/export_html.py +100 -0
- invarlock/cli/commands/plugins.py +1331 -0
- invarlock/cli/commands/report.py +354 -0
- invarlock/cli/commands/run.py +4146 -0
- invarlock/cli/commands/verify.py +1040 -0
- invarlock/cli/config.py +396 -0
- invarlock/cli/constants.py +68 -0
- invarlock/cli/device.py +92 -0
- invarlock/cli/doctor_helpers.py +74 -0
- invarlock/cli/errors.py +6 -0
- invarlock/cli/overhead_utils.py +60 -0
- invarlock/cli/provenance.py +66 -0
- invarlock/cli/utils.py +41 -0
- invarlock/config.py +56 -0
- invarlock/core/__init__.py +62 -0
- invarlock/core/abi.py +15 -0
- invarlock/core/api.py +274 -0
- invarlock/core/auto_tuning.py +317 -0
- invarlock/core/bootstrap.py +226 -0
- invarlock/core/checkpoint.py +221 -0
- invarlock/core/contracts.py +73 -0
- invarlock/core/error_utils.py +64 -0
- invarlock/core/events.py +298 -0
- invarlock/core/exceptions.py +95 -0
- invarlock/core/registry.py +481 -0
- invarlock/core/retry.py +146 -0
- invarlock/core/runner.py +2041 -0
- invarlock/core/types.py +154 -0
- invarlock/edits/__init__.py +12 -0
- invarlock/edits/_edit_utils.py +249 -0
- invarlock/edits/_external_utils.py +268 -0
- invarlock/edits/noop.py +47 -0
- invarlock/edits/py.typed +1 -0
- invarlock/edits/quant_rtn.py +801 -0
- invarlock/edits/registry.py +166 -0
- invarlock/eval/__init__.py +23 -0
- invarlock/eval/bench.py +1207 -0
- invarlock/eval/bootstrap.py +50 -0
- invarlock/eval/data.py +2052 -0
- invarlock/eval/metrics.py +2167 -0
- invarlock/eval/primary_metric.py +767 -0
- invarlock/eval/probes/__init__.py +24 -0
- invarlock/eval/probes/fft.py +139 -0
- invarlock/eval/probes/mi.py +213 -0
- invarlock/eval/probes/post_attention.py +323 -0
- invarlock/eval/providers/base.py +67 -0
- invarlock/eval/providers/seq2seq.py +111 -0
- invarlock/eval/providers/text_lm.py +113 -0
- invarlock/eval/providers/vision_text.py +93 -0
- invarlock/eval/py.typed +1 -0
- invarlock/guards/__init__.py +18 -0
- invarlock/guards/_contracts.py +9 -0
- invarlock/guards/invariants.py +640 -0
- invarlock/guards/policies.py +805 -0
- invarlock/guards/py.typed +1 -0
- invarlock/guards/rmt.py +2097 -0
- invarlock/guards/spectral.py +1419 -0
- invarlock/guards/tier_config.py +354 -0
- invarlock/guards/variance.py +3298 -0
- invarlock/guards_ref/__init__.py +15 -0
- invarlock/guards_ref/rmt_ref.py +40 -0
- invarlock/guards_ref/spectral_ref.py +135 -0
- invarlock/guards_ref/variance_ref.py +60 -0
- invarlock/model_profile.py +353 -0
- invarlock/model_utils.py +221 -0
- invarlock/observability/__init__.py +10 -0
- invarlock/observability/alerting.py +535 -0
- invarlock/observability/core.py +546 -0
- invarlock/observability/exporters.py +565 -0
- invarlock/observability/health.py +588 -0
- invarlock/observability/metrics.py +457 -0
- invarlock/observability/py.typed +1 -0
- invarlock/observability/utils.py +553 -0
- invarlock/plugins/__init__.py +12 -0
- invarlock/plugins/hello_guard.py +33 -0
- invarlock/plugins/hf_awq_adapter.py +82 -0
- invarlock/plugins/hf_bnb_adapter.py +79 -0
- invarlock/plugins/hf_gptq_adapter.py +78 -0
- invarlock/plugins/py.typed +1 -0
- invarlock/py.typed +1 -0
- invarlock/reporting/__init__.py +7 -0
- invarlock/reporting/certificate.py +3221 -0
- invarlock/reporting/certificate_schema.py +244 -0
- invarlock/reporting/dataset_hashing.py +215 -0
- invarlock/reporting/guards_analysis.py +948 -0
- invarlock/reporting/html.py +32 -0
- invarlock/reporting/normalizer.py +235 -0
- invarlock/reporting/policy_utils.py +517 -0
- invarlock/reporting/primary_metric_utils.py +265 -0
- invarlock/reporting/render.py +1442 -0
- invarlock/reporting/report.py +903 -0
- invarlock/reporting/report_types.py +278 -0
- invarlock/reporting/utils.py +175 -0
- invarlock/reporting/validate.py +631 -0
- invarlock/security.py +176 -0
- invarlock/sparsity_utils.py +323 -0
- invarlock/utils/__init__.py +150 -0
- invarlock/utils/digest.py +45 -0
- invarlock-0.2.0.dist-info/METADATA +586 -0
- invarlock-0.2.0.dist-info/RECORD +132 -0
- invarlock-0.2.0.dist-info/WHEEL +5 -0
- invarlock-0.2.0.dist-info/entry_points.txt +20 -0
- invarlock-0.2.0.dist-info/licenses/LICENSE +201 -0
- invarlock-0.2.0.dist-info/top_level.txt +1 -0
invarlock/eval/bench.py
ADDED
@@ -0,0 +1,1207 @@
"""
InvarLock Guard Effect Benchmark - Step 14 Implementation
=========================================================

Benchmark harness for comparing "bare" vs "guarded" runs across different edit types,
tiers, and probes configurations. Provides comprehensive analysis of guard effectiveness
and overhead with precise validation gates.

Usage:
    python -m invarlock.eval.bench --edits quant_rtn --tiers balanced --probes 0,2,4 --profile ci

Key Features:
- Edit × Tier × Probes scenario grid
- Paired runs (bare vs guarded) with identical windows
- Comprehensive metrics with validation gates
- Support for CI (50/50) and Release (100/100) profiles
- Optional dependency checking (e.g., GPTQ)
- JSON artifacts and Markdown summary tables
- Exit non-zero on any gate failure
"""

from __future__ import annotations

import argparse
import itertools
import json
import logging
import math
import sys
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any

# Import InvarLock components
from invarlock.reporting.report_types import RunReport, create_empty_report

logger = logging.getLogger(__name__)


@dataclass
class ScenarioConfig:
    """Configuration for a single benchmark scenario."""

    edit: str
    tier: str
    probes: int
    profile: str = "ci"  # "ci" or "release"
    model_id: str = "gpt2"
    adapter: str = "hf_gpt2"
    device: str = "auto"
    seq_len: int = 512
    stride: int = 128
    preview_n: int | None = None  # Will be set by profile
    final_n: int | None = None  # Will be set by profile
    seed: int = 42

    def __post_init__(self):
        """Apply profile-specific settings."""
        if self.profile == "ci":
            if self.preview_n is None:
                self.preview_n = 50
            if self.final_n is None:
                self.final_n = 50
        elif self.profile == "release":
            if self.preview_n is None:
                self.preview_n = 100
            if self.final_n is None:
                self.final_n = 100
        else:
            raise ValueError(f"Unknown profile: {self.profile}. Use 'ci' or 'release'")
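
# Editor's note (illustrative sketch, not part of the released file): the
# profile only fills window sizes that were left unset, so explicit values
# survive. The defaults applied by __post_init__ look like this:
#
#   >>> ScenarioConfig(edit="quant_rtn", tier="balanced", probes=0).preview_n
#   50
#   >>> ScenarioConfig(edit="quant_rtn", tier="balanced", probes=0,
#   ...                profile="release").final_n
#   100
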
@dataclass
class BenchmarkConfig:
    """Global configuration for benchmark execution."""

    edits: list[str]
    tiers: list[str]
    probes: list[int]
    profile: str = "ci"  # "ci" or "release"
    dataset: str = "wikitext2"
    model_id: str = "gpt2"
    adapter: str = "hf_gpt2"
    device: str = "auto"
    seq_len: int = 512
    stride: int = 128
    seed: int = 42
    output_dir: Path = Path("benchmarks")

    # Threshold configuration
    epsilon: float | None = (
        None  # RMT deadband tolerance (None = use resolved deadband)
    )
    strict: bool = False  # If True, sets epsilon = 0
    ppl_overhead_threshold: float = 0.01  # 1%
    guard_overhead_time_threshold: float = 0.15  # 15%
    guard_overhead_mem_threshold: float = 0.10  # 10%
    catastrophic_spike_threshold: float = (
        2.0  # Primary-metric ratio (ppl-like) that triggers rollback
    )

    def __post_init__(self):
        """Apply post-initialization logic."""
        self.output_dir = Path(self.output_dir)

        # Handle strict mode
        if self.strict:
            self.epsilon = 0.0
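
# Editor's note (illustrative sketch, not part of the released file): strict
# mode wins over an explicit epsilon because __post_init__ runs after field
# assignment:
#
#   >>> cfg = BenchmarkConfig(edits=["quant_rtn"], tiers=["balanced"],
#   ...                       probes=[0], epsilon=0.25, strict=True)
#   >>> cfg.epsilon
#   0.0
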
@dataclass
class ScenarioResult:
    """Results from a single benchmark scenario."""

    config: ScenarioConfig
    bare_result: RunResult | None = None
    guarded_result: RunResult | None = None
    metrics: dict[str, Any] = field(default_factory=dict)
    gates: dict[str, bool] = field(default_factory=dict)
    skipped: bool = False
    skip_reason: str | None = None
    probes_used: int = 0
    epsilon_used: float = 0.0


@dataclass
class RunResult:
    """Results from a single run (bare or guarded)."""

    run_type: str  # "bare" or "guarded"
    report: RunReport
    success: bool
    error_message: str | None = None


@dataclass
class BenchmarkSummary:
    """Overall benchmark summary."""

    config: BenchmarkConfig
    scenarios: list[ScenarioResult]
    overall_pass: bool
    timestamp: str
    execution_time_seconds: float
    schema_version: str = "bench-v1"


class DependencyChecker:
    """Check for optional dependencies required by specific edit types."""

    @staticmethod
    def check_external_deps() -> tuple[bool, str]:
        """Check if external dependencies are available (placeholder for future use)."""
        # Placeholder for when external edit plugins are re-enabled
        return True, "Available"

    @staticmethod
    def check_peft() -> tuple[bool, str]:
        """Deprecated: external fine-tuning adapters are not supported in this profile."""
        return False, "unsupported edit"

    @classmethod
    def check_edit_dependencies(cls, edit_name: str) -> tuple[bool, str]:
        """Check dependencies for a specific edit type."""
        # Only quant_rtn is supported
        if edit_name.lower() == "quant_rtn":
            return True, "Available"
        return False, "unsupported edit"


class ConfigurationManager:
    """Manage configuration generation for bare vs guarded runs."""

    @staticmethod
    def create_base_config(scenario: ScenarioConfig) -> dict[str, Any]:
        """Create base configuration dictionary."""
        return {
            "model": {
                "id": scenario.model_id,
                "adapter": scenario.adapter,
                "device": scenario.device,
                "dtype": "float16",
            },
            "dataset": {
                "provider": "wikitext2",
                "seq_len": scenario.seq_len,
                "stride": scenario.stride,
                "preview_n": scenario.preview_n,
                "final_n": scenario.final_n,
                "seed": scenario.seed,
            },
            "edit": {
                "name": scenario.edit,
                "plan": ConfigurationManager._get_edit_plan(
                    scenario.edit, scenario.profile
                ),
            },
            "eval": {
                "spike_threshold": 2.0  # Catastrophic spike threshold
            },
            "output": {
                "dir": "runs"  # Will be set per run
            },
        }

    @staticmethod
    def _get_edit_plan(edit_name: str, profile: str) -> dict[str, Any]:
        """Get edit plan configuration based on edit type and profile."""
        plans = {
            "quant_rtn": {
                "bitwidth": 8,
                "per_channel": True,
                "group_size": 128,
                "clamp_ratio": 0.0,
                "scope": "ffn",
            },
            "gptq": {"bits": 4, "group_size": 128, "damp_percent": 0.01},
        }

        return plans.get(edit_name, {})

    @classmethod
    def create_bare_config(cls, scenario: ScenarioConfig) -> dict[str, Any]:
        """Create configuration for bare run (guards disabled)."""
        base_config = cls.create_base_config(scenario)

        # Disable auto-tuning for bare runs
        base_config["auto"] = {"enabled": False, "tier": "balanced", "probes": 0}

        # Disable all guards for bare run, but keep invariants in warn mode for metrics
        base_config["guards"] = {
            "order": ["invariants"],  # Only pre-invariants for metrics collection
            "invariants": {
                "mode": "warn"  # Collect metrics but don't enforce
            },
        }

        return base_config

    @classmethod
    def create_guarded_config(cls, scenario: ScenarioConfig) -> dict[str, Any]:
        """Create configuration for guarded run (full chain with tier-based auto-tuning)."""
        base_config = cls.create_base_config(scenario)

        # Enable auto-tuning with tier-based policies and probes
        base_config["auto"] = {
            "enabled": True,
            "tier": scenario.tier,
            "probes": scenario.probes,
            "target_pm_ratio": None,
        }

        # Full guard chain - actual parameters will be set by auto-tuner based on tier
        base_config["guards"] = {
            "order": ["invariants", "spectral", "rmt", "variance", "invariants_post"],
            "invariants": {"mode": "enforce"},
            "invariants_post": {"mode": "enforce"},
            # spectral, rmt, variance parameters will be set by auto-tuner based on tier
        }

        return base_config
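
# Editor's note (illustrative sketch, not part of the released file): the
# paired runs differ only in the "auto" and "guards" sections; model, dataset,
# and edit settings come from the shared base config, so both runs evaluate
# identical windows.
#
#   >>> s = ScenarioConfig(edit="quant_rtn", tier="balanced", probes=2)
#   >>> ConfigurationManager.create_bare_config(s)["guards"]["order"]
#   ['invariants']
#   >>> ConfigurationManager.create_guarded_config(s)["guards"]["order"]
#   ['invariants', 'spectral', 'rmt', 'variance', 'invariants_post']
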
class MetricsAggregator:
    """Aggregate and validate metrics from paired runs."""

    @staticmethod
    def extract_core_metrics(report: RunReport) -> dict[str, float]:
        """Extract core metrics from a RunReport (primary_metric-first)."""
        metrics = report.get("metrics", {}) or {}
        pm = metrics.get("primary_metric", {}) if isinstance(metrics, dict) else {}
        pm_preview = float("nan")
        pm_final = float("nan")
        try:
            if isinstance(pm, dict):
                if isinstance(pm.get("preview"), int | float):
                    pm_preview = float(pm["preview"])  # type: ignore[index]
                if isinstance(pm.get("final"), int | float):
                    pm_final = float(pm["final"])  # type: ignore[index]
        except Exception:
            pm_preview = float("nan")
            pm_final = float("nan")
        return {
            "primary_metric_preview": pm_preview,
            "primary_metric_final": pm_final,
            "latency_ms_per_tok": metrics.get("latency_ms_per_tok", float("nan")),
            "memory_mb_peak": metrics.get("memory_mb_peak", float("nan")),
        }

    @staticmethod
    def extract_guard_metrics(report: RunReport) -> dict[str, Any]:
        """Extract guard-specific metrics from a RunReport."""
        guard_metrics = {}

        # Extract RMT outliers
        rmt_metrics = report.get("metrics", {}).get("rmt", {})
        guard_metrics["rmt_outliers"] = rmt_metrics.get("outliers", 0)

        # Extract invariant violations
        invariant_metrics = report.get("metrics", {}).get("invariants", {})
        guard_metrics["tying_violations_post"] = invariant_metrics.get("violations", 0)

        # Check if rollback occurred (catastrophic spike)
        guard_metrics["catastrophic_spike"] = report.get("flags", {}).get(
            "guard_recovered", False
        )

        return guard_metrics

    @classmethod
    def compute_comparison_metrics(
        cls, bare_result: RunResult, guarded_result: RunResult
    ) -> dict[str, Any]:
        """Compute comparison metrics between bare and guarded runs."""
        if not (
            bare_result
            and guarded_result
            and bare_result.success
            and guarded_result.success
        ):
            return {}

        bare_metrics = cls.extract_core_metrics(bare_result.report)
        guarded_metrics = cls.extract_core_metrics(guarded_result.report)

        bare_guards = cls.extract_guard_metrics(bare_result.report)
        guarded_guards = cls.extract_guard_metrics(guarded_result.report)

        comparison = {}

        # Core metrics
        comparison.update(
            {
                "primary_metric_bare": bare_metrics.get(
                    "primary_metric_final", float("nan")
                ),
                "primary_metric_guarded": guarded_metrics.get(
                    "primary_metric_final", float("nan")
                ),
                "latency_bare": bare_metrics.get("latency_ms_per_tok", float("nan")),
                "latency_guarded": guarded_metrics.get(
                    "latency_ms_per_tok", float("nan")
                ),
                "mem_bare": bare_metrics.get("memory_mb_peak", float("nan")),
                "mem_guarded": guarded_metrics.get("memory_mb_peak", float("nan")),
            }
        )

        # Compute overhead metrics
        pm_bare = comparison["primary_metric_bare"]
        pm_guarded = comparison["primary_metric_guarded"]
        if not (math.isnan(pm_bare) or math.isnan(pm_guarded)) and pm_bare > 0:
            comparison["primary_metric_overhead"] = (pm_guarded - pm_bare) / pm_bare
        else:
            comparison["primary_metric_overhead"] = float("nan")

        latency_bare = comparison["latency_bare"]
        latency_guarded = comparison["latency_guarded"]
        if (
            not (math.isnan(latency_bare) or math.isnan(latency_guarded))
            and latency_bare > 0
        ):
            comparison["guard_overhead_time"] = (
                latency_guarded - latency_bare
            ) / latency_bare
        else:
            comparison["guard_overhead_time"] = float("nan")

        mem_bare = comparison["mem_bare"]
        mem_guarded = comparison["mem_guarded"]
        if not (math.isnan(mem_bare) or math.isnan(mem_guarded)) and mem_bare > 0:
            comparison["guard_overhead_mem"] = (mem_guarded - mem_bare) / mem_bare
        else:
            comparison["guard_overhead_mem"] = float("nan")

        # Guard-specific metrics
        comparison.update(
            {
                "rmt_outliers_bare": bare_guards.get("rmt_outliers", 0),
                "rmt_outliers_guarded": guarded_guards.get("rmt_outliers", 0),
                "tying_violations_post": guarded_guards.get("tying_violations_post", 0),
                "catastrophic_spike": bare_guards.get("catastrophic_spike", False)
                or guarded_guards.get("catastrophic_spike", False),
            }
        )

        return comparison
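
# Editor's note (illustrative sketch, not part of the released file): each
# overhead is a relative delta, (guarded - bare) / bare, taken on the
# final-window numbers. For example, a bare perplexity of 45.0 against a
# guarded 45.3 gives (45.3 - 45.0) / 45.0 ≈ 0.0067, i.e. roughly a 0.7%
# quality overhead, which sits below the default 1% gate.
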
class ValidationGates:
    """Validate metrics against Step 14 gates."""

    @staticmethod
    def validate_catastrophic_spike_rate(comparison: dict[str, Any]) -> bool:
        """Gate: Catastrophic spike rate = 0 (no rollbacks)."""
        return not comparison.get("catastrophic_spike", False)

    @staticmethod
    def validate_tying_violations(comparison: dict[str, Any]) -> bool:
        """Gate: Tying violations (post) = 0."""
        return comparison.get("tying_violations_post", 0) == 0

    @staticmethod
    def validate_rmt_outliers(comparison: dict[str, Any], epsilon: float) -> bool:
        """Gate: RMT outliers ≤ ceil(bare × (1 + ε))."""
        bare_outliers = comparison.get("rmt_outliers_bare", 0)
        guarded_outliers = comparison.get("rmt_outliers_guarded", 0)

        allowed = math.ceil(bare_outliers * (1.0 + epsilon))
        return guarded_outliers <= allowed

    @staticmethod
    def validate_primary_metric_overhead(
        comparison: dict[str, Any], threshold: float = 0.01
    ) -> bool:
        """Gate: Primary metric overhead ≤ threshold (default 1%)."""
        overhead = comparison.get("primary_metric_overhead", float("nan"))
        if math.isnan(overhead):
            return True  # Can't validate, assume pass
        return overhead <= threshold

    @staticmethod
    def validate_time_overhead(
        comparison: dict[str, Any], threshold: float = 0.15
    ) -> bool:
        """Gate: Time overhead ≤ 15%."""
        overhead = comparison.get("guard_overhead_time", float("nan"))
        if math.isnan(overhead):
            return True  # Can't validate, assume pass
        return overhead <= threshold

    @staticmethod
    def validate_memory_overhead(
        comparison: dict[str, Any], threshold: float = 0.10
    ) -> bool:
        """Gate: Memory overhead ≤ 10% (optional)."""
        overhead = comparison.get("guard_overhead_mem", float("nan"))
        if math.isnan(overhead):
            return True  # Can't validate, assume pass
        return overhead <= threshold

    @classmethod
    def validate_all_gates(
        cls, comparison: dict[str, Any], config: BenchmarkConfig, epsilon: float
    ) -> dict[str, bool]:
        """Validate all gates and return results."""
        return {
            "spike": cls.validate_catastrophic_spike_rate(comparison),
            "tying": cls.validate_tying_violations(comparison),
            "rmt": cls.validate_rmt_outliers(comparison, epsilon),
            # quality gate measures relative change in primary metric
            "quality": cls.validate_primary_metric_overhead(
                comparison, config.ppl_overhead_threshold
            ),
            "time": cls.validate_time_overhead(
                comparison, config.guard_overhead_time_threshold
            ),
            "mem": cls.validate_memory_overhead(
                comparison, config.guard_overhead_mem_threshold
            ),
        }
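
# Editor's note (illustrative sketch, not part of the released file): the RMT
# gate allows the guarded run at most ceil(bare * (1 + epsilon)) outliers.
#
#   >>> math.ceil(3 * (1.0 + 0.10))   # bare=3, epsilon=0.10 -> up to 4 allowed
#   4
#   >>> ValidationGates.validate_rmt_outliers(
#   ...     {"rmt_outliers_bare": 3, "rmt_outliers_guarded": 5}, epsilon=0.10)
#   False
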
def generate_scenarios(config: BenchmarkConfig) -> list[ScenarioConfig]:
    """Generate all scenarios from the Cartesian product of edits × tiers × probes."""
    scenarios = []

    for edit, tier, probes in itertools.product(
        config.edits, config.tiers, config.probes
    ):
        scenario = ScenarioConfig(
            edit=edit,
            tier=tier,
            probes=probes,
            profile=config.profile,
            model_id=config.model_id,
            adapter=config.adapter,
            device=config.device,
            seq_len=config.seq_len,
            stride=config.stride,
            seed=config.seed,
        )
        scenarios.append(scenario)

    return scenarios
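
# Editor's note (illustrative sketch, not part of the released file): the grid
# is a plain Cartesian product, so 1 edit x 3 tiers x 3 probe counts yields 9
# scenarios, each later executed as a bare/guarded pair.
#
#   >>> cfg = BenchmarkConfig(edits=["quant_rtn"],
#   ...                       tiers=["conservative", "balanced", "aggressive"],
#   ...                       probes=[0, 2, 4])
#   >>> len(generate_scenarios(cfg))
#   9
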
def resolve_epsilon_from_runtime(guarded_report: RunReport) -> float:
    """Resolve epsilon from actual RMT deadband used at runtime."""
    # Try to extract RMT deadband from guard reports
    guards = guarded_report.get("guards", [])
    for guard in guards:
        if guard.get("name") == "rmt":
            policy = guard.get("policy", {})
            deadband = policy.get("deadband")
            if deadband is not None:
                return float(deadband)

    # Fallback to default
    return 0.10
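
# Editor's note (illustrative sketch, not part of the released file): the
# lookup expects guard entries shaped like the mock report built below, e.g.
# {"name": "rmt", "policy": {"deadband": 0.05}}; the first rmt deadband found
# wins, and 0.10 is the fallback when none is present.
#
#   >>> resolve_epsilon_from_runtime(
#   ...     {"guards": [{"name": "rmt", "policy": {"deadband": 0.05}}]})
#   0.05
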
def execute_single_run(
    run_config: dict[str, Any],
    scenario: ScenarioConfig,
    run_type: str,
    output_dir: Path,
) -> RunResult:
    """Execute a single benchmark run and return results."""
    try:
        # For now, create a mock run since we don't have the full pipeline
        # In real implementation, this would call the actual InvarLock pipeline

        # Create a mock RunReport with realistic values
        report = create_empty_report()

        # Fill in metadata
        report["meta"]["model_id"] = run_config["model"]["id"]
        report["meta"]["adapter"] = run_config["model"]["adapter"]
        report["meta"]["device"] = run_config["model"]["device"]
        report["meta"]["ts"] = datetime.now().isoformat()
        report["meta"]["seed"] = run_config["dataset"]["seed"]

        # Fill in dataset config
        report["data"]["dataset"] = run_config["dataset"]["provider"]
        report["data"]["seq_len"] = run_config["dataset"]["seq_len"]
        report["data"]["stride"] = run_config["dataset"]["stride"]
        report["data"]["preview_n"] = run_config["dataset"]["preview_n"]
        report["data"]["final_n"] = run_config["dataset"]["final_n"]

        # Fill in edit info
        report["edit"]["name"] = scenario.edit
        report["edit"]["plan_digest"] = (
            f"mock_digest_{scenario.edit}_{scenario.tier}_{scenario.probes}"
        )

        # Mock realistic metrics based on run type and scenario
        if run_type == "bare":
            # Bare runs: no guard overhead, potentially higher PM (ppl-like)
            base_ppl = 45.0 + (hash(f"{scenario.edit}_{scenario.tier}") % 100) / 100.0
            report["metrics"]["primary_metric"] = {
                "kind": "perplexity",
                "preview": base_ppl,
                "final": base_ppl + 1.0,
            }
            report["metrics"]["latency_ms_per_tok"] = (
                12.0 + (hash(scenario.tier) % 20) / 10.0
            )
            report["metrics"]["memory_mb_peak"] = 2000.0 + (
                hash(str(scenario.probes)) % 200
            )
            report["metrics"]["rmt"] = {"outliers": 2 + (hash(scenario.edit) % 3)}
            report["metrics"]["invariants"] = {"violations": 0}
        else:
            # Guarded runs: guard overhead, better stability, varies by tier
            tier_factor = {"conservative": 0.95, "balanced": 0.97, "aggressive": 0.99}[
                scenario.tier
            ]
            probe_factor = 1.0 - (
                scenario.probes * 0.01
            )  # Small improvement with probes

            base_ppl = 45.0 + (hash(f"{scenario.edit}_{scenario.tier}") % 100) / 100.0
            report["metrics"]["primary_metric"] = {
                "kind": "perplexity",
                "preview": base_ppl * tier_factor,
                "final": base_ppl * tier_factor * probe_factor,
            }

            # Guard overhead varies by tier
            time_overhead = {
                "conservative": 0.12,
                "balanced": 0.08,
                "aggressive": 0.05,
            }[scenario.tier]
            mem_overhead = {"conservative": 0.08, "balanced": 0.06, "aggressive": 0.04}[
                scenario.tier
            ]

            report["metrics"]["latency_ms_per_tok"] = (
                12.0 + (hash(scenario.tier) % 20) / 10.0
            ) * (1 + time_overhead)
            report["metrics"]["memory_mb_peak"] = (
                2000.0 + (hash(str(scenario.probes)) % 200)
            ) * (1 + mem_overhead)
            report["metrics"]["rmt"] = {
                "outliers": max(
                    0,
                    2
                    + (hash(scenario.edit) % 3)
                    - (1 if scenario.tier == "conservative" else 0),
                )
            }
            report["metrics"]["invariants"] = {"violations": 0}

            # Mock guard reports for guarded runs
            report["guards"] = [
                {
                    "name": "invariants",
                    "policy": {"mode": "enforce"},
                    "metrics": {"checks": 5, "violations": 0},
                    "actions": ["validated"],
                    "violations": [],
                },
                {
                    "name": "spectral",
                    "policy": {
                        "sigma_quantile": tier_factor,
                        "scope": "ffn",
                        "deadband": 0.10,
                    },
                    "metrics": {
                        "max_sigma": 1.2,
                        "corrections": 1 if scenario.tier == "conservative" else 0,
                    },
                    "actions": ["monitored"],
                    "violations": [],
                },
                {
                    "name": "rmt",
                    "policy": {
                        "deadband": 0.05 if scenario.tier == "conservative" else 0.10,
                        "margin": 1.5,
                    },
                    "metrics": {
                        "outliers": report["metrics"]["rmt"]["outliers"],
                        "mp_fit": 0.95,
                    },
                    "actions": ["validated"],
                    "violations": [],
                },
            ]

        # Mock artifacts
        report["artifacts"]["events_path"] = (
            f"mock_events_{scenario.edit}_{scenario.tier}_{scenario.probes}_{run_type}.jsonl"
        )
        report["artifacts"]["logs_path"] = (
            f"mock_logs_{scenario.edit}_{scenario.tier}_{scenario.probes}_{run_type}.txt"
        )

        return RunResult(run_type=run_type, report=report, success=True)

    except Exception as e:
        logger.error(f"Run failed for {scenario.edit} ({run_type}): {e}")
        return RunResult(
            run_type=run_type,
            report=create_empty_report(),
            success=False,
            error_message=str(e),
        )


def execute_scenario(
    scenario: ScenarioConfig, config: BenchmarkConfig, output_dir: Path
) -> ScenarioResult:
    """Execute a single benchmark scenario (both bare and guarded runs)."""
    logger.info(
        f"Executing scenario: {scenario.edit} × {scenario.tier} × {scenario.probes} probes"
    )

    # Check dependencies
    deps_available, deps_message = DependencyChecker.check_edit_dependencies(
        scenario.edit
    )
    if not deps_available:
        logger.warning(f"Skipping scenario: {deps_message}")
        return ScenarioResult(config=scenario, skipped=True, skip_reason=deps_message)

    config_manager = ConfigurationManager()
    metrics_aggregator = MetricsAggregator()

    # Run bare configuration
    logger.debug(f"Running bare configuration for {scenario.edit}")
    bare_config = config_manager.create_bare_config(scenario)
    bare_result = execute_single_run(bare_config, scenario, "bare", output_dir)

    # Run guarded configuration
    logger.debug(f"Running guarded configuration for {scenario.edit}")
    guarded_config = config_manager.create_guarded_config(scenario)
    guarded_result = execute_single_run(guarded_config, scenario, "guarded", output_dir)

    # Compute comparison metrics
    comparison_metrics = metrics_aggregator.compute_comparison_metrics(
        bare_result, guarded_result
    )

    # Resolve epsilon from runtime or use config
    epsilon_used = config.epsilon
    if epsilon_used is None and guarded_result.success:
        epsilon_used = resolve_epsilon_from_runtime(guarded_result.report)
    elif epsilon_used is None:
        epsilon_used = 0.10  # Default fallback

    # Validate gates
    gates = ValidationGates.validate_all_gates(comparison_metrics, config, epsilon_used)

    # Mock probes_used based on scenario.probes (in real implementation, this would come from auto-tuner)
    probes_used = min(
        scenario.probes, scenario.probes
    )  # All requested probes used in mock

    return ScenarioResult(
        config=scenario,
        bare_result=bare_result,
        guarded_result=guarded_result,
        metrics=comparison_metrics,
        gates=gates,
        probes_used=probes_used,
        epsilon_used=epsilon_used,
    )


def run_guard_effect_benchmark(
    edits: list[str],
    tiers: list[str],
    probes: list[int],
    profile: str = "ci",
    output_dir: str | Path = "benchmarks",
    epsilon: float | None = None,
    strict: bool = False,
    **kwargs,
) -> dict[str, Any]:
    """
    Run guard effect benchmark across edit × tier × probes scenarios.

    Args:
        edits: List of edit types to benchmark
        tiers: List of tier configurations
        probes: List of probe counts
        profile: "ci" (50/50 windows) or "release" (100/100 windows)
        output_dir: Directory to save results
        epsilon: Optional epsilon override
        strict: If True, sets epsilon = 0
        **kwargs: Additional configuration options

    Returns:
        Dictionary with benchmark results and summary

    Raises:
        SystemExit: If any gates fail (non-zero exit code)
    """
    start_time = datetime.now()

    # Create configuration
    config = BenchmarkConfig(
        edits=edits,
        tiers=tiers,
        probes=probes,
        profile=profile,
        output_dir=Path(output_dir),
        epsilon=epsilon,
        strict=strict,
        **kwargs,
    )

    config.output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f"Starting guard effect benchmark with profile={profile}")
    logger.info(
        f"Scenario grid: {len(edits)} edits × {len(tiers)} tiers × {len(probes)} probes = {len(edits) * len(tiers) * len(probes)} scenarios"
    )
    logger.info(f"Output directory: {config.output_dir}")

    # Generate scenarios
    scenarios = generate_scenarios(config)
    scenario_results = []

    # Execute each scenario
    for scenario in scenarios:
        result = execute_scenario(scenario, config, config.output_dir)
        scenario_results.append(result)

    # Create summary
    end_time = datetime.now()
    execution_time = (end_time - start_time).total_seconds()

    # Check overall pass/fail - any non-skipped scenario with failed gates = overall fail
    overall_pass = True
    for result in scenario_results:
        if not result.skipped and result.gates:
            if not all(result.gates.values()):
                overall_pass = False
                break

    summary = BenchmarkSummary(
        config=config,
        scenarios=scenario_results,
        overall_pass=overall_pass,
        timestamp=start_time.isoformat(),
        execution_time_seconds=execution_time,
    )

    # Generate outputs
    _generate_outputs(summary)

    logger.info(f"Benchmark completed in {execution_time:.1f}s")
    logger.info(f"Overall result: {'PASS' if overall_pass else 'FAIL'}")

    # Return results as dictionary
    result = {
        "overall_pass": overall_pass,
        "execution_time_seconds": execution_time,
        "timestamp": start_time.isoformat(),
        "scenarios": [_scenario_result_to_dict(result) for result in scenario_results],
        "config": _config_to_dict(config),
    }

    return result
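
# Editor's note (illustrative sketch, not part of the released file): a
# minimal programmatic call, mirroring the CLI defaults; artifacts land under
# benchmarks/results/ as guard_effect.json and guard_effect.md.
#
#   >>> summary = run_guard_effect_benchmark(edits=["quant_rtn"],
#   ...                                      tiers=["balanced"], probes=[0])
#   >>> sorted(summary)
#   ['config', 'execution_time_seconds', 'overall_pass', 'scenarios', 'timestamp']
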
def _generate_outputs(summary: BenchmarkSummary) -> None:
    """Generate JSON and Markdown outputs according to Step 14 specification."""
    results_dir = summary.config.output_dir / "results"
    results_dir.mkdir(parents=True, exist_ok=True)

    # Generate JSON artifact
    json_path = results_dir / "guard_effect.json"
    json_data = _summary_to_step14_json(summary)
    with open(json_path, "w") as f:
        json.dump(json_data, f, indent=2)

    logger.info(f"JSON artifact saved: {json_path}")

    # Generate Markdown summary
    md_path = results_dir / "guard_effect.md"
    with open(md_path, "w") as f:
        f.write(_generate_step14_markdown(summary))

    logger.info(f"Markdown report saved: {md_path}")


def _summary_to_step14_json(summary: BenchmarkSummary) -> dict[str, Any]:
    """Convert summary to Step 14 JSON format."""
    scenarios_data = []

    for result in summary.scenarios:
        scenario_data = {
            "edit": result.config.edit,
            "tier": result.config.tier,
            "probes": result.config.probes,
            "probes_used": result.probes_used,
            "skip": result.skipped,
            "skip_reason": result.skip_reason,
        }

        if not result.skipped and result.metrics:
            # Add metrics
            scenario_data.update(
                {
                    "primary_metric_bare": result.metrics.get(
                        "primary_metric_bare", None
                    ),
                    "primary_metric_guarded": result.metrics.get(
                        "primary_metric_guarded", None
                    ),
                    "primary_metric_overhead": result.metrics.get(
                        "primary_metric_overhead", None
                    ),
                    "latency_bare": result.metrics.get("latency_bare", None),
                    "latency_guarded": result.metrics.get("latency_guarded", None),
                    "guard_overhead_time": result.metrics.get(
                        "guard_overhead_time", None
                    ),
                    "mem_bare": result.metrics.get("mem_bare", None),
                    "mem_guarded": result.metrics.get("mem_guarded", None),
                    "guard_overhead_mem": result.metrics.get(
                        "guard_overhead_mem", None
                    ),
                    "rmt_outliers_bare": result.metrics.get("rmt_outliers_bare", None),
                    "rmt_outliers_guarded": result.metrics.get(
                        "rmt_outliers_guarded", None
                    ),
                    "tying_violations_post": result.metrics.get(
                        "tying_violations_post", None
                    ),
                    "epsilon": result.epsilon_used,
                    "pass": result.gates,
                }
            )
        else:
            # Skipped scenario
            scenario_data.update(
                {
                    "primary_metric_bare": None,
                    "primary_metric_guarded": None,
                    "primary_metric_overhead": None,
                    "latency_bare": None,
                    "latency_guarded": None,
                    "guard_overhead_time": None,
                    "mem_bare": None,
                    "mem_guarded": None,
                    "guard_overhead_mem": None,
                    "rmt_outliers_bare": None,
                    "rmt_outliers_guarded": None,
                    "tying_violations_post": None,
                    "epsilon": None,
                    "pass": {
                        "spike": None,
                        "tying": None,
                        "rmt": None,
                        "quality": None,
                        "time": None,
                        "mem": None,
                    },
                }
            )

        scenarios_data.append(scenario_data)

    return {
        "schema_version": summary.schema_version,
        "profile": summary.config.profile,
        "seed": summary.config.seed,
        "epsilon": summary.config.epsilon,
        "scenarios": scenarios_data,
    }
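
# Editor's note (illustrative sketch, not part of the released file): the JSON
# artifact is a small header plus a flat per-scenario list, roughly
#
#   {
#     "schema_version": "bench-v1",
#     "profile": "ci",
#     "seed": 42,
#     "epsilon": null,
#     "scenarios": [
#       {"edit": "quant_rtn", "tier": "balanced", "probes": 0, ...,
#        "pass": {"spike": true, "tying": true, "rmt": true,
#                 "quality": true, "time": true, "mem": true}}
#     ]
#   }
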
def _generate_step14_markdown(summary: BenchmarkSummary) -> str:
    """Generate Step 14 compliant Markdown report."""
    lines = [
        "# InvarLock Guard Effect Benchmark Report",
        "",
        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S UTC')}",
        f"**Profile:** {summary.config.profile}",
        f"**Seed:** {summary.config.seed}",
        f"**Epsilon:** {summary.config.epsilon if summary.config.epsilon is not None else 'auto'}",
        f"**Execution Time:** {summary.execution_time_seconds:.1f}s",
        f"**Overall Result:** {'✅ PASS' if summary.overall_pass else '❌ FAIL'}",
        "",
        "## Scenario Results",
        "",
        "| Edit | Tier | Probes | Status | PM Δ | Time Δ | Mem Δ | RMT | Gates |",
        "|------|------|--------|--------|-------|--------|-------|-----|-------|",
    ]

    for result in summary.scenarios:
        if result.skipped:
            status = "⏸️ SKIP"
            ppl_delta = "-"
            time_delta = "-"
            mem_delta = "-"
            rmt_info = "-"
            gates_info = f"SKIP: {result.skip_reason}"
        else:
            # Determine status
            all_pass = all(result.gates.values()) if result.gates else False
            status = "✅ PASS" if all_pass else "❌ FAIL"

            # Format metrics
            pm_overhead = result.metrics.get("primary_metric_overhead")
            if pm_overhead is not None and not math.isnan(pm_overhead):
                ppl_delta = f"{pm_overhead:+.1%}"
                if pm_overhead > 0.01:  # > 1%
                    ppl_delta = f"🔴 {ppl_delta}"
                else:
                    ppl_delta = f"🟢 {ppl_delta}"
            else:
                ppl_delta = "-"

            time_overhead = result.metrics.get("guard_overhead_time")
            if time_overhead is not None and not math.isnan(time_overhead):
                time_delta = f"{time_overhead:+.1%}"
                if time_overhead > 0.15:  # > 15%
                    time_delta = f"🔴 {time_delta}"
                else:
                    time_delta = f"🟢 {time_delta}"
            else:
                time_delta = "-"

            mem_overhead = result.metrics.get("guard_overhead_mem")
            if mem_overhead is not None and not math.isnan(mem_overhead):
                mem_delta = f"{mem_overhead:+.1%}"
                if mem_overhead > 0.10:  # > 10%
                    mem_delta = f"🔴 {mem_delta}"
                else:
                    mem_delta = f"🟢 {mem_delta}"
            else:
                mem_delta = "-"

            bare_outliers = result.metrics.get("rmt_outliers_bare", 0)
            guarded_outliers = result.metrics.get("rmt_outliers_guarded", 0)
            rmt_info = f"{bare_outliers}→{guarded_outliers}"

            # Gates summary
            gates_status = []
            if result.gates.get("spike", True):
                gates_status.append("📈")
            else:
                gates_status.append("❌📈")

            if result.gates.get("rmt", True):
                gates_status.append("🔬")
            else:
                gates_status.append("❌🔬")

            if result.gates.get("quality", True):
                gates_status.append("📊")
            else:
                gates_status.append("❌📊")

            gates_info = " ".join(gates_status)

        lines.append(
            f"| {result.config.edit} | {result.config.tier} | {result.config.probes} | {status} | {ppl_delta} | {time_delta} | {mem_delta} | {rmt_info} | {gates_info} |"
        )

    lines.extend(
        [
            "",
            "## Legend",
            "",
            "- 🟢 Within threshold",
            "- 🔴 Exceeds threshold",
            "- 📈 Spike gate",
            "- 🔬 RMT gate",
            "- 📊 Quality gate",
            "",
        ]
    )

    return "\n".join(lines)


def _scenario_result_to_dict(result: ScenarioResult) -> dict[str, Any]:
    """Convert ScenarioResult to dictionary."""
    return {
        "edit": result.config.edit,
        "tier": result.config.tier,
        "probes": result.config.probes,
        "probes_used": result.probes_used,
        "skipped": result.skipped,
        "skip_reason": result.skip_reason,
        "metrics": result.metrics,
        "gates": result.gates,
        "epsilon_used": result.epsilon_used,
        "bare_success": result.bare_result.success if result.bare_result else False,
        "guarded_success": result.guarded_result.success
        if result.guarded_result
        else False,
    }


def _config_to_dict(config: BenchmarkConfig) -> dict[str, Any]:
    """Convert BenchmarkConfig to dictionary."""
    return {
        "edits": config.edits,
        "tiers": config.tiers,
        "probes": config.probes,
        "profile": config.profile,
        "dataset": config.dataset,
        "model_id": config.model_id,
        "adapter": config.adapter,
        "device": config.device,
        "seq_len": config.seq_len,
        "stride": config.stride,
        "seed": config.seed,
        "epsilon": config.epsilon,
        "strict": config.strict,
        "ppl_overhead_threshold": config.ppl_overhead_threshold,
        "guard_overhead_time_threshold": config.guard_overhead_time_threshold,
        "guard_overhead_mem_threshold": config.guard_overhead_mem_threshold,
        "catastrophic_spike_threshold": config.catastrophic_spike_threshold,
    }


def main():
    """CLI entry point for Step 14 specification."""
    parser = argparse.ArgumentParser(
        description="InvarLock Guard Effect Benchmark - Step 14",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Required arguments
    parser.add_argument(
        "--edits",
        required=True,
        help="Comma-separated list of edit types (quant_rtn)",
    )
    parser.add_argument(
        "--tiers",
        default="balanced",
        help="Comma-separated list of tiers (conservative,balanced,aggressive)",
    )
    parser.add_argument(
        "--probes", default="0", help="Comma-separated list of probe counts (0,2,4)"
    )
    parser.add_argument(
        "--profile",
        default="ci",
        choices=["ci", "release"],
        help="Benchmark profile (ci=50/50 windows, release=100/100 windows)",
    )

    # Optional threshold configuration
    parser.add_argument(
        "--epsilon",
        type=float,
        help="RMT outliers epsilon threshold (default: use resolved RMT deadband)",
    )
    parser.add_argument(
        "--strict", action="store_true", help="Set epsilon=0 (overrides --epsilon)"
    )

    # Model and dataset configuration
    parser.add_argument(
        "--dataset", default="wikitext2", help="Dataset to use for benchmarking"
    )
    parser.add_argument("--model-id", default="gpt2", help="Model identifier")
    parser.add_argument("--adapter", default="hf_gpt2", help="Model adapter to use")
    parser.add_argument(
        "--device", default="auto", help="Device to use (auto|cuda|mps|cpu)"
    )
    parser.add_argument(
        "--seq-len", type=int, default=512, help="Sequence length for tokenization"
    )
    parser.add_argument(
        "--stride", type=int, default=128, help="Stride for window generation"
    )
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    parser.add_argument("--out", default="benchmarks", help="Output directory")
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Enable verbose logging"
    )

    args = parser.parse_args()

    # Configure logging
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # Parse lists
    edits = [edit.strip() for edit in args.edits.split(",")]
    tiers = [tier.strip() for tier in args.tiers.split(",")]
    probes = [int(probe.strip()) for probe in args.probes.split(",")]

    # Validate inputs — only quant_rtn is supported
    valid_edits = {"quant_rtn"}
    valid_tiers = {"conservative", "balanced", "aggressive"}

    for edit in edits:
        if edit not in valid_edits:
            print(
                f"❌ Invalid edit type: {edit}. Valid: {', '.join(sorted(valid_edits))}"
            )
            sys.exit(1)

    for tier in tiers:
        if tier not in valid_tiers:
            print(f"❌ Invalid tier: {tier}. Valid: {', '.join(sorted(valid_tiers))}")
            sys.exit(1)

    for probe in probes:
        if probe < 0:
            print(f"❌ Invalid probe count: {probe}. Must be >= 0")
            sys.exit(1)

    # Prepare kwargs
    kwargs = {
        "dataset": args.dataset,
        "model_id": args.model_id,
        "adapter": args.adapter,
        "device": args.device,
        "seq_len": args.seq_len,
        "stride": args.stride,
        "seed": args.seed,
    }

    try:
        # Run benchmark
        result = run_guard_effect_benchmark(
            edits=edits,
            tiers=tiers,
            probes=probes,
            profile=args.profile,
            output_dir=args.out,
            epsilon=args.epsilon,
            strict=args.strict,
            **kwargs,
        )

        # Exit with appropriate code per Step 14 specification
        if result["overall_pass"]:
            print("✅ All gates passed!")
            sys.exit(0)
        else:
            print("❌ Some gates failed!")
            sys.exit(1)

    except KeyboardInterrupt:
        print("\n❌ Benchmark interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Benchmark failed: {e}")
        if args.verbose:
            import traceback

            traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()