invarlock 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +1 -1
- invarlock/_data/runtime/profiles/ci_cpu.yaml +5 -0
- invarlock/_data/runtime/tiers.yaml +61 -0
- invarlock/adapters/hf_loading.py +97 -0
- invarlock/calibration/__init__.py +6 -0
- invarlock/calibration/spectral_null.py +301 -0
- invarlock/calibration/variance_ve.py +154 -0
- invarlock/cli/app.py +15 -0
- invarlock/cli/commands/calibrate.py +576 -0
- invarlock/cli/commands/doctor.py +16 -4
- invarlock/cli/commands/explain_gates.py +53 -9
- invarlock/cli/commands/plugins.py +12 -2
- invarlock/cli/commands/run.py +323 -81
- invarlock/cli/commands/verify.py +40 -0
- invarlock/cli/determinism.py +237 -0
- invarlock/core/auto_tuning.py +215 -17
- invarlock/core/registry.py +9 -4
- invarlock/eval/bench.py +467 -141
- invarlock/eval/bench_regression.py +12 -0
- invarlock/eval/data.py +29 -7
- invarlock/guards/spectral.py +216 -9
- invarlock/guards/variance.py +6 -3
- invarlock/reporting/certificate.py +403 -51
- invarlock/reporting/certificate_schema.py +4 -1
- invarlock/reporting/guards_analysis.py +108 -10
- invarlock/reporting/normalizer.py +21 -1
- invarlock/reporting/policy_utils.py +100 -16
- {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/METADATA +12 -10
- {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/RECORD +33 -26
- {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/WHEEL +0 -0
- {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/top_level.txt +0 -0
invarlock/__init__.py
CHANGED
|
@@ -12,7 +12,7 @@ For torch-dependent functionality, see subpackages under `invarlock.*`:
|
|
|
12
12
|
- `invarlock.eval`: Metrics, guard-overhead checks, and certification
|
|
13
13
|
"""
|
|
14
14
|
|
|
15
|
-
__version__ = "0.3.0"
|
|
15
|
+
__version__ = "0.3.2"
|
|
16
16
|
|
|
17
17
|
# Core exports - torch-independent
|
|
18
18
|
from .config import CFG, Defaults, get_default_config
|
|
invarlock/_data/runtime/tiers.yaml
CHANGED
|
@@ -5,6 +5,17 @@
|
|
|
5
5
|
# embedded in certificates and referenced by automation documentation.
|
|
6
6
|
|
|
7
7
|
balanced:
|
|
8
|
+
metrics:
|
|
9
|
+
pm_ratio:
|
|
10
|
+
ratio_limit_base: 1.10
|
|
11
|
+
min_tokens: 50000
|
|
12
|
+
hysteresis_ratio: 0.002
|
|
13
|
+
min_token_fraction: 0.01
|
|
14
|
+
accuracy:
|
|
15
|
+
delta_min_pp: -1.0
|
|
16
|
+
min_examples: 200
|
|
17
|
+
hysteresis_delta_pp: 0.1
|
|
18
|
+
min_examples_fraction: 0.01
|
|
8
19
|
variance_guard:
|
|
9
20
|
deadband: 0.02
|
|
10
21
|
min_abs_adjust: 0.012
|
|
@@ -41,6 +52,17 @@ balanced:
|
|
|
41
52
|
other: 0.12
|
|
42
53
|
|
|
43
54
|
conservative:
|
|
55
|
+
metrics:
|
|
56
|
+
pm_ratio:
|
|
57
|
+
ratio_limit_base: 1.05
|
|
58
|
+
min_tokens: 20000
|
|
59
|
+
hysteresis_ratio: 0.002
|
|
60
|
+
min_token_fraction: 0.01
|
|
61
|
+
accuracy:
|
|
62
|
+
delta_min_pp: -0.5
|
|
63
|
+
min_examples: 200
|
|
64
|
+
hysteresis_delta_pp: 0.1
|
|
65
|
+
min_examples_fraction: 0.01
|
|
44
66
|
variance_guard:
|
|
45
67
|
deadband: 0.03
|
|
46
68
|
min_abs_adjust: 0.02
|
|
@@ -74,3 +96,42 @@ conservative:
|
|
|
74
96
|
attn: 0.05
|
|
75
97
|
embed: 0.07
|
|
76
98
|
other: 0.07
|
|
99
|
+
|
|
100
|
+
aggressive:
|
|
101
|
+
metrics:
|
|
102
|
+
pm_ratio:
|
|
103
|
+
ratio_limit_base: 1.20
|
|
104
|
+
min_tokens: 50000
|
|
105
|
+
hysteresis_ratio: 0.002
|
|
106
|
+
min_token_fraction: 0.01
|
|
107
|
+
accuracy:
|
|
108
|
+
delta_min_pp: -2.0
|
|
109
|
+
min_examples: 200
|
|
110
|
+
hysteresis_delta_pp: 0.1
|
|
111
|
+
min_examples_fraction: 0.01
|
|
112
|
+
variance_guard:
|
|
113
|
+
deadband: 0.12
|
|
114
|
+
min_effect_lognll: 0.0005
|
|
115
|
+
spectral_guard:
|
|
116
|
+
sigma_quantile: 0.98
|
|
117
|
+
deadband: 0.15
|
|
118
|
+
scope: ffn
|
|
119
|
+
max_caps: 8
|
|
120
|
+
family_caps:
|
|
121
|
+
ffn: 3.0
|
|
122
|
+
attn: 3.5
|
|
123
|
+
embed: 2.5
|
|
124
|
+
other: 3.5
|
|
125
|
+
multiple_testing:
|
|
126
|
+
method: bh
|
|
127
|
+
alpha: 0.1
|
|
128
|
+
m: 4
|
|
129
|
+
rmt_guard:
|
|
130
|
+
deadband: 0.15
|
|
131
|
+
margin: 1.8
|
|
132
|
+
epsilon_default: 0.15
|
|
133
|
+
epsilon_by_family:
|
|
134
|
+
ffn: 0.15
|
|
135
|
+
attn: 0.15
|
|
136
|
+
embed: 0.15
|
|
137
|
+
other: 0.15
|
|
invarlock/adapters/hf_loading.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Helpers for Hugging Face model loading.
|
|
2
|
+
|
|
3
|
+
Centralizes security- and performance-sensitive defaults used by HF adapters.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import torch
|
|
12
|
+
|
|
13
|
+
_TRUE = {"1", "true", "yes", "on"}
|
|
14
|
+
_FALSE = {"0", "false", "no", "off"}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _coerce_bool(val: Any) -> bool | None:
|
|
18
|
+
if isinstance(val, bool):
|
|
19
|
+
return val
|
|
20
|
+
if isinstance(val, int):
|
|
21
|
+
return bool(val)
|
|
22
|
+
if isinstance(val, str):
|
|
23
|
+
s = val.strip().lower()
|
|
24
|
+
if s in _TRUE:
|
|
25
|
+
return True
|
|
26
|
+
if s in _FALSE:
|
|
27
|
+
return False
|
|
28
|
+
return None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def resolve_trust_remote_code(
    kwargs: dict[str, Any] | None = None, *, default: bool = False
) -> bool:
    """Decide whether remote code may be trusted.

    Precedence, first recognisable value wins:

    1. ``kwargs["trust_remote_code"]`` when the key is present.
    2. The environment variables ``INVARLOCK_TRUST_REMOTE_CODE``,
       ``TRUST_REMOTE_CODE_BOOL`` and ``ALLOW_REMOTE_CODE``, in that order.
    3. ``default``.

    A value that ``_coerce_bool`` cannot interpret is skipped rather than
    treated as false.
    """
    if kwargs and "trust_remote_code" in kwargs:
        explicit = _coerce_bool(kwargs.get("trust_remote_code"))
        if explicit is not None:
            return explicit

    # Generator keeps the lookups lazy: later variables are only read when the
    # earlier ones are unset or unrecognised.
    env_flags = (
        _coerce_bool(os.environ.get(variable))
        for variable in (
            "INVARLOCK_TRUST_REMOTE_CODE",
            "TRUST_REMOTE_CODE_BOOL",
            "ALLOW_REMOTE_CODE",
        )
    )
    return next((flag for flag in env_flags if flag is not None), default)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def default_torch_dtype() -> torch.dtype:
    """Choose a default dtype from the accelerator that torch reports.

    * CUDA available: ``bfloat16`` when ``torch.cuda.is_bf16_supported()``
      exists and returns a truthy value, otherwise ``float16`` (also used
      when the probe raises).
    * MPS available: ``float16``.
    * Anything else: ``float32``.
    """
    if not torch.cuda.is_available():
        on_mps = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        return torch.float16 if on_mps else torch.float32

    try:
        bf16_ok = (
            hasattr(torch.cuda, "is_bf16_supported")
            and torch.cuda.is_bf16_supported()
        )
    except Exception:
        # Best-effort probe: a failing capability query falls back to fp16.
        bf16_ok = False
    return torch.bfloat16 if bf16_ok else torch.float16
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def resolve_torch_dtype(kwargs: dict[str, Any] | None = None) -> torch.dtype | str:
    """Resolve the dtype requested via ``kwargs["torch_dtype"]``.

    A ``torch.dtype`` is returned unchanged.  A string is stripped and
    lower-cased; ``"auto"`` is passed through as the literal string and the
    aliases float16/fp16/half, bfloat16/bf16 and float32/fp32 map to the
    matching dtype.  A missing key, an unknown string or any other value type
    falls back to ``default_torch_dtype()``.
    """
    requested = None
    if kwargs and "torch_dtype" in kwargs:
        requested = kwargs.get("torch_dtype")

    if isinstance(requested, torch.dtype):
        return requested

    if isinstance(requested, str):
        key = requested.strip().lower()
        if key == "auto":
            return "auto"
        aliases = {
            torch.float16: ("float16", "fp16", "half"),
            torch.bfloat16: ("bfloat16", "bf16"),
            torch.float32: ("float32", "fp32"),
        }
        for dtype, names in aliases.items():
            if key in names:
                return dtype

    return default_torch_dtype()
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
__all__ = ["resolve_trust_remote_code", "default_torch_dtype", "resolve_torch_dtype"]
|
|
invarlock/calibration/spectral_null.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
from collections import Counter, defaultdict
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _finite01(value: Any) -> bool:
|
|
9
|
+
try:
|
|
10
|
+
f = float(value)
|
|
11
|
+
return math.isfinite(f) and 0.0 <= f <= 1.0
|
|
12
|
+
except Exception:
|
|
13
|
+
return False
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _bh_reject_families(
|
|
17
|
+
family_pvals: dict[str, float],
|
|
18
|
+
*,
|
|
19
|
+
alpha: float,
|
|
20
|
+
m: int,
|
|
21
|
+
) -> set[str]:
|
|
22
|
+
if not family_pvals:
|
|
23
|
+
return set()
|
|
24
|
+
try:
|
|
25
|
+
alpha_f = float(alpha)
|
|
26
|
+
except Exception:
|
|
27
|
+
return set()
|
|
28
|
+
if not (0.0 < alpha_f <= 1.0):
|
|
29
|
+
return set()
|
|
30
|
+
|
|
31
|
+
names = list(family_pvals.keys())
|
|
32
|
+
pvals = [family_pvals[name] for name in names]
|
|
33
|
+
n = len(pvals)
|
|
34
|
+
m_eff = max(int(m) if isinstance(m, int) else 0, n, 1)
|
|
35
|
+
|
|
36
|
+
order = sorted(
|
|
37
|
+
range(n),
|
|
38
|
+
key=lambda idx: (float("inf") if not _finite01(pvals[idx]) else pvals[idx]),
|
|
39
|
+
)
|
|
40
|
+
max_k = 0
|
|
41
|
+
for rank, idx in enumerate(order, start=1):
|
|
42
|
+
p = pvals[idx]
|
|
43
|
+
if not _finite01(p):
|
|
44
|
+
continue
|
|
45
|
+
if p <= (alpha_f * rank) / m_eff:
|
|
46
|
+
max_k = rank
|
|
47
|
+
if max_k <= 0:
|
|
48
|
+
return set()
|
|
49
|
+
cutoff = (alpha_f * max_k) / m_eff
|
|
50
|
+
selected: set[str] = set()
|
|
51
|
+
for idx in order:
|
|
52
|
+
p = pvals[idx]
|
|
53
|
+
if _finite01(p) and p <= cutoff:
|
|
54
|
+
selected.add(names[idx])
|
|
55
|
+
return selected
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _bonferroni_reject_families(
|
|
59
|
+
family_pvals: dict[str, float],
|
|
60
|
+
*,
|
|
61
|
+
alpha: float,
|
|
62
|
+
m: int,
|
|
63
|
+
) -> set[str]:
|
|
64
|
+
if not family_pvals:
|
|
65
|
+
return set()
|
|
66
|
+
try:
|
|
67
|
+
alpha_f = float(alpha)
|
|
68
|
+
except Exception:
|
|
69
|
+
return set()
|
|
70
|
+
if not (0.0 < alpha_f <= 1.0):
|
|
71
|
+
return set()
|
|
72
|
+
m_eff = max(int(m) if isinstance(m, int) else 0, len(family_pvals), 1)
|
|
73
|
+
cutoff = alpha_f / m_eff
|
|
74
|
+
return {fam for fam, p in family_pvals.items() if _finite01(p) and p <= cutoff}
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _extract_guard(report: dict[str, Any], name: str) -> dict[str, Any] | None:
|
|
78
|
+
guards = report.get("guards")
|
|
79
|
+
if isinstance(guards, list):
|
|
80
|
+
for item in guards:
|
|
81
|
+
if isinstance(item, dict) and item.get("name") == name:
|
|
82
|
+
return item
|
|
83
|
+
return None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _extract_family_max_z(metrics: dict[str, Any]) -> dict[str, float]:
|
|
87
|
+
out: dict[str, float] = {}
|
|
88
|
+
summary = metrics.get("family_z_summary")
|
|
89
|
+
if isinstance(summary, dict):
|
|
90
|
+
for fam, vals in summary.items():
|
|
91
|
+
if not isinstance(vals, dict):
|
|
92
|
+
continue
|
|
93
|
+
z = vals.get("max")
|
|
94
|
+
try:
|
|
95
|
+
if z is not None and math.isfinite(float(z)):
|
|
96
|
+
out[str(fam)] = float(z)
|
|
97
|
+
except Exception:
|
|
98
|
+
continue
|
|
99
|
+
q = metrics.get("family_z_quantiles")
|
|
100
|
+
if isinstance(q, dict):
|
|
101
|
+
for fam, vals in q.items():
|
|
102
|
+
if not isinstance(vals, dict):
|
|
103
|
+
continue
|
|
104
|
+
z = vals.get("max")
|
|
105
|
+
try:
|
|
106
|
+
if z is not None and math.isfinite(float(z)):
|
|
107
|
+
out[str(fam)] = max(out.get(str(fam), float("-inf")), float(z))
|
|
108
|
+
except Exception:
|
|
109
|
+
continue
|
|
110
|
+
return out
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _extract_multiple_testing(metrics: dict[str, Any]) -> dict[str, Any]:
|
|
114
|
+
mt = metrics.get("multiple_testing")
|
|
115
|
+
if not isinstance(mt, dict):
|
|
116
|
+
return {}
|
|
117
|
+
out: dict[str, Any] = {}
|
|
118
|
+
method = mt.get("method")
|
|
119
|
+
if isinstance(method, str) and method.strip():
|
|
120
|
+
out["method"] = method.strip().lower()
|
|
121
|
+
try:
|
|
122
|
+
alpha = mt.get("alpha")
|
|
123
|
+
if alpha is not None:
|
|
124
|
+
out["alpha"] = float(alpha)
|
|
125
|
+
except Exception:
|
|
126
|
+
pass
|
|
127
|
+
try:
|
|
128
|
+
m_val = mt.get("m")
|
|
129
|
+
if m_val is not None:
|
|
130
|
+
out["m"] = int(m_val)
|
|
131
|
+
except Exception:
|
|
132
|
+
pass
|
|
133
|
+
return out
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _selected_families_for_alpha(
    pvals: dict[str, float],
    *,
    method: str,
    alpha: float,
    m: int,
) -> set[str]:
    """Apply the named multiple-testing rule to *pvals*.

    ``"bonferroni"`` (case-insensitive, surrounding whitespace ignored)
    selects the Bonferroni rule; any other value, including an empty one,
    uses Benjamini-Hochberg.
    """
    use_bonferroni = (method or "").strip().lower() == "bonferroni"
    reject = _bonferroni_reject_families if use_bonferroni else _bh_reject_families
    return reject(pvals, alpha=alpha, m=m)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def summarize_null_sweep_reports(
    reports: list[dict[str, Any]],
    *,
    tier: str,
    safety_margin: float = 0.05,
    target_any_warning_rate: float = 0.01,
) -> dict[str, Any]:
    """Summarize spectral null-sweep reports and recommend κ caps and alpha.

    Args:
        reports: Run report dicts; non-dict items are skipped.  Each report's
            ``"spectral"`` guard entry (if any) supplies the metrics.
        tier: Tier label; blank values become ``"balanced"``.
        safety_margin: Relative head-room added to each family's largest
            observed z-value.  Values outside [0, 1] are replaced by 0.05.
        target_any_warning_rate: Highest acceptable fraction of runs with any
            selection.  Values outside [0, 1] are replaced by 0.01.

    Returns:
        A dict with ``"tier"``, ``"n_runs"``, an ``"observed"`` section and a
        ``"recommendations"`` section (``"family_caps"`` and
        ``"multiple_testing"``).
    """
    tier_label = (tier or "").strip().lower() or "balanced"

    kappa_margin = float(safety_margin or 0.0)
    if not (0.0 <= kappa_margin <= 1.0):
        kappa_margin = 0.05
    rate_budget = float(target_any_warning_rate or 0.0)
    if not (0.0 <= rate_budget <= 1.0):
        rate_budget = 0.01

    # Multiple-testing settings start from defaults; each report that carries
    # a setting overwrites the current value, so the last one wins.
    method = "bh"
    base_alpha = 0.05
    n_tests = 4

    peak_z: dict[str, float] = {}
    warned: list[bool] = []
    per_run_pvals: list[dict[str, float]] = []
    runs_selecting: Counter[str] = Counter()
    candidate_totals: Counter[str] = Counter()

    for report in reports:
        if not isinstance(report, dict):
            continue
        guard = _extract_guard(report, "spectral") or {}
        raw_metrics = guard.get("metrics")
        guard_metrics = raw_metrics if isinstance(raw_metrics, dict) else {}

        overrides = _extract_multiple_testing(guard_metrics)
        if overrides:
            method = str(overrides.get("method", method))
            if overrides.get("alpha") is not None:
                base_alpha = float(overrides.get("alpha"))
            if overrides.get("m") is not None:
                n_tests = int(overrides.get("m"))

        for family, z_value in _extract_family_max_z(guard_metrics).items():
            peak_z[family] = max(peak_z.get(family, float("-inf")), float(z_value))

        raw_selection = guard_metrics.get("multiple_testing_selection")
        selection = raw_selection if isinstance(raw_selection, dict) else {}

        # Keep only p-values that parse to a finite float in [0, 1]; a run
        # without usable p-values still contributes an empty entry.
        usable: dict[str, float] = {}
        raw_pvals = selection.get("family_pvalues")
        if isinstance(raw_pvals, dict):
            for family, raw_p in raw_pvals.items():
                try:
                    p_value = float(raw_p)
                except Exception:
                    continue
                if _finite01(p_value):
                    usable[str(family)] = p_value
        per_run_pvals.append(usable)

        chosen_families = selection.get("families_selected")
        if isinstance(chosen_families, list):
            runs_selecting.update(str(family) for family in chosen_families)

        violation_counts = selection.get("family_violation_counts")
        if isinstance(violation_counts, dict):
            for family, count in violation_counts.items():
                try:
                    candidate_totals[str(family)] += int(count)
                except Exception:
                    continue

        raw_caps = guard_metrics.get("caps_applied")
        try:
            caps_count = int(raw_caps) if raw_caps is not None else 0
        except Exception:
            caps_count = 0
        warned.append(bool(caps_count) or bool(guard.get("violations", [])))

    observed_rate = sum(warned) / float(max(len(warned), 1))

    # κ recommendation: largest observed z per family plus the margin,
    # rounded so regenerated tiers.yaml diffs stay stable.
    recommended_caps: dict[str, float] = {
        family: float(round(z_value * (1.0 + kappa_margin), 3))
        for family, z_value in sorted(peak_z.items())
        if math.isfinite(z_value)
    }

    def _selection_rate(alpha: float) -> float:
        """Fraction of runs in which *alpha* selects at least one family."""
        hits = sum(
            1
            for run in per_run_pvals
            if _selected_families_for_alpha(run, method=method, alpha=alpha, m=n_tests)
        )
        return hits / float(max(len(per_run_pvals), 1))

    # α calibration: walk down from the configured alpha by halving and take
    # the first (largest) candidate whose selection rate fits the budget.
    recommended_alpha = float(base_alpha)
    if per_run_pvals and observed_rate > rate_budget:
        grid: list[float] = []
        step = float(base_alpha)
        for _ in range(20):
            if step <= 1e-6:
                break
            grid.append(step)
            step *= 0.5
        grid.append(1e-6)
        fitting = next((c for c in grid if _selection_rate(c) <= rate_budget), None)
        if fitting is not None:
            recommended_alpha = float(fitting)

    observed = {
        "any_warning_rate": float(observed_rate),
        "selected_by_family_runs": dict(runs_selecting),
        "candidate_violations_by_family_total": dict(candidate_totals),
        "family_max_z": {
            family: float(z_value)
            for family, z_value in sorted(peak_z.items())
            if math.isfinite(z_value)
        },
    }
    recommendations = {
        "family_caps": recommended_caps,
        "multiple_testing": {
            "method": str(method),
            "alpha": float(recommended_alpha),
            "m": int(n_tests),
        },
    }
    return {
        "tier": tier_label,
        "n_runs": int(len(warned)),
        "observed": observed,
        "recommendations": recommendations,
    }
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
__all__ = ["summarize_null_sweep_reports"]
|
|
invarlock/calibration/variance_ve.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _extract_guard(report: dict[str, Any], name: str) -> dict[str, Any] | None:
|
|
8
|
+
guards = report.get("guards")
|
|
9
|
+
if isinstance(guards, list):
|
|
10
|
+
for item in guards:
|
|
11
|
+
if isinstance(item, dict) and item.get("name") == name:
|
|
12
|
+
return item
|
|
13
|
+
return None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _coerce_delta_ci(value: Any) -> tuple[float, float] | None:
|
|
17
|
+
if not (isinstance(value, tuple | list) and len(value) == 2):
|
|
18
|
+
return None
|
|
19
|
+
try:
|
|
20
|
+
lo = float(value[0])
|
|
21
|
+
hi = float(value[1])
|
|
22
|
+
except Exception:
|
|
23
|
+
return None
|
|
24
|
+
if not (math.isfinite(lo) and math.isfinite(hi)):
|
|
25
|
+
return None
|
|
26
|
+
return (lo, hi)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _gain_lower_bound(
|
|
30
|
+
*, mean_delta: float | None, delta_ci: tuple[float, float] | None, one_sided: bool
|
|
31
|
+
) -> float:
|
|
32
|
+
if delta_ci is None:
|
|
33
|
+
return 0.0
|
|
34
|
+
lo, hi = delta_ci
|
|
35
|
+
if hi >= 0.0:
|
|
36
|
+
return 0.0
|
|
37
|
+
if one_sided and (mean_delta is None or not (mean_delta < 0.0)):
|
|
38
|
+
return 0.0
|
|
39
|
+
# Gain CI lower bound is -upper (worst-case gain).
|
|
40
|
+
return max(0.0, -hi)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _recommend_threshold_for_target_rate(
|
|
44
|
+
gains: list[float],
|
|
45
|
+
*,
|
|
46
|
+
target_rate: float,
|
|
47
|
+
safety_margin: float,
|
|
48
|
+
) -> tuple[float, float]:
|
|
49
|
+
n = len(gains)
|
|
50
|
+
if n <= 0:
|
|
51
|
+
return 0.0, 0.0
|
|
52
|
+
target = float(target_rate)
|
|
53
|
+
if not (0.0 <= target <= 1.0):
|
|
54
|
+
target = 0.05
|
|
55
|
+
desired_passes = int(math.floor(target * n))
|
|
56
|
+
gains_desc = sorted((max(0.0, float(g)) for g in gains), reverse=True)
|
|
57
|
+
|
|
58
|
+
def pass_count(thr: float) -> int:
|
|
59
|
+
return sum(1 for g in gains_desc if g >= thr)
|
|
60
|
+
|
|
61
|
+
if desired_passes <= 0:
|
|
62
|
+
thr = (
|
|
63
|
+
(gains_desc[0] * (1.0 + max(0.0, safety_margin)))
|
|
64
|
+
if gains_desc[0] > 0
|
|
65
|
+
else 0.0
|
|
66
|
+
)
|
|
67
|
+
return float(round(thr, 3)), 0.0
|
|
68
|
+
|
|
69
|
+
unique_vals = sorted(set(gains_desc), reverse=True)
|
|
70
|
+
chosen = None
|
|
71
|
+
chosen_rate = 0.0
|
|
72
|
+
for val in unique_vals:
|
|
73
|
+
cnt = pass_count(val)
|
|
74
|
+
rate = cnt / float(n)
|
|
75
|
+
if cnt <= desired_passes:
|
|
76
|
+
chosen = float(val)
|
|
77
|
+
chosen_rate = float(rate)
|
|
78
|
+
break
|
|
79
|
+
|
|
80
|
+
if chosen is None:
|
|
81
|
+
# Ties at max prevent meeting desired_passes; force zero enable rate.
|
|
82
|
+
thr = gains_desc[0] * (1.0 + max(0.0, safety_margin))
|
|
83
|
+
return float(round(thr, 3)), 0.0
|
|
84
|
+
|
|
85
|
+
thr = chosen * (1.0 + max(0.0, safety_margin))
|
|
86
|
+
return float(round(thr, 3)), float(chosen_rate)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def summarize_ve_sweep_reports(
    reports: list[dict[str, Any]],
    *,
    tier: str,
    target_enable_rate: float = 0.05,
    safety_margin: float = 0.0,
    predictive_one_sided: bool = True,
) -> dict[str, Any]:
    """Summarize VE predictive-gate sweeps and recommend ``min_effect_lognll``.

    Args:
        reports: Run report dicts; non-dict items and reports whose
            ``"variance"`` guard has no ``predictive_gate`` dict are skipped.
        tier: Tier label; blank values become ``"balanced"``.
        target_enable_rate: Passed to the threshold recommendation as the
            target rate.
        safety_margin: Relative inflation for the recommended threshold.
            Values outside [0, 1] are replaced by 0.0.
        predictive_one_sided: When true, a gain is only counted if the gate's
            ``mean_delta`` is present and negative.

    Returns:
        A dict with ``"tier"``, ``"n_runs"``, an ``"observed"`` section and a
        ``"recommendations"`` section.
    """
    tier_label = (tier or "").strip().lower() or "balanced"
    require_negative_mean = bool(predictive_one_sided)
    margin = float(safety_margin or 0.0)
    if not (0.0 <= margin <= 1.0):
        margin = 0.0

    lower_bounds: list[float] = []
    ci_widths: list[float] = []
    n_evaluated = 0

    for report in reports:
        if not isinstance(report, dict):
            continue
        guard = _extract_guard(report, "variance") or {}
        raw_metrics = guard.get("metrics")
        guard_metrics = raw_metrics if isinstance(raw_metrics, dict) else {}
        gate = guard_metrics.get("predictive_gate")
        if not isinstance(gate, dict):
            continue
        # Only a literal True counts as evaluated.
        if gate.get("evaluated") is True:
            n_evaluated += 1

        interval = _coerce_delta_ci(gate.get("delta_ci"))
        raw_mean = gate.get("mean_delta")
        try:
            mean_value = float(raw_mean) if raw_mean is not None else None
        except Exception:
            mean_value = None

        lower_bounds.append(
            _gain_lower_bound(
                mean_delta=mean_value,
                delta_ci=interval,
                one_sided=require_negative_mean,
            )
        )
        if interval is not None:
            ci_widths.append(float(abs(interval[1] - interval[0])))

    threshold, enable_rate = _recommend_threshold_for_target_rate(
        lower_bounds, target_rate=target_enable_rate, safety_margin=margin
    )

    if ci_widths:
        average_width = float(sum(ci_widths) / max(len(ci_widths), 1))
    else:
        average_width = None

    observed = {
        "evaluated_runs": int(n_evaluated),
        "mean_ci_width": average_width,
    }
    recommendations = {
        "min_effect_lognll": float(threshold),
        "expected_enable_rate": float(enable_rate),
    }
    return {
        "tier": tier_label,
        "n_runs": int(len(lower_bounds)),
        "observed": observed,
        "recommendations": recommendations,
    }
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
__all__ = ["summarize_ve_sweep_reports"]
|
invarlock/cli/app.py
CHANGED
|
@@ -34,6 +34,7 @@ class OrderedGroup(TyperGroup):
|
|
|
34
34
|
def list_commands(self, ctx): # type: ignore[override]
|
|
35
35
|
return [
|
|
36
36
|
"certify",
|
|
37
|
+
"calibrate",
|
|
37
38
|
"report",
|
|
38
39
|
"verify",
|
|
39
40
|
"run",
|
|
@@ -161,10 +162,24 @@ def _register_subapps() -> None:
|
|
|
161
162
|
from .commands.plugins import plugins_app as _plugins_app
|
|
162
163
|
from .commands.report import report_app as _report_app
|
|
163
164
|
|
|
165
|
+
# Always-available subapps (lightweight imports)
|
|
164
166
|
app.add_typer(_report_app, name="report")
|
|
165
167
|
app.add_typer(_plugins_app, name="plugins")
|
|
166
168
|
app.command(name="doctor")(_doctor_cmd)
|
|
167
169
|
|
|
170
|
+
# Optional: calibration subapp. This transitively imports guards, which may
|
|
171
|
+
# depend on torch/transformers. In minimal environments (no heavy deps),
|
|
172
|
+
# skip registration so `python -m invarlock --help` stays import-safe.
|
|
173
|
+
try:
|
|
174
|
+
from .commands.calibrate import calibrate_app as _calibrate_app
|
|
175
|
+
except ModuleNotFoundError as exc: # pragma: no cover - exercised in venv test
|
|
176
|
+
missing = getattr(exc, "name", "") or ""
|
|
177
|
+
if missing in {"torch", "transformers"}:
|
|
178
|
+
return
|
|
179
|
+
raise
|
|
180
|
+
else:
|
|
181
|
+
app.add_typer(_calibrate_app, name="calibrate")
|
|
182
|
+
|
|
168
183
|
|
|
169
184
|
@app.command(
|
|
170
185
|
name="verify",
|