claude-turing 3.0.0 → 3.2.0
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +8 -2
- package/commands/baseline.md +45 -0
- package/commands/calibrate.md +47 -0
- package/commands/leak.md +47 -0
- package/commands/sanity.md +48 -0
- package/commands/sensitivity.md +41 -0
- package/commands/turing.md +12 -0
- package/commands/xray.md +43 -0
- package/package.json +1 -1
- package/src/install.js +2 -0
- package/src/verify.js +6 -0
- package/templates/scripts/__pycache__/calibration.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_baselines.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/leakage_detector.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/model_xray.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sanity_checks.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sensitivity_analysis.cpython-314.pyc +0 -0
- package/templates/scripts/calibration.py +364 -0
- package/templates/scripts/generate_baselines.py +423 -0
- package/templates/scripts/leakage_detector.py +402 -0
- package/templates/scripts/model_xray.py +317 -0
- package/templates/scripts/sanity_checks.py +503 -0
- package/templates/scripts/scaffold.py +12 -0
- package/templates/scripts/sensitivity_analysis.py +335 -0
package/templates/scripts/calibration.py (new file)

@@ -0,0 +1,364 @@

```python
#!/usr/bin/env python3
"""Probability calibration for the autoresearch pipeline.

Measures whether model probabilities are well-calibrated, computes ECE/MCE,
generates reliability diagrams, and applies post-hoc calibration (Platt
scaling, isotonic regression, temperature scaling).

Usage:
    python scripts/calibration.py exp-042
    python scripts/calibration.py exp-042 --method platt
    python scripts/calibration.py exp-042 --method auto
    python scripts/calibration.py --json
"""

from __future__ import annotations

import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import yaml

from scripts.turing_io import load_config, load_experiments

DEFAULT_LOG_PATH = "experiments/log.jsonl"
DEFAULT_N_BINS = 10
CALIBRATION_METHODS = ["platt", "isotonic", "temperature"]


# --- Calibration Metrics ---


def compute_ece(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = DEFAULT_N_BINS,
) -> float:
    """Compute Expected Calibration Error.

    ECE = sum(|bin_accuracy - bin_confidence| * bin_size / total)
    """
    if len(probabilities) == 0:
        return 0.0

    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    ece = 0.0

    for i in range(n_bins):
        mask = (probabilities >= bin_boundaries[i]) & (probabilities < bin_boundaries[i + 1])
        if i == n_bins - 1:
            mask = (probabilities >= bin_boundaries[i]) & (probabilities <= bin_boundaries[i + 1])

        bin_size = np.sum(mask)
        if bin_size == 0:
            continue

        bin_accuracy = np.mean(labels[mask])
        bin_confidence = np.mean(probabilities[mask])
        ece += abs(bin_accuracy - bin_confidence) * bin_size / len(probabilities)

    return round(float(ece), 6)


def compute_mce(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = DEFAULT_N_BINS,
) -> float:
    """Compute Maximum Calibration Error."""
    if len(probabilities) == 0:
        return 0.0

    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    max_gap = 0.0

    for i in range(n_bins):
        mask = (probabilities >= bin_boundaries[i]) & (probabilities < bin_boundaries[i + 1])
        if i == n_bins - 1:
            mask = (probabilities >= bin_boundaries[i]) & (probabilities <= bin_boundaries[i + 1])

        if np.sum(mask) == 0:
            continue

        bin_accuracy = np.mean(labels[mask])
        bin_confidence = np.mean(probabilities[mask])
        max_gap = max(max_gap, abs(bin_accuracy - bin_confidence))

    return round(float(max_gap), 6)


def compute_reliability_diagram(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = DEFAULT_N_BINS,
) -> list[dict]:
    """Compute reliability diagram data."""
    if len(probabilities) == 0:
        return []

    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    bins = []

    for i in range(n_bins):
        lo = bin_boundaries[i]
        hi = bin_boundaries[i + 1]
        mask = (probabilities >= lo) & (probabilities < hi)
        if i == n_bins - 1:
            mask = (probabilities >= lo) & (probabilities <= hi)

        bin_size = int(np.sum(mask))
        if bin_size == 0:
            bins.append({"bin": f"[{lo:.1f}-{hi:.1f}]", "predicted": None,
                         "actual": None, "gap": None, "n": 0})
            continue

        predicted = float(np.mean(probabilities[mask]))
        actual = float(np.mean(labels[mask]))
        gap = actual - predicted

        bins.append({
            "bin": f"[{lo:.1f}-{hi:.1f}]",
            "predicted": round(predicted, 4),
            "actual": round(actual, 4),
            "gap": round(gap, 4),
            "n": bin_size,
        })

    return bins
```
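To make the ECE definition above concrete, here is a small worked check. It is not part of the package: the data are made up, and it assumes `compute_ece` is importable (the module also imports `scripts.turing_io` at load time, which must be on the path).

```python
import numpy as np

# Hypothetical inputs, chosen so one bin is easy to verify by hand.
probs = np.array([0.95, 0.90, 0.92, 0.15, 0.12, 0.55, 0.58, 0.61, 0.85, 0.88])
labels = np.array([1, 1, 0, 0, 0, 1, 0, 1, 1, 0])

# The closing [0.9, 1.0] bin holds 0.95, 0.90, 0.92: confidence ~0.9233,
# accuracy 2/3 ~ 0.6667, so it contributes |0.6667 - 0.9233| * 3/10 ~ 0.0770.
# The other occupied bins contribute analogously; ECE is the weighted sum.
print(compute_ece(probs, labels))
```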
calibration.py, continued:

```python
# --- Calibration Methods ---


def platt_scaling(
    logits: np.ndarray,
    labels: np.ndarray,
) -> dict:
    """Apply Platt scaling (logistic regression on logits)."""
    from scipy.special import expit

    # Fit logistic regression: P(y=1|f) = sigmoid(a*f + b)
    # Simple gradient descent for a, b
    a, b = 1.0, 0.0
    lr = 0.01
    for _ in range(1000):
        pred = expit(a * logits + b)
        pred = np.clip(pred, 1e-7, 1 - 1e-7)
        grad_a = np.mean((pred - labels) * logits)
        grad_b = np.mean(pred - labels)
        a -= lr * grad_a
        b -= lr * grad_b

    calibrated = expit(a * logits + b)
    return {"method": "platt", "params": {"a": round(float(a), 6), "b": round(float(b), 6)},
            "calibrated_probabilities": calibrated}


def isotonic_calibration(
    probabilities: np.ndarray,
    labels: np.ndarray,
) -> dict:
    """Apply isotonic regression calibration."""
    from sklearn.isotonic import IsotonicRegression

    iso = IsotonicRegression(out_of_bounds="clip")
    calibrated = iso.fit_transform(probabilities, labels)
    return {"method": "isotonic", "params": {},
            "calibrated_probabilities": np.clip(calibrated, 0, 1)}


def temperature_scaling(
    logits: np.ndarray,
    labels: np.ndarray,
) -> dict:
    """Apply temperature scaling (single parameter T)."""
    from scipy.special import expit

    best_t = 1.0
    best_ece = float("inf")

    for t in np.arange(0.5, 5.0, 0.1):
        scaled = expit(logits / t)
        ece = compute_ece(scaled, labels)
        if ece < best_ece:
            best_ece = ece
            best_t = t

    calibrated = expit(logits / best_t)
    return {"method": "temperature", "params": {"T": round(float(best_t), 2)},
            "calibrated_probabilities": calibrated}
```
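A quick way to sanity-check these fitters is synthetic data whose miscalibration is known in advance. In the sketch below (illustrative only; the constants are arbitrary), labels are drawn from `expit(true_logits)`, so the true logits are calibrated by construction, while the model's scores inflate them threefold; `temperature_scaling` should then recover a temperature near 3.0, since dividing by it undoes the inflation.

```python
import numpy as np
from scipy.special import expit

rng = np.random.default_rng(0)
true_logits = rng.normal(0.0, 2.0, size=2000)
labels = (rng.random(2000) < expit(true_logits)).astype(float)  # calibrated by construction
overconfident = 3.0 * true_logits                               # deliberately overconfident scores

fit = temperature_scaling(overconfident, labels)
print(fit["params"])  # expect T close to 3.0 (the grid covers 0.5..4.9 in 0.1 steps)
print(compute_ece(expit(overconfident), labels), "->",
      compute_ece(fit["calibrated_probabilities"], labels))
```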
calibration.py, continued:

```python
# --- Full Pipeline ---


def calibrate_model(
    probabilities: np.ndarray | None = None,
    logits: np.ndarray | None = None,
    labels: np.ndarray | None = None,
    method: str = "auto",
    exp_id: str | None = None,
    config_path: str = "config.yaml",
) -> dict:
    """Run calibration analysis and optionally apply post-hoc calibration."""
    if (probabilities is None and logits is None) or labels is None:
        return {"error": "Provide probabilities (or logits) and labels for calibration"}

    if probabilities is None and logits is not None:
        from scipy.special import expit
        probabilities = expit(logits)

    # Before calibration
    ece_before = compute_ece(probabilities, labels)
    mce_before = compute_mce(probabilities, labels)
    reliability = compute_reliability_diagram(probabilities, labels)

    # Determine overconfidence
    overconfident_bins = [b for b in reliability if b.get("gap") is not None and b["gap"] < -0.05 and b["n"] > 0]

    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "experiment_id": exp_id,
        "before": {"ece": ece_before, "mce": mce_before},
        "reliability_diagram": reliability,
        "overconfident_bins": len(overconfident_bins),
    }

    # Apply calibration
    methods_to_try = CALIBRATION_METHODS if method == "auto" else [method]
    results = []

    for m in methods_to_try:
        try:
            if m == "platt" and logits is not None:
                cal = platt_scaling(logits, labels)
            elif m == "isotonic":
                cal = isotonic_calibration(probabilities, labels)
            elif m == "temperature" and logits is not None:
                cal = temperature_scaling(logits, labels)
            else:
                continue

            ece_after = compute_ece(cal["calibrated_probabilities"], labels)
            results.append({
                "method": m,
                "ece_after": ece_after,
                "improvement": round(ece_before - ece_after, 6),
                "params": cal.get("params", {}),
            })
        except Exception:
            continue

    # Find best method
    best = None
    if results:
        best = min(results, key=lambda r: r["ece_after"])

    report["calibration_results"] = results
    report["best_method"] = best

    # Verdict
    if ece_before < 0.02:
        report["verdict"] = "already_calibrated"
        report["reason"] = f"ECE {ece_before:.4f} is already low — calibration not needed"
    elif best and best["improvement"] > 0.01:
        report["verdict"] = "improved"
        report["reason"] = f"{best['method']} reduces ECE from {ece_before:.4f} to {best['ece_after']:.4f}"
    elif best:
        report["verdict"] = "marginal_improvement"
        report["reason"] = f"Best method ({best['method']}) improves ECE by only {best['improvement']:.4f}"
    else:
        report["verdict"] = "no_improvement"
        report["reason"] = "No calibration method improved ECE"

    return report


# --- Report Formatting ---


def save_calibration_report(report: dict, output_dir: str = "experiments/calibration") -> Path:
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)
    exp_id = report.get("experiment_id", "unknown")
    filepath = out_path / f"{exp_id}-calibration.yaml"
    clean = json.loads(json.dumps(report, default=str))
    with open(filepath, "w") as f:
        yaml.dump(clean, f, default_flow_style=False, sort_keys=False)
    return filepath


def format_calibration_report(report: dict) -> str:
    if "error" in report:
        return f"ERROR: {report['error']}"

    exp_id = report.get("experiment_id", "?")
    before = report.get("before", {})

    lines = [f"# Calibration: {exp_id}", "",
             f"*Generated {report.get('generated_at', 'N/A')[:19]}*", "",
             f"**ECE before:** {before.get('ece', '?')}",
             f"**MCE before:** {before.get('mce', '?')}", ""]

    # Reliability diagram
    diagram = report.get("reliability_diagram", [])
    if diagram:
        lines.extend(["## Reliability Diagram", "",
                      "| Bin | Predicted | Actual | Gap |",
                      "|-----|-----------|--------|-----|"])
        for b in diagram:
            if b["predicted"] is not None:
                gap_marker = " overconfident" if b["gap"] is not None and b["gap"] < -0.05 else ""
                lines.append(f"| {b['bin']} | {b['predicted']:.4f} | {b['actual']:.4f} | {b['gap']:+.4f}{gap_marker} |")
        lines.append("")

    # Calibration results
    results = report.get("calibration_results", [])
    if results:
        lines.extend(["## Calibration Methods", "",
                      "| Method | ECE After | Improvement |",
                      "|--------|-----------|-------------|"])
        best = report.get("best_method", {})
        for r in results:
            marker = " BEST" if r["method"] == best.get("method") else ""
            lines.append(f"| {r['method']} | {r['ece_after']:.4f} | {r['improvement']:+.4f} |{marker}")
        lines.append("")

    # Verdict
    verdict = report.get("verdict", "?")
    labels = {"already_calibrated": "ALREADY CALIBRATED", "improved": "IMPROVED",
              "marginal_improvement": "MARGINAL IMPROVEMENT", "no_improvement": "NO IMPROVEMENT"}
    lines.extend(["## Verdict", "", f"**{labels.get(verdict, verdict.upper())}**", "",
                  report.get("reason", "")])

    return "\n".join(lines)


def main() -> None:
    parser = argparse.ArgumentParser(description="Probability calibration")
    parser.add_argument("exp_id", nargs="?", help="Experiment ID")
    parser.add_argument("--method", choices=CALIBRATION_METHODS + ["auto"], default="auto")
    parser.add_argument("--config", default="config.yaml")
    parser.add_argument("--log", default=DEFAULT_LOG_PATH)
    parser.add_argument("--json", action="store_true")
    args = parser.parse_args()

    # Without data, show usage
    report = calibrate_model(exp_id=args.exp_id, method=args.method, config_path=args.config)

    if "error" not in report:
        filepath = save_calibration_report(report)
        print(f"Saved to {filepath}", file=sys.stderr)

    if args.json:
        print(json.dumps(report, indent=2, default=str))
    else:
        print(format_calibration_report(report))


if __name__ == "__main__":
    main()
```
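Note that `main()` invokes `calibrate_model` without probabilities or labels (per its own `# Without data, show usage` comment), so the CLI as shipped always takes the error path and prints `ERROR: Provide probabilities (or logits) and labels for calibration`; the working entry point is programmatic. A minimal end-to-end sketch with synthetic data (assumed arrays, not package fixtures):

```python
import numpy as np
from scipy.special import expit

rng = np.random.default_rng(42)
true_logits = rng.normal(0.0, 1.5, size=1000)
labels = (rng.random(1000) < expit(true_logits)).astype(float)

# Overconfident scores: calibrate_model derives probabilities via expit,
# measures ECE/MCE, then tries platt, isotonic, and temperature scaling.
report = calibrate_model(logits=2.5 * true_logits, labels=labels,
                         method="auto", exp_id="exp-042")
print(report["verdict"])                  # e.g. "improved"
print(format_calibration_report(report))  # markdown report with reliability table
# save_calibration_report(report) would write
# experiments/calibration/exp-042-calibration.yaml
```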