claude-turing 3.0.0 → 3.2.0

@@ -0,0 +1,364 @@
+ #!/usr/bin/env python3
+ """Probability calibration for the autoresearch pipeline.
+
+ Measures whether model probabilities are well-calibrated, computes ECE/MCE,
+ generates reliability diagrams, and applies post-hoc calibration (Platt
+ scaling, isotonic regression, temperature scaling).
+
+ Usage:
+     python scripts/calibration.py exp-042
+     python scripts/calibration.py exp-042 --method platt
+     python scripts/calibration.py exp-042 --method auto
+     python scripts/calibration.py --json
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import sys
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+ import numpy as np
+ import yaml
+
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
+ DEFAULT_N_BINS = 10
+ CALIBRATION_METHODS = ["platt", "isotonic", "temperature"]
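+ # Platt and temperature scaling operate on logits; isotonic regression works
+ # directly on probabilities, so calibrate_model() below skips any method whose
+ # required input was not supplied.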
+
+
+ # --- Calibration Metrics ---
+
+
+ def compute_ece(
+     probabilities: np.ndarray,
+     labels: np.ndarray,
+     n_bins: int = DEFAULT_N_BINS,
+ ) -> float:
+     """Compute Expected Calibration Error.
+
+     ECE = sum(|bin_accuracy - bin_confidence| * bin_size / total)
+     """
+     if len(probabilities) == 0:
+         return 0.0
+
+     bin_boundaries = np.linspace(0, 1, n_bins + 1)
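+     # Equal-width bins over [0, 1]; the loop below treats the last bin as
+     # closed on the right so predictions of exactly 1.0 are still counted.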
+     ece = 0.0
+
+     for i in range(n_bins):
+         mask = (probabilities >= bin_boundaries[i]) & (probabilities < bin_boundaries[i + 1])
+         if i == n_bins - 1:
+             mask = (probabilities >= bin_boundaries[i]) & (probabilities <= bin_boundaries[i + 1])
+
+         bin_size = np.sum(mask)
+         if bin_size == 0:
+             continue
+
+         bin_accuracy = np.mean(labels[mask])
+         bin_confidence = np.mean(probabilities[mask])
+         ece += abs(bin_accuracy - bin_confidence) * bin_size / len(probabilities)
+
+     return round(float(ece), 6)
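+
+ # Sanity check (hypothetical data): four predictions at p=0.75 with exactly
+ # three correct all fall into the single [0.7, 0.8) bin, where confidence
+ # (0.75) equals accuracy (0.75), so compute_ece returns 0.0.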
+
+
+ def compute_mce(
+     probabilities: np.ndarray,
+     labels: np.ndarray,
+     n_bins: int = DEFAULT_N_BINS,
+ ) -> float:
+     """Compute Maximum Calibration Error."""
+     if len(probabilities) == 0:
+         return 0.0
+
+     bin_boundaries = np.linspace(0, 1, n_bins + 1)
+     max_gap = 0.0
+
+     for i in range(n_bins):
+         mask = (probabilities >= bin_boundaries[i]) & (probabilities < bin_boundaries[i + 1])
+         if i == n_bins - 1:
+             mask = (probabilities >= bin_boundaries[i]) & (probabilities <= bin_boundaries[i + 1])
+
+         if np.sum(mask) == 0:
+             continue
+
+         bin_accuracy = np.mean(labels[mask])
+         bin_confidence = np.mean(probabilities[mask])
+         max_gap = max(max_gap, abs(bin_accuracy - bin_confidence))
+
+     return round(float(max_gap), 6)
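+
+ # MCE reports the single worst bin gap, so ECE <= MCE always; a large MCE with
+ # a small ECE usually points to one badly calibrated, sparsely populated bin.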
+
+
+ def compute_reliability_diagram(
+     probabilities: np.ndarray,
+     labels: np.ndarray,
+     n_bins: int = DEFAULT_N_BINS,
+ ) -> list[dict]:
+     """Compute reliability diagram data."""
+     if len(probabilities) == 0:
+         return []
+
+     bin_boundaries = np.linspace(0, 1, n_bins + 1)
+     bins = []
+
+     for i in range(n_bins):
+         lo = bin_boundaries[i]
+         hi = bin_boundaries[i + 1]
+         mask = (probabilities >= lo) & (probabilities < hi)
+         if i == n_bins - 1:
+             mask = (probabilities >= lo) & (probabilities <= hi)
+
+         bin_size = int(np.sum(mask))
+         if bin_size == 0:
+             bins.append({"bin": f"[{lo:.1f}-{hi:.1f}]", "predicted": None,
+                          "actual": None, "gap": None, "n": 0})
+             continue
+
+         predicted = float(np.mean(probabilities[mask]))
+         actual = float(np.mean(labels[mask]))
+         gap = actual - predicted
+
+         bins.append({
+             "bin": f"[{lo:.1f}-{hi:.1f}]",
+             "predicted": round(predicted, 4),
+             "actual": round(actual, 4),
+             "gap": round(gap, 4),
+             "n": bin_size,
+         })
+
+     return bins
+
+
+ # --- Calibration Methods ---
+
+
+ def platt_scaling(
+     logits: np.ndarray,
+     labels: np.ndarray,
+ ) -> dict:
+     """Apply Platt scaling (logistic regression on logits)."""
+     from scipy.special import expit
+
+     # Fit logistic regression: P(y=1|f) = sigmoid(a*f + b)
+     # Simple gradient descent for a, b
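+     # The updates below are the exact gradients of the mean log-loss
+     # L = -mean(y*log(p) + (1-y)*log(1-p)) with p = sigmoid(a*f + b):
+     #   dL/da = mean((p - y) * f),  dL/db = mean(p - y)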
+     a, b = 1.0, 0.0
+     lr = 0.01
+     for _ in range(1000):
+         pred = expit(a * logits + b)
+         pred = np.clip(pred, 1e-7, 1 - 1e-7)
+         grad_a = np.mean((pred - labels) * logits)
+         grad_b = np.mean(pred - labels)
+         a -= lr * grad_a
+         b -= lr * grad_b
+
+     calibrated = expit(a * logits + b)
+     return {"method": "platt", "params": {"a": round(float(a), 6), "b": round(float(b), 6)},
+             "calibrated_probabilities": calibrated}
+
+
+ def isotonic_calibration(
+     probabilities: np.ndarray,
+     labels: np.ndarray,
+ ) -> dict:
+     """Apply isotonic regression calibration."""
+     from sklearn.isotonic import IsotonicRegression
+
+     iso = IsotonicRegression(out_of_bounds="clip")
+     calibrated = iso.fit_transform(probabilities, labels)
+     return {"method": "isotonic", "params": {},
+             "calibrated_probabilities": np.clip(calibrated, 0, 1)}
+
+
+ def temperature_scaling(
+     logits: np.ndarray,
+     labels: np.ndarray,
+ ) -> dict:
+     """Apply temperature scaling (single parameter T)."""
+     from scipy.special import expit
+
+     best_t = 1.0
+     best_ece = float("inf")
+
+     for t in np.arange(0.5, 5.0, 0.1):
+         scaled = expit(logits / t)
+         ece = compute_ece(scaled, labels)
+         if ece < best_ece:
+             best_ece = ece
+             best_t = t
+
+     calibrated = expit(logits / best_t)
+     return {"method": "temperature", "params": {"T": round(float(best_t), 2)},
+             "calibrated_probabilities": calibrated}
+
+
+ # --- Full Pipeline ---
+
+
+ def calibrate_model(
+     probabilities: np.ndarray | None = None,
+     logits: np.ndarray | None = None,
+     labels: np.ndarray | None = None,
+     method: str = "auto",
+     exp_id: str | None = None,
+     config_path: str = "config.yaml",
+ ) -> dict:
+     """Run calibration analysis and optionally apply post-hoc calibration."""
+     if (probabilities is None and logits is None) or labels is None:
+         return {"error": "Provide probabilities (or logits) and labels for calibration"}
+
+     if probabilities is None and logits is not None:
+         from scipy.special import expit
+         probabilities = expit(logits)
+
+     # Before calibration
+     ece_before = compute_ece(probabilities, labels)
+     mce_before = compute_mce(probabilities, labels)
+     reliability = compute_reliability_diagram(probabilities, labels)
+
+     # Determine overconfidence
+     overconfident_bins = [b for b in reliability
+                           if b.get("gap") is not None and b["gap"] < -0.05 and b["n"] > 0]
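+     # gap = actual - predicted, so gap < -0.05 flags bins where the model's
+     # stated confidence exceeds its realized accuracy by more than 5 points.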
+
+     report = {
+         "generated_at": datetime.now(timezone.utc).isoformat(),
+         "experiment_id": exp_id,
+         "before": {"ece": ece_before, "mce": mce_before},
+         "reliability_diagram": reliability,
+         "overconfident_bins": len(overconfident_bins),
+     }
+
+     # Apply calibration
+     methods_to_try = CALIBRATION_METHODS if method == "auto" else [method]
+     results = []
+
+     for m in methods_to_try:
+         try:
+             if m == "platt" and logits is not None:
+                 cal = platt_scaling(logits, labels)
+             elif m == "isotonic":
+                 cal = isotonic_calibration(probabilities, labels)
+             elif m == "temperature" and logits is not None:
+                 cal = temperature_scaling(logits, labels)
+             else:
+                 continue
+
+             ece_after = compute_ece(cal["calibrated_probabilities"], labels)
+             results.append({
+                 "method": m,
+                 "ece_after": ece_after,
+                 "improvement": round(ece_before - ece_after, 6),
+                 "params": cal.get("params", {}),
+             })
+         except Exception:
+             continue
+
+     # Find best method (lowest post-calibration ECE)
+     best = None
+     if results:
+         best = min(results, key=lambda r: r["ece_after"])
+
+     report["calibration_results"] = results
+     report["best_method"] = best
+
+     # Verdict
+     if ece_before < 0.02:
+         report["verdict"] = "already_calibrated"
+         report["reason"] = f"ECE {ece_before:.4f} is already low; calibration not needed"
+     elif best and best["improvement"] > 0.01:
+         report["verdict"] = "improved"
+         report["reason"] = f"{best['method']} reduces ECE from {ece_before:.4f} to {best['ece_after']:.4f}"
+     elif best:
+         report["verdict"] = "marginal_improvement"
+         report["reason"] = f"Best method ({best['method']}) improves ECE by only {best['improvement']:.4f}"
+     else:
+         report["verdict"] = "no_improvement"
+         report["reason"] = "No calibration method improved ECE"
+
+     return report
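+
+ # Example call (hypothetical arrays f, y):
+ #   report = calibrate_model(logits=f, labels=y, method="auto", exp_id="exp-042")
+ #   report["best_method"]  # e.g. {"method": "temperature", "ece_after": 0.0312, ...}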
+
+
+ # --- Report Formatting ---
+
+
+ def save_calibration_report(report: dict, output_dir: str = "experiments/calibration") -> Path:
+     out_path = Path(output_dir)
+     out_path.mkdir(parents=True, exist_ok=True)
+     # "experiment_id" may be present but None, so `or` is needed for the fallback
+     exp_id = report.get("experiment_id") or "unknown"
+     filepath = out_path / f"{exp_id}-calibration.yaml"
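+     # Round-tripping through JSON (with default=str) coerces anything that is
+     # not JSON-serializable (numpy types, datetimes) to strings before the dump.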
+     clean = json.loads(json.dumps(report, default=str))
+     with open(filepath, "w") as f:
+         yaml.dump(clean, f, default_flow_style=False, sort_keys=False)
+     return filepath
+
+
+ def format_calibration_report(report: dict) -> str:
+     if "error" in report:
+         return f"ERROR: {report['error']}"
+
+     exp_id = report.get("experiment_id", "?")
+     before = report.get("before", {})
+
+     lines = [f"# Calibration: {exp_id}", "",
+              f"*Generated {report.get('generated_at', 'N/A')[:19]}*", "",
+              f"**ECE before:** {before.get('ece', '?')}",
+              f"**MCE before:** {before.get('mce', '?')}", ""]
+
+     # Reliability diagram
+     diagram = report.get("reliability_diagram", [])
+     if diagram:
+         lines.extend(["## Reliability Diagram", "",
+                       "| Bin | Predicted | Actual | Gap |",
+                       "|-----|-----------|--------|-----|"])
+         for b in diagram:
+             if b["predicted"] is not None:
+                 gap_marker = " overconfident" if b["gap"] is not None and b["gap"] < -0.05 else ""
+                 lines.append(f"| {b['bin']} | {b['predicted']:.4f} | {b['actual']:.4f} | {b['gap']:+.4f}{gap_marker} |")
+         lines.append("")
+
+     # Calibration results
+     results = report.get("calibration_results", [])
+     if results:
+         lines.extend(["## Calibration Methods", "",
+                       "| Method | ECE After | Improvement |",
+                       "|--------|-----------|-------------|"])
+         # best_method may be None, so fall back to {} before calling .get()
+         best = report.get("best_method") or {}
+         for r in results:
+             # Tag the winner inside the first cell so the table row stays valid
+             marker = " BEST" if r["method"] == best.get("method") else ""
+             lines.append(f"| {r['method']}{marker} | {r['ece_after']:.4f} | {r['improvement']:+.4f} |")
+         lines.append("")
+
+     # Verdict
+     verdict = report.get("verdict", "?")
+     labels = {"already_calibrated": "ALREADY CALIBRATED", "improved": "IMPROVED",
+               "marginal_improvement": "MARGINAL IMPROVEMENT", "no_improvement": "NO IMPROVEMENT"}
+     lines.extend(["## Verdict", "", f"**{labels.get(verdict, verdict.upper())}**", "",
+                   report.get("reason", "")])
+
+     return "\n".join(lines)
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(description="Probability calibration")
+     parser.add_argument("exp_id", nargs="?", help="Experiment ID")
+     parser.add_argument("--method", choices=CALIBRATION_METHODS + ["auto"], default="auto")
+     parser.add_argument("--config", default="config.yaml")
+     parser.add_argument("--log", default=DEFAULT_LOG_PATH)  # accepted but not read yet
+     parser.add_argument("--json", action="store_true")
+     args = parser.parse_args()
+
+     # No experiment data is loaded yet: called without probabilities/labels,
+     # calibrate_model() returns its usage-error report, which is printed below.
+     report = calibrate_model(exp_id=args.exp_id, method=args.method, config_path=args.config)
+
+     if "error" not in report:
+         filepath = save_calibration_report(report)
+         print(f"Saved to {filepath}", file=sys.stderr)
+
+     if args.json:
+         print(json.dumps(report, indent=2, default=str))
+     else:
+         print(format_calibration_report(report))
+
+
+ if __name__ == "__main__":
+     main()