claude-turing 3.1.0 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -121,6 +121,11 @@ TEMPLATE_DIRS = {
  "sanity_checks.py",
  "generate_baselines.py",
  "leakage_detector.py",
+ "model_xray.py",
+ "sensitivity_analysis.py",
+ "calibration.py",
+ "feature_intelligence.py",
+ "curriculum_optimizer.py",
  ],
  "tests": ["__init__.py", "conftest.py"],
  }
@@ -154,6 +159,11 @@ DIRECTORIES_TO_CREATE = [
  "experiments/sanity",
  "experiments/baselines",
  "experiments/leakage",
+ "experiments/xrays",
+ "experiments/sensitivity",
+ "experiments/calibration",
+ "experiments/features",
+ "experiments/curriculum",
  "experiments/logs",
  "models/best",
  "models/archive",
@@ -0,0 +1,341 @@
+ #!/usr/bin/env python3
+ """Hyperparameter sensitivity analysis for the autoresearch pipeline.
+
+ Varies each hyperparameter individually while holding others fixed,
+ measures the metric response, and ranks hyperparameters by sensitivity.
+ Answers "which hyperparameters actually matter?"
+
+ Usage:
+     python scripts/sensitivity_analysis.py exp-042
+     python scripts/sensitivity_analysis.py --params "learning_rate,max_depth"
+     python scripts/sensitivity_analysis.py --json
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import sys
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+ import numpy as np
+ import yaml
+
+ from scripts.turing_io import load_config
+
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
+ DEFAULT_N_POINTS = 5
+ SENSITIVITY_THRESHOLDS = {"HIGH": 0.02, "MED": 0.005, "LOW": 0.002}
+ DEFAULT_MULTIPLIERS = [0.5, 0.75, 1.0, 1.5, 2.0]
+
+
+ # --- Sweep Generation ---
+
+
+ def generate_sweep(
+     param_name: str,
+     current_value: float | int,
+     n_points: int = DEFAULT_N_POINTS,
+     multipliers: list[float] | None = None,
+ ) -> list[dict]:
+     """Generate sweep values for a hyperparameter.
+
+     Returns list of {value, multiplier} dicts.
+     """
+     if multipliers is None:
+         multipliers = DEFAULT_MULTIPLIERS[:n_points]
+
+     points = []
+     for m in multipliers:
+         if isinstance(current_value, int):
+             val = max(1, int(current_value * m))
+         else:
+             val = current_value * m
+         points.append({
+             "value": val,
+             "multiplier": round(m, 2),
+             "is_current": abs(m - 1.0) < 0.01,
+         })
+
+     return points
+
+
+ def extract_tunable_params(config: dict) -> dict:
+     """Extract tunable hyperparameters from config."""
+     hyperparams = config.get("model", {}).get("hyperparams", {})
+
+     tunable = {}
+     for key, val in hyperparams.items():
+         if isinstance(val, (int, float)) and key not in ("seed", "random_state", "verbose"):
+             tunable[key] = val
+
+     return tunable
+
+
+ # --- Sensitivity Scoring ---
+
+
+ def compute_sensitivity(
+     param_name: str,
+     sweep_results: list[dict],
+     primary_metric: str,
+ ) -> dict:
+     """Compute sensitivity score for a hyperparameter.
+
+     Args:
+         param_name: Hyperparameter name.
+         sweep_results: List of {value, metric_value} dicts.
+         primary_metric: Name of the primary metric.
+
+     Returns:
+         Sensitivity dict with score, level, range, best value, monotonicity.
+     """
+     if not sweep_results or len(sweep_results) < 2:
+         return {"param": param_name, "sensitivity": 0, "level": "NONE",
+                 "reason": "Insufficient sweep data"}
+
+     # Filter values and metrics together so their indices stay aligned
+     paired = [(r.get("value"), r.get("metric_value")) for r in sweep_results
+               if r.get("metric_value") is not None]
+
+     if len(paired) < 2:
+         return {"param": param_name, "sensitivity": 0, "level": "NONE",
+                 "reason": "Insufficient metric data"}
+
+     values = [v for v, _ in paired]
+     metrics = [m for _, m in paired]
+
+     metric_range = max(metrics) - min(metrics)
+     metric_mean = np.mean(metrics)
+
+     # Normalized sensitivity
+     sensitivity = metric_range / abs(metric_mean) if metric_mean != 0 else metric_range
+
+     # Classify level
+     if sensitivity > SENSITIVITY_THRESHOLDS["HIGH"]:
+         level = "HIGH"
+     elif sensitivity > SENSITIVITY_THRESHOLDS["MED"]:
+         level = "MED"
+     elif sensitivity > SENSITIVITY_THRESHOLDS["LOW"]:
+         level = "LOW"
+     else:
+         level = "NONE"
+
+     # Check monotonicity
+     monotonic = _check_monotonicity(metrics)
+
+     # Best value
+     best_idx = int(np.argmax(metrics))
+     best_value = values[best_idx]
+
+     return {
+         "param": param_name,
+         "current_value": next((r["value"] for r in sweep_results if r.get("is_current")), None),
+         "sensitivity": round(float(sensitivity), 6),
+         "metric_range": round(float(metric_range), 6),
+         "metric_min": round(float(min(metrics)), 6),
+         "metric_max": round(float(max(metrics)), 6),
+         "level": level,
+         "best_value": best_value,
+         "monotonic": monotonic,
+     }
+
+
+ def _check_monotonicity(values: list[float]) -> str:
+     """Check if values are monotonically increasing, decreasing, or non-monotonic."""
+     if len(values) < 2:
+         return "unknown"
+
+     diffs = [values[i + 1] - values[i] for i in range(len(values) - 1)]
+     all_pos = all(d >= 0 for d in diffs)
+     all_neg = all(d <= 0 for d in diffs)
+
+     if all_pos:
+         return "increasing"
+     elif all_neg:
+         return "decreasing"
+     else:
+         return "non_monotonic"
+
+
+ def rank_sensitivities(sensitivities: list[dict]) -> list[dict]:
+     """Rank parameters by sensitivity (highest first)."""
+     return sorted(sensitivities, key=lambda s: s.get("sensitivity", 0), reverse=True)
+
+
+ # --- Recommendations ---
+
+
+ def generate_recommendations(ranked: list[dict]) -> list[str]:
+     """Generate tuning recommendations from sensitivity ranking."""
+     recs = []
+
+     high = [s for s in ranked if s["level"] == "HIGH"]
+     none = [s for s in ranked if s["level"] == "NONE"]
+
+     if high:
+         names = ", ".join(s["param"] for s in high)
+         recs.append(f"Focus tuning on {names}")
+
+     if none:
+         names = ", ".join(s["param"] for s in none)
+         recs.append(f"Stop tuning {names} — they don't matter for this model")
+
+     non_mono = [s for s in ranked if s.get("monotonic") == "non_monotonic" and s["level"] in ("HIGH", "MED")]
+     if non_mono:
+         for s in non_mono:
+             recs.append(f"{s['param']} has a non-monotonic relationship — there's an optimal sweet spot around {s.get('best_value')}")
+
+     return recs
+
+
+ # --- Full Pipeline ---
+
+
+ def sensitivity_analysis(
+     exp_id: str | None = None,
+     params: list[str] | None = None,
+     sweep_data: dict[str, list[dict]] | None = None,
+     config_path: str = "config.yaml",
+     log_path: str = DEFAULT_LOG_PATH,
+ ) -> dict:
+     """Run sensitivity analysis.
+
+     Args:
+         exp_id: Experiment ID to analyze.
+         params: Specific parameters to analyze.
+         sweep_data: Pre-computed sweep results {param: [{value, metric_value}]}.
+         config_path: Path to config.yaml.
+         log_path: Path to experiment log.
+
+     Returns:
+         Sensitivity analysis report.
+     """
+     config = load_config(config_path)
+     eval_cfg = config.get("evaluation", {})
+     primary_metric = eval_cfg.get("primary_metric", "accuracy")
+
+     sensitivities = []
+
+     if sweep_data:
+         # Analyze pre-computed sweep data
+         for param, results in sweep_data.items():
+             sens = compute_sensitivity(param, results, primary_metric)
+             sensitivities.append(sens)
+     else:
+         # Generate sweep plan (actual execution done by agent)
+         tunable = extract_tunable_params(config)
+         if params:
+             tunable = {k: v for k, v in tunable.items() if k in params}
+
+         if not tunable:
+             return {"error": "No tunable hyperparameters found in config"}
+
+         sweep_plans = {}
+         for param, value in tunable.items():
+             sweep_plans[param] = generate_sweep(param, value)
+
+         return {
+             "action": "plan",
+             "generated_at": datetime.now(timezone.utc).isoformat(),
+             "primary_metric": primary_metric,
+             "experiment_id": exp_id,
+             "sweep_plans": sweep_plans,
+             "n_experiments_needed": sum(len(s) for s in sweep_plans.values()),
+             "message": f"Sweep {len(sweep_plans)} parameters × {DEFAULT_N_POINTS} values each",
+         }
+
+     ranked = rank_sensitivities(sensitivities)
+     recommendations = generate_recommendations(ranked)
+
+     return {
+         "generated_at": datetime.now(timezone.utc).isoformat(),
+         "primary_metric": primary_metric,
+         "experiment_id": exp_id,
+         "sensitivities": ranked,
+         "recommendations": recommendations,
+     }
+
+
+ # --- Report Formatting ---
+
+
+ def save_sensitivity_report(report: dict, output_dir: str = "experiments/sensitivity") -> Path:
+     """Write the report as YAML to <output_dir>/<exp_id>-sensitivity.yaml."""
+     out_path = Path(output_dir)
+     out_path.mkdir(parents=True, exist_ok=True)
+     exp_id = report.get("experiment_id") or "unknown"
+     filepath = out_path / f"{exp_id}-sensitivity.yaml"
+     with open(filepath, "w") as f:
+         yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+     return filepath
+
+
+ def format_sensitivity_report(report: dict) -> str:
+     """Render a report dict as a Markdown summary."""
+     if "error" in report:
+         return f"ERROR: {report['error']}"
+
+     if report.get("action") == "plan":
+         plans = report.get("sweep_plans", {})
+         lines = ["# Sensitivity Analysis Plan", "",
+                  f"**{report.get('n_experiments_needed', 0)} experiments** needed for {len(plans)} parameters", ""]
+         for param, points in plans.items():
+             vals = ", ".join(str(p["value"]) for p in points)
+             lines.append(f"- **{param}:** [{vals}]")
+         return "\n".join(lines)
+
+     metric = report.get("primary_metric", "metric")
+     exp_id = report.get("experiment_id", "?")
+
+     lines = [f"# Hyperparameter Sensitivity Analysis ({exp_id})", "",
+              f"*Generated {report.get('generated_at', 'N/A')[:19]}*", "",
+              f"| Parameter | Current | Range Tested | {metric} Range | Sensitivity |",
+              "|-----------|---------|-------------|----------------|-------------|"]
+
+     for s in report.get("sensitivities", []):
+         current = s.get("current_value")
+         if current is None:
+             current = "?"
+         metric_range = f"{s['metric_min']:.4f}–{s['metric_max']:.4f}" if s.get("metric_min") is not None else "N/A"
+         sens = f"{s['level']} ({s['sensitivity']:.4f})"
+         lines.append(f"| {s['param']} | {current} | — | {metric_range} | {sens} |")
+
+     recs = report.get("recommendations", [])
+     if recs:
+         lines.extend(["", "## Recommendations", ""])
+         for r in recs:
+             lines.append(f"- {r}")
+
+     return "\n".join(lines)
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(description="Hyperparameter sensitivity analysis")
+     parser.add_argument("exp_id", nargs="?", help="Experiment ID")
+     parser.add_argument("--params", help="Comma-separated parameter names")
+     parser.add_argument("--config", default="config.yaml")
+     parser.add_argument("--log", default=DEFAULT_LOG_PATH)
+     parser.add_argument("--json", action="store_true")
+     args = parser.parse_args()
+
+     params = [p.strip() for p in args.params.split(",")] if args.params else None
+
+     report = sensitivity_analysis(
+         exp_id=args.exp_id, params=params,
+         config_path=args.config, log_path=args.log,
+     )
+
+     if "error" not in report:
+         filepath = save_sensitivity_report(report)
+         print(f"Saved to {filepath}", file=sys.stderr)
+
+     if args.json:
+         print(json.dumps(report, indent=2, default=str))
+     else:
+         print(format_sensitivity_report(report))
+
+
+ if __name__ == "__main__":
+     main()
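
For orientation, the analysis branch of the new script consumes pre-computed sweep results rather than training models itself. The sketch below shows one plausible way to drive it, assuming the file ships as scripts/sensitivity_analysis.py (as its usage strings suggest) and that a config.yaml readable by scripts.turing_io.load_config is present; the parameter names and metric values are hypothetical illustrations, not data from the package.

# Hypothetical sweep results: one completed training run per multiplier,
# with the primary metric recorded for each run (all numbers invented).
from scripts.sensitivity_analysis import format_sensitivity_report, sensitivity_analysis

sweep_data = {
    "learning_rate": [
        {"value": 0.05, "multiplier": 0.5, "metric_value": 0.81},
        {"value": 0.10, "multiplier": 1.0, "is_current": True, "metric_value": 0.84},
        {"value": 0.20, "multiplier": 2.0, "metric_value": 0.79},
    ],
    "max_depth": [
        {"value": 3, "multiplier": 0.5, "metric_value": 0.835},
        {"value": 6, "multiplier": 1.0, "is_current": True, "metric_value": 0.840},
        {"value": 12, "multiplier": 2.0, "metric_value": 0.838},
    ],
}

# Scores each parameter from the supplied runs and prints the ranked table.
report = sensitivity_analysis(exp_id="exp-042", sweep_data=sweep_data)
print(format_sensitivity_report(report))

Called without sweep_data, the same entry point returns a sweep plan instead of scores, matching the CLI behavior in main().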