claude-turing 3.1.0 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +7 -2
- package/commands/calibrate.md +47 -0
- package/commands/curriculum.md +43 -0
- package/commands/feature.md +42 -0
- package/commands/sensitivity.md +41 -0
- package/commands/turing.md +10 -0
- package/commands/xray.md +43 -0
- package/package.json +1 -1
- package/src/install.js +2 -0
- package/src/verify.js +5 -0
- package/templates/scripts/__pycache__/calibration.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/curriculum_optimizer.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/feature_intelligence.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/model_xray.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sensitivity_analysis.cpython-314.pyc +0 -0
- package/templates/scripts/calibration.py +364 -0
- package/templates/scripts/curriculum_optimizer.py +337 -0
- package/templates/scripts/feature_intelligence.py +369 -0
- package/templates/scripts/model_xray.py +317 -0
- package/templates/scripts/scaffold.py +10 -0
- package/templates/scripts/sensitivity_analysis.py +335 -0
|
@@ -121,6 +121,11 @@ TEMPLATE_DIRS = {
|
|
|
121
121
|
"sanity_checks.py",
|
|
122
122
|
"generate_baselines.py",
|
|
123
123
|
"leakage_detector.py",
|
|
124
|
+
"model_xray.py",
|
|
125
|
+
"sensitivity_analysis.py",
|
|
126
|
+
"calibration.py",
|
|
127
|
+
"feature_intelligence.py",
|
|
128
|
+
"curriculum_optimizer.py",
|
|
124
129
|
],
|
|
125
130
|
"tests": ["__init__.py", "conftest.py"],
|
|
126
131
|
}
|
|
@@ -154,6 +159,11 @@ DIRECTORIES_TO_CREATE = [
|
|
|
154
159
|
"experiments/sanity",
|
|
155
160
|
"experiments/baselines",
|
|
156
161
|
"experiments/leakage",
|
|
162
|
+
"experiments/xrays",
|
|
163
|
+
"experiments/sensitivity",
|
|
164
|
+
"experiments/calibration",
|
|
165
|
+
"experiments/features",
|
|
166
|
+
"experiments/curriculum",
|
|
157
167
|
"experiments/logs",
|
|
158
168
|
"models/best",
|
|
159
169
|
"models/archive",
|
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Hyperparameter sensitivity analysis for the autoresearch pipeline.
|
|
3
|
+
|
|
4
|
+
Varies each hyperparameter individually while holding others fixed,
|
|
5
|
+
measures the metric response, and ranks hyperparameters by sensitivity.
|
|
6
|
+
Answers "which hyperparameters actually matter?"
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python scripts/sensitivity_analysis.py exp-042
|
|
10
|
+
python scripts/sensitivity_analysis.py --params "learning_rate,max_depth"
|
|
11
|
+
python scripts/sensitivity_analysis.py --json
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import math
|
|
19
|
+
import sys
|
|
20
|
+
from datetime import datetime, timezone
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
import numpy as np
|
|
24
|
+
import yaml
|
|
25
|
+
|
|
26
|
+
from scripts.turing_io import load_config, load_experiments
|
|
27
|
+
|
|
28
|
+
# Default experiment log location (a .jsonl file — presumably one record per run; confirm against load_experiments).
DEFAULT_LOG_PATH = "experiments/log.jsonl"
# Number of sweep points per hyperparameter; slices DEFAULT_MULTIPLIERS in generate_sweep.
DEFAULT_N_POINTS = 5
# Normalized metric-range thresholds used by compute_sensitivity to classify levels.
SENSITIVITY_THRESHOLDS = {"HIGH": 0.02, "MED": 0.005, "LOW": 0.002}
# Each hyperparameter is swept at these multiples of its current value.
DEFAULT_MULTIPLIERS = [0.5, 0.75, 1.0, 1.5, 2.0]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# --- Sweep Generation ---
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def generate_sweep(
    param_name: str,
    current_value: float | int,
    n_points: int = DEFAULT_N_POINTS,
    multipliers: list[float] | None = None,
) -> list[dict]:
    """Build the sweep points for one hyperparameter.

    Each point is a dict with keys ``value``, ``multiplier`` and
    ``is_current`` (True only at the point matching the current setting).
    """
    chosen = DEFAULT_MULTIPLIERS[:n_points] if multipliers is None else multipliers

    def _scaled(mult: float):
        # Integer hyperparameters stay integers and never drop below 1.
        if isinstance(current_value, int):
            return max(1, int(current_value * mult))
        return current_value * mult

    return [
        {
            "value": _scaled(mult),
            "multiplier": round(mult, 2),
            "is_current": abs(mult - 1.0) < 0.01,
        }
        for mult in chosen
    ]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def extract_tunable_params(config: dict) -> dict:
    """Return the numeric, tunable hyperparameters from ``config``.

    Reads ``config["model"]["hyperparams"]`` and keeps entries whose value
    is an int or float.  Booleans are excluded explicitly: ``bool`` is a
    subclass of ``int``, so a bare isinstance check would sweep flags such
    as ``use_bagging: true`` as if they were numeric knobs.  Bookkeeping
    keys (seed, random_state, verbose) are never tunable.
    """
    hyperparams = config.get("model", {}).get("hyperparams", {})

    tunable = {}
    for key, val in hyperparams.items():
        if isinstance(val, bool):
            continue  # on/off flags are not sweepable quantities
        if isinstance(val, (int, float)) and key not in ("seed", "random_state", "verbose"):
            tunable[key] = val

    return tunable
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# --- Sensitivity Scoring ---
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def compute_sensitivity(
    param_name: str,
    sweep_results: list[dict],
    primary_metric: str,
) -> dict:
    """Compute sensitivity score for a hyperparameter.

    Args:
        param_name: Hyperparameter name.
        sweep_results: List of {value, metric_value} dicts.
        primary_metric: Name of the primary metric (kept for interface
            compatibility; not used in the computation itself).

    Returns:
        Sensitivity dict with score, level, range, best value, monotonicity.
    """
    if not sweep_results or len(sweep_results) < 2:
        return {"param": param_name, "sensitivity": 0, "level": "NONE",
                "reason": "Insufficient sweep data"}

    # Keep value/metric pairs aligned.  Previously `values` kept every sweep
    # point while `metrics` dropped the None ones, so argmax over `metrics`
    # could index the wrong entry of `values` and report a wrong best_value.
    pairs = [(r.get("value"), r["metric_value"])
             for r in sweep_results if r.get("metric_value") is not None]

    if len(pairs) < 2:
        return {"param": param_name, "sensitivity": 0, "level": "NONE",
                "reason": "Insufficient metric data"}

    values = [v for v, _ in pairs]
    metrics = [m for _, m in pairs]

    metric_range = max(metrics) - min(metrics)
    metric_mean = np.mean(metrics)

    # Normalize by the mean magnitude so scores compare across metrics.
    sensitivity = metric_range / abs(metric_mean) if metric_mean != 0 else metric_range

    # Classify level against the fixed thresholds.
    if sensitivity > SENSITIVITY_THRESHOLDS["HIGH"]:
        level = "HIGH"
    elif sensitivity > SENSITIVITY_THRESHOLDS["MED"]:
        level = "MED"
    elif sensitivity > SENSITIVITY_THRESHOLDS["LOW"]:
        level = "LOW"
    else:
        level = "NONE"

    # Monotonicity of the metric response across the sweep.
    monotonic = _check_monotonicity(metrics)

    # Best value: the swept value whose (non-None) metric is highest.
    best_idx = int(np.argmax(metrics))
    best_value = values[best_idx]

    return {
        "param": param_name,
        "current_value": next((r["value"] for r in sweep_results if r.get("is_current")), None),
        "sensitivity": round(float(sensitivity), 6),
        "metric_range": round(float(metric_range), 6),
        "metric_min": round(float(min(metrics)), 6),
        "metric_max": round(float(max(metrics)), 6),
        "level": level,
        "best_value": best_value,
        "monotonic": monotonic,
    }
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _check_monotonicity(values: list[float]) -> str:
|
|
144
|
+
"""Check if values are monotonically increasing, decreasing, or non-monotonic."""
|
|
145
|
+
if len(values) < 2:
|
|
146
|
+
return "unknown"
|
|
147
|
+
|
|
148
|
+
diffs = [values[i + 1] - values[i] for i in range(len(values) - 1)]
|
|
149
|
+
all_pos = all(d >= 0 for d in diffs)
|
|
150
|
+
all_neg = all(d <= 0 for d in diffs)
|
|
151
|
+
|
|
152
|
+
if all_pos:
|
|
153
|
+
return "increasing"
|
|
154
|
+
elif all_neg:
|
|
155
|
+
return "decreasing"
|
|
156
|
+
else:
|
|
157
|
+
return "non_monotonic"
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def rank_sensitivities(sensitivities: list[dict]) -> list[dict]:
    """Return a new list of sensitivity records, most sensitive first."""
    def _score(record: dict) -> float:
        # Records without a score sort as zero.
        return record.get("sensitivity", 0)

    ranked = list(sensitivities)
    ranked.sort(key=_score, reverse=True)
    return ranked
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# --- Recommendations ---
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def generate_recommendations(ranked: list[dict]) -> list[str]:
    """Turn a sensitivity ranking into human-readable tuning advice."""
    advice: list[str] = []

    # Bucket records once by level.
    by_level: dict[str, list[dict]] = {}
    for entry in ranked:
        by_level.setdefault(entry["level"], []).append(entry)

    important = by_level.get("HIGH", [])
    if important:
        joined = ", ".join(entry["param"] for entry in important)
        advice.append(f"Focus tuning on {joined}")

    irrelevant = by_level.get("NONE", [])
    if irrelevant:
        joined = ", ".join(entry["param"] for entry in irrelevant)
        advice.append(f"Stop tuning {joined} — they don't matter for this model")

    # Non-monotonic responses among the impactful params imply an interior optimum.
    for entry in ranked:
        if entry["level"] in ("HIGH", "MED") and entry.get("monotonic") == "non_monotonic":
            advice.append(f"{entry['param']} has a non-monotonic relationship — there's an optimal sweet spot around {entry.get('best_value')}")

    return advice
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# --- Full Pipeline ---
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def sensitivity_analysis(
    exp_id: str | None = None,
    params: list[str] | None = None,
    sweep_data: dict[str, list[dict]] | None = None,
    config_path: str = "config.yaml",
    log_path: str = DEFAULT_LOG_PATH,
) -> dict:
    """Run sensitivity analysis.

    Without ``sweep_data`` this only *plans* the sweep (execution is left to
    the agent); with ``sweep_data`` it scores, ranks and summarizes results.

    Args:
        exp_id: Experiment ID to analyze.
        params: Restrict the analysis to these parameter names.
        sweep_data: Pre-computed sweep results {param: [{value, metric_value}]}.
        config_path: Path to config.yaml.
        log_path: Path to the experiment log (not read in this function).

    Returns:
        Either a sweep plan or a full sensitivity report.
    """
    config = load_config(config_path)
    primary_metric = config.get("evaluation", {}).get("primary_metric", "accuracy")

    if not sweep_data:
        # Planning mode: propose sweep values; the agent runs the experiments.
        tunable = extract_tunable_params(config)
        if params:
            tunable = {name: val for name, val in tunable.items() if name in params}

        if not tunable:
            return {"error": "No tunable hyperparameters found in config"}

        sweep_plans = {name: generate_sweep(name, val) for name, val in tunable.items()}

        return {
            "action": "plan",
            "generated_at": datetime.now(timezone.utc).isoformat(),
            "primary_metric": primary_metric,
            "experiment_id": exp_id,
            "sweep_plans": sweep_plans,
            "n_experiments_needed": sum(len(points) for points in sweep_plans.values()),
            "message": f"Sweep {len(sweep_plans)} parameters × {DEFAULT_N_POINTS} values each",
        }

    # Analysis mode: score each swept parameter, then rank and summarize.
    ranked = rank_sensitivities([
        compute_sensitivity(name, results, primary_metric)
        for name, results in sweep_data.items()
    ])

    return {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "primary_metric": primary_metric,
        "experiment_id": exp_id,
        "sensitivities": ranked,
        "recommendations": generate_recommendations(ranked),
    }
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
# --- Report Formatting ---
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def save_sensitivity_report(report: dict, output_dir: str = "experiments/sensitivity") -> Path:
    """Write ``report`` as YAML under ``output_dir`` and return the file path."""
    target = Path(output_dir)
    target.mkdir(parents=True, exist_ok=True)

    report_file = target / f"{report.get('experiment_id', 'unknown')}-sensitivity.yaml"
    with report_file.open("w") as handle:
        yaml.dump(report, handle, default_flow_style=False, sort_keys=False)
    return report_file
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def format_sensitivity_report(report: dict) -> str:
    """Render a sensitivity report (or a sweep plan) as Markdown."""
    if "error" in report:
        return f"ERROR: {report['error']}"

    if report.get("action") == "plan":
        plans = report.get("sweep_plans", {})
        out = [
            "# Sensitivity Analysis Plan",
            "",
            f"**{report.get('n_experiments_needed', 0)} experiments** needed for {len(plans)} parameters",
            "",
        ]
        out.extend(
            f"- **{param}:** [{', '.join(str(point['value']) for point in points)}]"
            for param, points in plans.items()
        )
        return "\n".join(out)

    metric = report.get("primary_metric", "metric")
    out = [
        f"# Hyperparameter Sensitivity Analysis ({report.get('experiment_id', '?')})",
        "",
        f"*Generated {report.get('generated_at', 'N/A')[:19]}*",
        "",
        f"| Parameter | Current | Range Tested | {metric} Range | Sensitivity |",
        "|-----------|---------|-------------|----------------|-------------|",
    ]

    for entry in report.get("sensitivities", []):
        # Insufficient-data records have no metric_min/metric_max keys.
        if entry.get("metric_min") is not None:
            observed = f"{entry['metric_min']:.4f}–{entry['metric_max']:.4f}"
        else:
            observed = "N/A"
        score = f"{entry['level']} ({entry['sensitivity']:.4f})"
        out.append(f"| {entry['param']} | {entry.get('current_value', '?')} | — | {observed} | {score} |")

    recommendations = report.get("recommendations", [])
    if recommendations:
        out.extend(["", "## Recommendations", ""])
        out.extend(f"- {rec}" for rec in recommendations)

    return "\n".join(out)
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def main() -> None:
    """CLI entry point: run the analysis and print Markdown (or JSON)."""
    parser = argparse.ArgumentParser(description="Hyperparameter sensitivity analysis")
    parser.add_argument("exp_id", nargs="?", help="Experiment ID")
    parser.add_argument("--params", help="Comma-separated parameter names")
    parser.add_argument("--config", default="config.yaml")
    parser.add_argument("--log", default=DEFAULT_LOG_PATH)
    parser.add_argument("--json", action="store_true")
    args = parser.parse_args()

    # "--params a, b" -> ["a", "b"]; None means analyze every tunable param.
    param_names = None
    if args.params:
        param_names = [name.strip() for name in args.params.split(",")]

    report = sensitivity_analysis(
        exp_id=args.exp_id,
        params=param_names,
        config_path=args.config,
        log_path=args.log,
    )

    if "error" not in report:
        saved_to = save_sensitivity_report(report)
        print(f"Saved to {saved_to}", file=sys.stderr)

    if args.json:
        print(json.dumps(report, indent=2, default=str))
    else:
        print(format_sensitivity_report(report))
|