claude-turing 4.0.0 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,463 @@
+ #!/usr/bin/env python3
+ """Experiment outcome simulator for the autoresearch pipeline.
+
+ Predicts experiment outcomes from prior data using a surrogate model.
+ Pre-filters experiment configs to save budget — only run the ones
+ predicted to beat the current best.
+
+ Usage:
+     python scripts/experiment_simulator.py --configs configs.yaml
+     python scripts/experiment_simulator.py --top-k 5
+     python scripts/experiment_simulator.py --threshold 0.001
+     python scripts/experiment_simulator.py --json
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import sys
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+ import numpy as np
+ import yaml
+
+ from scripts.turing_io import load_config, load_experiments
+
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
+ DEFAULT_TOP_K = 5
+ DEFAULT_IMPROVEMENT_THRESHOLD = 0.0
+ NOVELTY_PENALTY_FACTOR = 0.1
+ MIN_HISTORY_FOR_SURROGATE = 5
+
+
+ # --- Feature Extraction ---
+
+
+ def extract_config_features(config: dict) -> dict[str, float]:
+     """Extract numeric features from an experiment config.
+
+     Flattens nested config into a flat dict of numeric values.
+     """
+     features = {}
+     _flatten(config, features, prefix="")
+     return features
+
+
+ def _flatten(obj: dict, out: dict, prefix: str) -> None:
+     """Recursively flatten a dict, keeping only numeric values."""
+     for key, val in obj.items():
+         full_key = f"{prefix}{key}" if not prefix else f"{prefix}.{key}"
+         if isinstance(val, (int, float)) and not isinstance(val, bool):
+             out[full_key] = float(val)
+         elif isinstance(val, dict):
+             _flatten(val, out, full_key)
+
+
+ def experiment_to_features(exp: dict) -> dict[str, float]:
+     """Extract feature vector from an experiment log entry."""
+     features = {}
+
+     # Extract from config
+     config = exp.get("config", {})
+     if isinstance(config, dict):
+         hyperparams = config.get("hyperparams", config.get("model", {}).get("hyperparams", {}))
+         if isinstance(hyperparams, dict):
+             for k, v in hyperparams.items():
+                 if isinstance(v, (int, float)) and not isinstance(v, bool):
+                     features[k] = float(v)
+
+     # Also check top-level numeric fields
+     for key in ("learning_rate", "lr", "max_depth", "depth", "n_estimators",
+                 "epochs", "batch_size", "hidden_size", "dropout"):
+         val = exp.get(key)
+         if val is None and isinstance(config, dict):
+             val = config.get(key)
+         if isinstance(val, (int, float)) and not isinstance(val, bool):
+             features[key] = float(val)
+
+     return features
+
+
+ # --- Surrogate Model ---
+
+
+ def build_surrogate(
+     experiments: list[dict],
+     primary_metric: str,
+ ) -> dict:
+     """Build a simple surrogate model from experiment history.
+
+     Uses a weighted k-NN approach: for a new config, predict the metric
+     as a weighted average of the k nearest experiments in config space.
+
+     Returns:
+         Surrogate model dict with training data and feature info.
+     """
+     data_points = []
+     for exp in experiments:
+         metric = exp.get("metrics", {}).get(primary_metric)
+         if metric is None:
+             continue
+         features = experiment_to_features(exp)
+         if features:
+             data_points.append({"features": features, "metric": metric})
+
+     if len(data_points) < MIN_HISTORY_FOR_SURROGATE:
+         return {
+             "status": "insufficient",
+             "n_points": len(data_points),
+             "min_required": MIN_HISTORY_FOR_SURROGATE,
+         }
+
+     # Collect all feature names
+     all_features = set()
+     for dp in data_points:
+         all_features.update(dp["features"].keys())
+
+     return {
+         "status": "ready",
+         "data_points": data_points,
+         "feature_names": sorted(all_features),
+         "n_points": len(data_points),
+     }
+
+
+ def predict_with_surrogate(
+     surrogate: dict,
+     config_features: dict[str, float],
+     k: int = 3,
+ ) -> dict:
+     """Predict metric for a config using weighted k-NN surrogate.
+
+     Args:
+         surrogate: Built surrogate model.
+         config_features: Feature dict for the config to predict.
+         k: Number of nearest neighbors.
+
+     Returns:
+         Prediction dict with predicted metric and uncertainty.
+     """
+     if surrogate.get("status") != "ready":
+         return {"error": "Surrogate not ready", "predicted": None, "uncertainty": None}
+
+     data_points = surrogate["data_points"]
+     feature_names = surrogate["feature_names"]
+
+     # Compute distances
+     distances = []
+     for dp in data_points:
+         dist = _config_distance(config_features, dp["features"], feature_names)
+         distances.append((dist, dp["metric"]))
+
+     distances.sort(key=lambda x: x[0])
+     neighbors = distances[:k]
+
+     if not neighbors:
+         return {"error": "No neighbors found", "predicted": None, "uncertainty": None}
+
+     # Weighted average (inverse distance weighting)
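+     # In the general case each neighbor i gets weight w_i = 1 / (d_i + 1e-6) and the
+     # prediction is the weighted mean sum(w_i * metric_i) / sum(w_i); degenerate cases
+     # (all-zero or all-infinite distances) fall back to a plain mean of the neighbors.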
+     metrics = [m for _, m in neighbors]
+     dists = [d for d, _ in neighbors]
+
+     if all(d == 0 for d in dists):
+         predicted = np.mean(metrics)
+         uncertainty = 0.0
+     elif all(d == float("inf") for d in dists):
+         predicted = np.mean(metrics)
+         uncertainty = float(np.std(metrics)) if len(metrics) > 1 else 0.0
+     else:
+         weights = [1.0 / (d + 1e-6) for d in dists if d != float("inf")]
+         valid_metrics = [m for d, m in zip(dists, metrics) if d != float("inf")]
+         if not weights:
+             predicted = np.mean(metrics)
+             uncertainty = float(np.std(metrics)) if len(metrics) > 1 else 0.0
+         else:
+             total_weight = sum(weights)
+             predicted = sum(w * m for w, m in zip(weights, valid_metrics)) / total_weight
+             uncertainty = float(np.std(valid_metrics)) if len(valid_metrics) > 1 else 0.0
+
+     # Novelty penalty: discount if far from training distribution
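+     # novelty = nearest-neighbor distance relative to the average distance across all
+     # training points; the penalty is subtracted from the raw prediction, which is a
+     # conservative adjustment when higher metric values are better.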
+     finite_dists = [d for d, _ in distances if d != float("inf")]
+     min_dist = min(finite_dists) if finite_dists else float("inf")
+     avg_dist = float(np.mean(finite_dists)) if finite_dists else 1.0
+     novelty = min_dist / avg_dist if avg_dist > 0 and min_dist != float("inf") else 1.0
+     novelty_penalty = novelty * NOVELTY_PENALTY_FACTOR
+
+     return {
+         "predicted": round(float(predicted - novelty_penalty), 6),
+         "uncertainty": round(float(uncertainty), 6),
+         "novelty_score": round(float(novelty), 4),
+         "n_neighbors": len(neighbors),
+         "nearest_distance": round(float(min_dist), 4),
+     }
+
+
+ def _config_distance(
+     config_a: dict[str, float],
+     config_b: dict[str, float],
+     feature_names: list[str],
+ ) -> float:
+     """Compute normalized distance between two configs."""
+     total = 0.0
+     n = 0
+     for feat in feature_names:
+         a = config_a.get(feat)
+         b = config_b.get(feat)
+         if a is not None and b is not None:
+             # Normalize by max(|a|, |b|, 1) to handle different scales
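+             # Each shared feature contributes one squared relative difference; the final
+             # distance is the RMS over shared features (inf when none are shared).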
+             scale = max(abs(a), abs(b), 1.0)
+             total += ((a - b) / scale) ** 2
+             n += 1
+
+     if n == 0:
+         return float("inf")
+     return float(np.sqrt(total / n))
+
+
+ # --- Simulation Pipeline ---
+
+
+ def simulate_experiments(
+     proposed_configs: list[dict],
+     experiments: list[dict],
+     primary_metric: str,
+     top_k: int = DEFAULT_TOP_K,
+     improvement_threshold: float = DEFAULT_IMPROVEMENT_THRESHOLD,
+     lower_is_better: bool = False,
+ ) -> dict:
+     """Simulate proposed experiments and rank by predicted outcome.
+
+     Args:
+         proposed_configs: List of experiment configs to simulate.
+         experiments: Historical experiment data.
+         primary_metric: Metric to predict.
+         top_k: Number of top configs to recommend running.
+         improvement_threshold: Minimum predicted improvement over current best.
+         lower_is_better: Whether lower metric is better.
+
+     Returns:
+         Simulation report with ranked configs and budget savings.
+     """
+     if not proposed_configs:
+         return {"error": "No proposed configs to simulate"}
+
+     surrogate = build_surrogate(experiments, primary_metric)
+     if surrogate.get("status") != "ready":
+         return {
+             "error": f"Insufficient experiment history ({surrogate.get('n_points', 0)} experiments, "
+                      f"need {MIN_HISTORY_FOR_SURROGATE})",
+             "suggestion": "Run more experiments first to build a reliable surrogate model.",
+         }
+
+     # Get current best
+     best_metrics = [
+         exp.get("metrics", {}).get(primary_metric)
+         for exp in experiments
+         if exp.get("metrics", {}).get(primary_metric) is not None
+     ]
+     if lower_is_better:
+         current_best = min(best_metrics) if best_metrics else float("inf")
+     else:
+         current_best = max(best_metrics) if best_metrics else 0
+
+     # Predict each config
+     predictions = []
+     for i, config in enumerate(proposed_configs):
+         features = extract_config_features(config)
+         pred = predict_with_surrogate(surrogate, features)
+         predicted = pred.get("predicted")
+         uncertainty = pred.get("uncertainty", 0)
+
+         if predicted is not None:
+             if lower_is_better:
+                 improvement = current_best - predicted
+             else:
+                 improvement = predicted - current_best
+
+             # Classify uncertainty
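+             # "uncertainty" is the std-dev of the neighbor metrics returned by the
+             # surrogate; the cutoffs below are absolute values on the metric's own scale.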
+             if uncertainty < 0.005:
+                 unc_level = "LOW"
+             elif uncertainty < 0.015:
+                 unc_level = "MED"
+             else:
+                 unc_level = "HIGH"
+
+             verdict = "RUN" if improvement > improvement_threshold else "SKIP"
+
+             predictions.append({
+                 "rank": 0,  # filled later
+                 "config_index": i,
+                 "config_summary": _summarize_config(config),
+                 "predicted_metric": predicted,
+                 "uncertainty": uncertainty,
+                 "uncertainty_level": unc_level,
+                 "improvement": round(improvement, 6),
+                 "verdict": verdict,
+                 "novelty_score": pred.get("novelty_score", 0),
+             })
+
+     # Sort by predicted metric
+     predictions.sort(
+         key=lambda p: p["predicted_metric"],
+         reverse=not lower_is_better,
+     )
+
+     # Assign ranks
+     for i, p in enumerate(predictions):
+         p["rank"] = i + 1
+
+     # Apply top-k
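+     # Keep only the best top_k RUN candidates; every other prediction (including RUN
+     # verdicts beyond top_k) is demoted to SKIP below.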
+     run_configs = [p for p in predictions if p["verdict"] == "RUN"][:top_k]
+     skip_configs = [p for p in predictions if p not in run_configs]
+
+     # Mark skipped
+     for p in skip_configs:
+         p["verdict"] = "SKIP"
+
+     total = len(predictions)
+     n_run = len(run_configs)
+     n_skip = total - n_run
+     savings = round(n_skip / total * 100, 1) if total > 0 else 0
+
+     return {
+         "current_best": current_best,
+         "primary_metric": primary_metric,
+         "total_proposed": total,
+         "run_count": n_run,
+         "skip_count": n_skip,
+         "budget_savings_pct": savings,
+         "predictions": predictions,
+         "surrogate_info": {
+             "n_training_points": surrogate["n_points"],
+             "n_features": len(surrogate["feature_names"]),
+         },
+         "generated_at": datetime.now(timezone.utc).isoformat(),
+     }
+
+
+ def _summarize_config(config: dict, max_items: int = 4) -> str:
+     """Create a short summary of a config dict."""
+     features = extract_config_features(config)
+     items = list(features.items())[:max_items]
+     parts = [f"{k}={v}" for k, v in items]
+     if len(features) > max_items:
+         parts.append("...")
+     return ", ".join(parts) if parts else "(empty config)"
+
+
+ # --- Report Formatting ---
+
+
+ def save_simulation_report(report: dict, output_dir: str = "experiments/simulations") -> Path:
+     """Save simulation report to YAML."""
+     out_path = Path(output_dir)
+     out_path.mkdir(parents=True, exist_ok=True)
+     ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
+     filepath = out_path / f"simulation-{ts}.yaml"
+     with open(filepath, "w") as f:
+         yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+     return filepath
+
+
+ def format_simulation_report(report: dict) -> str:
+     """Format simulation report as readable markdown."""
+     if "error" in report:
+         lines = [f"ERROR: {report['error']}"]
+         if "suggestion" in report:
+             lines.append(f"\n{report['suggestion']}")
+         return "\n".join(lines)
+
+     lines = ["# Experiment Simulation", ""]
+     lines.append(f"**Current best:** {report.get('current_best', 'N/A')}")
+     lines.append(f"**Proposed configs:** {report.get('total_proposed', 0)}")
+     lines.append(f"**Recommended to run:** {report.get('run_count', 0)}")
+     lines.append(f"**Budget savings:** {report.get('budget_savings_pct', 0)}%")
+     lines.append("")
+
+     predictions = report.get("predictions", [])
+     if predictions:
+         lines.append("| Rank | Config Summary | Predicted | Uncertainty | Verdict |")
+         lines.append("|------|---------------|-----------|-------------|---------|")
+         for p in predictions:
+             unc = f"{p['predicted_metric']:.4f} \u00b1 {p['uncertainty']:.4f}"
+             lines.append(
+                 f"| {p['rank']} | {p['config_summary'][:40]} | {unc} "
+                 f"| {p['uncertainty_level']} | {p['verdict']} {'✓' if p['verdict'] == 'RUN' else '✗'} |"
+             )
+
+     lines.append("")
+     rec_run = report.get("run_count", 0)
+     rec_skip = report.get("skip_count", 0)
+     lines.append(
+         f"**Recommendation:** Run top {rec_run}, skip {rec_skip}. "
+         f"Estimated budget savings: {report.get('budget_savings_pct', 0)}%."
+     )
+
+     lines.append("")
+     lines.append(f"*Generated: {report.get('generated_at', 'N/A')}*")
+     return "\n".join(lines)
+
+
+ # --- CLI ---
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Experiment outcome simulator — predict results before running"
+     )
+     parser.add_argument("--configs", help="YAML file with proposed experiment configs")
+     parser.add_argument("--top-k", type=int, default=DEFAULT_TOP_K,
+                         help="Number of top configs to recommend")
+     parser.add_argument("--threshold", type=float, default=DEFAULT_IMPROVEMENT_THRESHOLD,
+                         help="Minimum predicted improvement to recommend running")
+     parser.add_argument("--config", default="config.yaml", help="Path to project config.yaml")
+     parser.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
+     parser.add_argument("--json", action="store_true", help="Output raw JSON")
+
+     args = parser.parse_args()
+
+     config = load_config(args.config)
+     eval_cfg = config.get("evaluation", {})
+     primary_metric = eval_cfg.get("primary_metric", "accuracy")
+     lower_is_better = eval_cfg.get("lower_is_better", False)
+
+     experiments = load_experiments(args.log)
+
+     # Load proposed configs
+     proposed = []
+     if args.configs:
+         with open(args.configs) as f:
+             data = yaml.safe_load(f)
+         if isinstance(data, list):
+             proposed = data
+         elif isinstance(data, dict) and "configs" in data:
+             proposed = data["configs"]
+         elif isinstance(data, dict):
+             proposed = [data]
+
+     if not proposed:
+         print("No proposed configs provided. Use --configs <file.yaml>")
+         sys.exit(1)
+
+     report = simulate_experiments(
+         proposed_configs=proposed,
+         experiments=experiments,
+         primary_metric=primary_metric,
+         top_k=args.top_k,
+         improvement_threshold=args.threshold,
+         lower_is_better=lower_is_better,
+     )
+
+     if args.json:
+         print(json.dumps(report, indent=2))
+     else:
+         print(format_simulation_report(report))
+
+     if "error" not in report:
+         saved = save_simulation_report(report)
+         if not args.json:
+             print(f"\nSaved: {saved}")
+
+
+ if __name__ == "__main__":
+     main()
@@ -371,6 +371,39 @@ def load_audit_report(audit_dir: str = "experiments/audits") -> dict | None:
      return None


+ def load_whatif_results(whatif_dir: str = "experiments/whatif") -> list[dict]:
+     """Load recent what-if analysis results."""
+     path = Path(whatif_dir)
+     if not path.exists():
+         return []
+     results = []
+     for f in sorted(path.glob("whatif-*.yaml"))[-5:]:  # Last 5
+         try:
+             with open(f) as fh:
+                 data = yaml.safe_load(fh)
+             if isinstance(data, dict):
+                 results.append(data)
+         except (yaml.YAMLError, OSError):
+             continue
+     return results
+
+
+ def load_simulation_results(sim_dir: str = "experiments/simulations") -> dict | None:
+     """Load the most recent simulation result."""
+     path = Path(sim_dir)
+     if not path.exists():
+         return None
+     files = sorted(path.glob("simulation-*.yaml"))
+     if not files:
+         return None
+     try:
+         with open(files[-1]) as f:
+             data = yaml.safe_load(f)
+         return data if isinstance(data, dict) else None
+     except (yaml.YAMLError, OSError):
+         return None
+
+
  def format_brief(
      campaign: dict,
      best: dict | None,
@@ -393,6 +426,8 @@ def format_brief(
      budget_status: dict | None = None,
      scaling_results: list[dict] | None = None,
      audit_report: dict | None = None,
+     whatif_results: list[dict] | None = None,
+     simulation_result: dict | None = None,
  ) -> str:
      """Format the research briefing as markdown."""
      direction = "lower" if lower_is_better else "higher"
@@ -698,6 +733,31 @@ def format_brief(
          total = len(regression_checks)
          lines.append(f"\n*{passed}/{total} regression checks passed.*")

+     # What-If & Simulation section
+     if whatif_results or simulation_result:
+         lines.extend(["", "## What-If Analysis & Simulation", ""])
+
+         if whatif_results:
+             lines.append(f"**Recent what-if queries:** {len(whatif_results)}")
+             for wf in whatif_results[-3:]:
+                 q = wf.get("question", "N/A")
+                 route = wf.get("route", "?")
+                 result = wf.get("result", {})
+                 est = result.get("estimate")
+                 conf = result.get("confidence", "?")
+                 if est is not None:
+                     lines.append(f"- *\"{q}\"* → {est} ({conf} confidence) [{route}]")
+                 elif "error" in result:
+                     lines.append(f"- *\"{q}\"* → {result['error']} [{route}]")
+             lines.append("")
+
+         if simulation_result and "error" not in simulation_result:
+             run = simulation_result.get("run_count", 0)
+             skip = simulation_result.get("skip_count", 0)
+             savings = simulation_result.get("budget_savings_pct", 0)
+             lines.append(f"**Last simulation:** {run} configs recommended, {skip} skipped ({savings}% budget savings)")
+             lines.append("")
+
      lines.extend([
          "",
          "## Recommendations",
@@ -768,6 +828,8 @@ def generate_brief(
      budget_status = load_budget_status(log_path=log_path)
      scaling_results = load_scaling_results()
      audit_report = load_audit_report()
+     whatif_results = load_whatif_results()
+     simulation_result = load_simulation_results()

      return format_brief(
          campaign, best, trajectory, model_types, hypotheses,
@@ -784,6 +846,8 @@ def generate_brief(
          budget_status=budget_status,
          scaling_results=scaling_results if scaling_results else None,
          audit_report=audit_report,
+         whatif_results=whatif_results if whatif_results else None,
+         simulation_result=simulation_result,
      )
