claude-turing 2.5.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,423 @@
+ #!/usr/bin/env python3
+ """Automatic baseline generation for the autoresearch pipeline.
+ 
+ Auto-generates trivial baselines (majority, mean, random, linear, k-NN)
+ so every experiment has an "is this better than dumb?" reference point.
+ 
+ Usage:
+     python scripts/generate_baselines.py
+     python scripts/generate_baselines.py --methods all
+     python scripts/generate_baselines.py --methods simple
+     python scripts/generate_baselines.py --json
+ """
+ 
+ from __future__ import annotations
+ 
+ import argparse
+ import json
+ import sys
+ from datetime import datetime, timezone
+ from pathlib import Path
+ 
+ import numpy as np
+ import yaml
+ 
+ from scripts.turing_io import load_config, load_experiments
+ 
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
+ 
+ # Baseline method groups
+ SIMPLE_METHODS = ["random", "majority_or_mean"]
+ LINEAR_METHODS = ["linear"]
+ ALL_METHODS = ["random", "majority_or_mean", "stratified_or_median", "linear", "knn"]
+ 
+ 
+ # --- Baseline Methods ---
+ 
+ 
+ def random_baseline(y: np.ndarray, task_type: str = "classification") -> np.ndarray:
+     """Random predictions."""
+     n = len(y)
+     if task_type == "classification":
+         classes = np.unique(y)
+         return np.random.choice(classes, size=n)
+     else:
+         return np.random.uniform(np.min(y), np.max(y), size=n)
+ 
+ 
+ def majority_or_mean_baseline(y: np.ndarray, task_type: str = "classification") -> np.ndarray:
+     """Majority class (classification) or mean (regression)."""
+     n = len(y)
+     if task_type == "classification":
+         from scipy import stats as scipy_stats
+         mode_result = scipy_stats.mode(y, keepdims=False)
+         majority = mode_result.mode
+         return np.full(n, majority)
+     else:
+         return np.full(n, np.mean(y))
+ 
+ 
+ def stratified_or_median_baseline(y: np.ndarray, task_type: str = "classification") -> np.ndarray:
+     """Stratified random (classification) or median (regression)."""
+     n = len(y)
+     if task_type == "classification":
+         classes, counts = np.unique(y, return_counts=True)
+         probs = counts / counts.sum()
+         return np.random.choice(classes, size=n, p=probs)
+     else:
+         return np.full(n, np.median(y))
+ 
+ 
+ def linear_baseline(
+     X: np.ndarray,
+     y: np.ndarray,
+     task_type: str = "classification",
+ ) -> dict:
+     """Linear model baseline (LogisticRegression / Ridge)."""
+     from sklearn.linear_model import LogisticRegression, Ridge
+ 
+     n_samples = X.shape[0]
+     split = int(n_samples * 0.7)
+     X_train, X_test = X[:split], X[split:]
+     y_train, y_test = y[:split], y[split:]
+ 
+     if task_type == "classification":
+         model = LogisticRegression(max_iter=1000, solver="lbfgs")
+     else:
+         model = Ridge(alpha=1.0)
+ 
+     model.fit(X_train, y_train)
+     predictions = model.predict(X_test)
+ 
+     return {
+         "predictions": predictions,
+         "labels": y_test,
+         "model_name": "LogisticRegression" if task_type == "classification" else "Ridge",
+     }
+ 
+ 
+ def knn_baseline(
+     X: np.ndarray,
+     y: np.ndarray,
+     task_type: str = "classification",
+     n_neighbors: int = 5,
+ ) -> dict:
+     """k-NN baseline."""
+     from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
+ 
+     n_samples = X.shape[0]
+     split = int(n_samples * 0.7)
+     X_train, X_test = X[:split], X[split:]
+     y_train, y_test = y[:split], y[split:]
+ 
+     k = min(n_neighbors, len(X_train))
+     if task_type == "classification":
+         model = KNeighborsClassifier(n_neighbors=k)
+     else:
+         model = KNeighborsRegressor(n_neighbors=k)
+ 
+     model.fit(X_train, y_train)
+     predictions = model.predict(X_test)
+ 
+     return {
+         "predictions": predictions,
+         "labels": y_test,
+         "model_name": f"k-NN (k={k})",
+     }
+ 
+ 
+ # --- Evaluation ---
+ 
+ 
+ def evaluate_predictions(
+     predictions: np.ndarray,
+     labels: np.ndarray,
+     task_type: str = "classification",
+     primary_metric: str = "accuracy",
+ ) -> dict:
+     """Evaluate baseline predictions."""
+     min_len = min(len(predictions), len(labels))
+     predictions = predictions[:min_len]
+     labels = labels[:min_len]
+ 
+     if task_type == "classification":
+         accuracy = float(np.mean(predictions == labels))
+         return {"accuracy": round(accuracy, 6), "n_samples": min_len}
+     else:
+         mse = float(np.mean((predictions - labels) ** 2))
+         rmse = float(np.sqrt(mse))
+         return {"mse": round(mse, 6), "rmse": round(rmse, 6), "n_samples": min_len}
+ 
+ 
+ # --- Full Pipeline ---
+ 
+ 
+ def generate_baselines(
+     methods: str = "all",
+     config_path: str = "config.yaml",
+     log_path: str = DEFAULT_LOG_PATH,
+     data_path: str | None = None,
+ ) -> dict:
+     """Generate baseline results.
+ 
+     Args:
+         methods: Method group (all, simple, linear).
+         config_path: Path to config.yaml.
+         log_path: Path to experiment log.
+         data_path: Path to data (optional, for linear/knn).
+ 
+     Returns:
+         Baseline report dict.
+     """
+     config = load_config(config_path)
+     eval_cfg = config.get("evaluation", {})
+     primary_metric = eval_cfg.get("primary_metric", "accuracy")
+     task_type = config.get("task", {}).get("type", "classification")
+ 
+     experiments = load_experiments(log_path)
+ 
+     # Find current best for comparison (assumes higher metric values are better)
+     kept = [e for e in experiments if e.get("status") == "kept"]
+     current_best_value = None
+     if kept:
+         best = max(kept, key=lambda e: e.get("metrics", {}).get(primary_metric, 0))
+         current_best_value = best.get("metrics", {}).get(primary_metric)
+ 
+     # Select methods
+     if methods == "simple":
+         method_list = SIMPLE_METHODS
+     elif methods == "linear":
+         method_list = LINEAR_METHODS
+     else:
+         method_list = ALL_METHODS
+ 
+     # For methods that need data, check if data is available
+     has_data = data_path is not None and Path(data_path).exists()
+ 
+     report = {
+         "generated_at": datetime.now(timezone.utc).isoformat(),
+         "task_type": task_type,
+         "primary_metric": primary_metric,
+         "methods_requested": methods,
+         "baselines": [],
+         "current_best": current_best_value,
+         "data_available": has_data,
+     }
+ 
+     # No data available: return a baseline plan only.
+     # Scores are computed once --data points at a real dataset.
+     if not has_data:
+         report["note"] = "No data loaded — baseline plan generated. Run with --data to compute actual scores."
+         for method in method_list:
+             report["baselines"].append({
+                 "method": _method_display_name(method, task_type),
+                 "metric_value": None,
+                 "notes": "Requires data",
+             })
+         return report
+ 
+     # Load data
+     try:
+         data = np.load(data_path, allow_pickle=True)
+         X = data.get("X", data.get("features"))
+         y = data.get("y", data.get("labels", data.get("target")))
+         if X is None or y is None:
+             return {"error": f"Data file {data_path} missing X/y arrays"}
+     except Exception as e:
+         return {"error": f"Failed to load data: {e}"}
+ 
+     # Run baselines
+     for method in method_list:
+         result = _run_baseline(method, X, y, task_type, primary_metric)
+         report["baselines"].append(result)
+ 
+     # Add current best for comparison
+     if current_best_value is not None:
+         report["baselines"].append({
+             "method": "Current best",
+             "metric_value": current_best_value,
+             "notes": "",
+         })
+ 
+     # Compute improvement over linear baseline
+     linear_result = next((b for b in report["baselines"] if "linear" in b.get("method", "").lower()), None)
+     if linear_result and linear_result.get("metric_value") and current_best_value:
+         improvement = current_best_value - linear_result["metric_value"]
+         report["improvement_over_linear"] = round(improvement, 6)
+ 
+     return report
+ 
+ 
+ def _method_display_name(method: str, task_type: str) -> str:
+     """Human-readable method name."""
+     names = {
+         "random": "Random",
+         "majority_or_mean": "Majority class" if task_type == "classification" else "Mean predictor",
+         "stratified_or_median": "Stratified random" if task_type == "classification" else "Median predictor",
+         "linear": "Logistic Regression" if task_type == "classification" else "Ridge Regression",
+         "knn": "k-NN (k=5)",
+     }
+     return names.get(method, method)
+ 
+ 
+ def _run_baseline(
+     method: str,
+     X: np.ndarray,
+     y: np.ndarray,
+     task_type: str,
+     primary_metric: str,
+ ) -> dict:
+     """Run a single baseline method."""
+     try:
+         if method == "random":
+             preds = random_baseline(y, task_type)
+             eval_result = evaluate_predictions(preds, y, task_type, primary_metric)
+             return {
+                 "method": "Random",
+                 "metric_value": eval_result.get(primary_metric, eval_result.get("accuracy", eval_result.get("rmse"))),
+                 "notes": "Floor — below this = bug",
+             }
+ 
+         elif method == "majority_or_mean":
+             preds = majority_or_mean_baseline(y, task_type)
+             eval_result = evaluate_predictions(preds, y, task_type, primary_metric)
+             name = "Majority class" if task_type == "classification" else "Mean predictor"
+             return {
+                 "method": name,
+                 "metric_value": eval_result.get(primary_metric, eval_result.get("accuracy", eval_result.get("rmse"))),
+                 "notes": "Naive floor",
+             }
+ 
+         elif method == "stratified_or_median":
+             preds = stratified_or_median_baseline(y, task_type)
+             eval_result = evaluate_predictions(preds, y, task_type, primary_metric)
+             name = "Stratified random" if task_type == "classification" else "Median predictor"
+             return {
+                 "method": name,
+                 "metric_value": eval_result.get(primary_metric, eval_result.get("accuracy", eval_result.get("rmse"))),
+                 "notes": "",
+             }
+ 
+         elif method == "linear":
+             result = linear_baseline(X, y, task_type)
+             eval_result = evaluate_predictions(result["predictions"], result["labels"], task_type, primary_metric)
+             return {
+                 "method": result["model_name"],
+                 "metric_value": eval_result.get(primary_metric, eval_result.get("accuracy", eval_result.get("rmse"))),
+                 "notes": "Linear ceiling",
+             }
+ 
+         elif method == "knn":
+             result = knn_baseline(X, y, task_type)
+             eval_result = evaluate_predictions(result["predictions"], result["labels"], task_type, primary_metric)
+             return {
+                 "method": result["model_name"],
+                 "metric_value": eval_result.get(primary_metric, eval_result.get("accuracy", eval_result.get("rmse"))),
+                 "notes": "Non-parametric reference",
+             }
+ 
+     except Exception as e:
+         return {"method": method, "metric_value": None, "notes": f"Error: {e}"}
+ 
+     return {"method": method, "metric_value": None, "notes": "Unknown method"}
+ 
+ 
+ # --- Report Formatting ---
+ 
+ 
+ def save_baseline_report(report: dict, output_dir: str = "experiments/baselines") -> Path:
+     """Save baseline report to YAML."""
+     out_path = Path(output_dir)
+     out_path.mkdir(parents=True, exist_ok=True)
+ 
+     date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+     filepath = out_path / f"baselines-{date}.yaml"
+ 
+     with open(filepath, "w") as f:
+         yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+ 
+     return filepath
+ 
+ 
+ def format_baseline_report(report: dict) -> str:
+     """Format baseline report as markdown."""
+     if "error" in report:
+         return f"ERROR: {report['error']}"
+ 
+     task = report.get("task_type", "?")
+     metric = report.get("primary_metric", "metric")
+ 
+     lines = [
+         f"# Baselines for {task} ({metric})",
+         "",
+         f"*Generated {report.get('generated_at', 'N/A')[:19]}*",
+         "",
+     ]
+ 
+     baselines = report.get("baselines", [])
+     if baselines:
+         lines.append(f"| Method | {metric} | Notes |")
+         lines.append("|--------|--------|-------|")
+         for b in baselines:
+             val = b.get("metric_value")
+             val_str = f"{val:.4f}" if isinstance(val, (int, float)) else str(val or "N/A")
+             lines.append(f"| {b.get('method', '?')} | {val_str} | {b.get('notes', '')} |")
+         lines.append("")
+ 
+     improvement = report.get("improvement_over_linear")
+     if improvement is not None:
+         lines.append(f"**Your model beats the linear baseline by {improvement:+.4f} ({improvement / report.get('current_best', 1) * 100:.1f}%)**")
+         lines.append("")
+ 
+     if report.get("note"):
+         lines.append(f"*{report['note']}*")
+ 
+     return "\n".join(lines)
+ 
+ 
+ def main() -> None:
+     """CLI entry point."""
+     parser = argparse.ArgumentParser(
+         description="Automatic baseline generation",
+     )
+     parser.add_argument(
+         "--methods", choices=["all", "simple", "linear"], default="all",
+         help="Baseline method group (default: all)",
+     )
+     parser.add_argument(
+         "--data",
+         help="Path to data file (.npz with X and y arrays)",
+     )
+     parser.add_argument(
+         "--config", default="config.yaml",
+         help="Path to config.yaml",
+     )
+     parser.add_argument(
+         "--log", default=DEFAULT_LOG_PATH,
+         help="Path to experiment log",
+     )
+     parser.add_argument(
+         "--json", action="store_true",
+         help="Output raw JSON instead of formatted report",
+     )
+     args = parser.parse_args()
+ 
+     report = generate_baselines(
+         methods=args.methods,
+         config_path=args.config,
+         log_path=args.log,
+         data_path=args.data,
+     )
+ 
+     if "error" not in report:
+         filepath = save_baseline_report(report)
+         print(f"Saved to {filepath}", file=sys.stderr)
+ 
+     if args.json:
+         print(json.dumps(report, indent=2, default=str))
+     else:
+         print(format_baseline_report(report))
+ 
+ 
+ if __name__ == "__main__":
+     main()
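
To smoke-test the new script end to end, a minimal sketch (train.npz is a hypothetical file name; the X/y array keys match what the loader above accepts, with features/labels/target as fallbacks):

    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 8))   # 200 samples, 8 features
    y = (X[:, 0] > 0).astype(int)   # binary labels
    np.savez("train.npz", X=X, y=y)

    # python scripts/generate_baselines.py --data train.npz --json
    # prints the report (baselines, current_best, improvement_over_linear when available)
    # and writes a copy to experiments/baselines/baselines-<date>.yaml
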
@@ -355,6 +355,22 @@ def load_scaling_results(scaling_dir: str = "experiments/scaling") -> list[dict]
      return reports
  
  
+ def load_audit_report(audit_dir: str = "experiments/audits") -> dict | None:
+     """Load the most recent audit report."""
+     path = Path(audit_dir)
+     if not path.exists():
+         return None
+     files = sorted(path.glob("audit-*.yaml"))
+     if not files:
+         return None
+     try:
+         with open(files[-1]) as f:
+             report = yaml.safe_load(f)
+         return report if isinstance(report, dict) else None
+     except (yaml.YAMLError, OSError):
+         return None
+ 
+ 
  def format_brief(
      campaign: dict,
      best: dict | None,
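
load_audit_report simply returns the newest audit-*.yaml under experiments/audits as a dict. For orientation, a minimal hypothetical audit file (key names taken from the consuming code further down; every value and the check id are illustrative) could look like:

    # experiments/audits/audit-2025-06-01.yaml  (illustrative)
    verdict: pass_with_warnings
    score:
      pass: 10
      checkable: 12
      fail: 0
    actions:
      - check: baseline_coverage   # hypothetical check id
        fix: python scripts/generate_baselines.py --methods simple
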
@@ -376,6 +392,7 @@ def format_brief(
      ensemble_results: list[dict] | None = None,
      budget_status: dict | None = None,
      scaling_results: list[dict] | None = None,
+     audit_report: dict | None = None,
  ) -> str:
      """Format the research briefing as markdown."""
      direction = "lower" if lower_is_better else "higher"
@@ -635,6 +652,28 @@ def format_brief(
          reason = verdict.get("reason", "")
          lines.append(f"- **{v.upper()}**: {reason}")
  
+     # Methodology audit
+     if audit_report and audit_report.get("score"):
+         score = audit_report["score"]
+         verdict = audit_report.get("verdict", "?")
+         verdict_labels = {
+             "pass": "PASS",
+             "pass_with_warnings": "PASS (warnings)",
+             "needs_work": "NEEDS WORK",
+             "fail": "FAIL",
+         }
+         lines.extend(["", "## Methodology Audit", ""])
+         lines.append(
+             f"**{verdict_labels.get(verdict, verdict.upper())}** — "
+             f"{score.get('pass', 0)}/{score.get('checkable', 0)} checks passed, "
+             f"{score.get('fail', 0)} failure(s)"
+         )
+         actions = audit_report.get("actions", [])
+         if actions:
+             lines.append("")
+             for a in actions[:3]:
+                 lines.append(f"- Fix: `{a['fix']}` ({a['check']})")
+ 
      # Regression check history (stability)
      if regression_checks:
          lines.extend(["", "## Stability", ""])
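
Given an audit file like the hypothetical one sketched earlier, this block would append roughly the following section to the brief (values illustrative, layout as produced by the code above):

    ## Methodology Audit

    **PASS (warnings)** — 10/12 checks passed, 0 failure(s)

    - Fix: `python scripts/generate_baselines.py --methods simple` (baseline_coverage)
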
@@ -728,6 +767,7 @@ def generate_brief(
      ensemble_results = load_ensemble_results()
      budget_status = load_budget_status(log_path=log_path)
      scaling_results = load_scaling_results()
+     audit_report = load_audit_report()
  
      return format_brief(
          campaign, best, trajectory, model_types, hypotheses,
@@ -743,6 +783,7 @@ def generate_brief(
          ensemble_results=ensemble_results if ensemble_results else None,
          budget_status=budget_status,
          scaling_results=scaling_results if scaling_results else None,
+         audit_report=audit_report,
      )
  
  