claude-turing 2.3.0 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,523 @@
1
+ #!/usr/bin/env python3
2
+ """Scaling law estimator for the autoresearch pipeline.
3
+
4
+ Runs experiments at different data/compute/model sizes, fits a power-law
5
+ curve, and predicts full-scale performance. Answers "is it worth training
6
+ on the full dataset?" before committing the compute.
7
+
8
+ Usage:
9
+ python scripts/scaling_estimator.py --axis data
10
+ python scripts/scaling_estimator.py --axis compute --points 4
11
+ python scripts/scaling_estimator.py --analyze experiments/scaling/results.yaml
12
+ python scripts/scaling_estimator.py --json
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import json
19
+ import math
20
+ import sys
21
+ from datetime import datetime, timezone
22
+ from pathlib import Path
23
+
24
+ import numpy as np
25
+ import yaml
26
+
27
+ from scripts.turing_io import load_config, load_experiments
28
+
29
# Default experiment-log location used when --log is not supplied.
DEFAULT_LOG_PATH = "experiments/log.jsonl"
# Scale fractions tried before committing to full-scale training.
DEFAULT_SCALE_POINTS = [0.10, 0.25, 0.50, 0.75]
# Accepted values for the --axis CLI flag.
SCALE_AXES = {"data", "compute", "params"}
32
+
33
+
34
+ # --- Power Law Fitting ---
35
+
36
+
37
def fit_power_law(
    scales: list[float],
    values: list[float],
) -> dict:
    """Fit a power law of the form: performance = a * scale^b.

    Uses log-linear least squares on (log(scale), log(value)). The offset
    term c is intentionally fixed at 0 in this simplified fit; it is kept
    in the returned dict for interface stability with extrapolate().

    Args:
        scales: Scale fractions (e.g., [0.1, 0.25, 0.5, 0.75]).
        values: Metric values at each scale.

    Returns:
        Dict with a, b, c coefficients, r_squared, and residuals, or a
        dict containing an "error" key when the inputs are unusable.
    """
    # A length mismatch would previously crash inside numpy boolean
    # indexing; report a structured error like the other failure modes.
    if len(scales) != len(values):
        return {"a": 0, "b": 0, "c": 0, "r_squared": 0,
                "error": "scales and values must have the same length"}

    if len(scales) < 2:
        return {"a": 0, "b": 0, "c": 0, "r_squared": 0, "error": "Need at least 2 points"}

    x = np.array(scales, dtype=float)
    y = np.array(values, dtype=float)

    # The log transform is undefined for non-positive scales; drop them.
    mask = x > 0
    x = x[mask]
    y = y[mask]

    if len(x) < 2:
        return {"a": 0, "b": 0, "c": 0, "r_squared": 0, "error": "Need at least 2 positive scale points"}

    # Regress log(y) = log(a) + b * log(x). Clip y away from zero so the
    # log stays finite for non-positive metric values.
    log_x = np.log(x)
    log_y = np.log(np.clip(y, 1e-10, None))

    # Closed-form ordinary least squares in log space.
    n = len(log_x)
    sum_lx = np.sum(log_x)
    sum_ly = np.sum(log_y)
    sum_lx2 = np.sum(log_x ** 2)
    sum_lxly = np.sum(log_x * log_y)

    denom = n * sum_lx2 - sum_lx ** 2
    if abs(denom) < 1e-12:
        # Degenerate case: all scale points identical after masking, so the
        # slope is undefined — fall back to a flat fit through the mean.
        b = 0.0
        log_a = np.mean(log_y)
    else:
        b = (n * sum_lxly - sum_lx * sum_ly) / denom
        log_a = (sum_ly - b * sum_lx) / n

    a = math.exp(log_a)
    c = 0.0  # Offset term deliberately fixed at zero (see docstring).

    # Goodness of fit evaluated in the original (non-log) space.
    y_pred = a * x ** b + c
    ss_res = np.sum((y - y_pred) ** 2)
    ss_tot = np.sum((y - np.mean(y)) ** 2)
    r_squared = 1 - ss_res / ss_tot if ss_tot > 0 else 0.0

    return {
        "a": round(float(a), 6),
        "b": round(float(b), 6),
        "c": round(float(c), 6),
        "r_squared": round(float(r_squared), 4),
        "residuals": [round(float(r), 6) for r in (y - y_pred)],
    }
103
+
104
+
105
def extrapolate(
    fit: dict,
    target_scales: list[float],
) -> list[dict]:
    """Project a fitted power law onto new scale values.

    Args:
        fit: Coefficient dict produced by fit_power_law.
        target_scales: Scales at which to evaluate the curve.

    Returns:
        One dict per target scale with scale and predicted_value; the
        prediction is None for non-positive scales, which the model
        cannot evaluate.
    """
    coef_a = fit.get("a", 0)
    coef_b = fit.get("b", 0)
    coef_c = fit.get("c", 0)

    results: list[dict] = []
    for target in target_scales:
        if target > 0:
            estimate = coef_a * (target ** coef_b) + coef_c
            results.append({
                "scale": round(target, 4),
                "predicted_value": round(float(estimate), 6),
            })
        else:
            results.append({"scale": target, "predicted_value": None})

    return results
134
+
135
+
136
+ # --- Scale Point Generation ---
137
+
138
+
139
+ def generate_scale_points(
140
+ axis: str,
141
+ fractions: list[float] | None = None,
142
+ config: dict | None = None,
143
+ ) -> list[dict]:
144
+ """Generate experiment configurations for each scale point.
145
+
146
+ Args:
147
+ axis: Scaling axis (data, compute, params).
148
+ fractions: Scale fractions (default: [0.1, 0.25, 0.5, 0.75]).
149
+ config: Current model config.
150
+
151
+ Returns:
152
+ List of scale point dicts with fraction, description, config_overrides.
153
+ """
154
+ if fractions is None:
155
+ fractions = DEFAULT_SCALE_POINTS
156
+
157
+ if config is None:
158
+ config = {}
159
+
160
+ hyperparams = config.get("model", {}).get("hyperparams", {})
161
+ points = []
162
+
163
+ for frac in fractions:
164
+ point = {
165
+ "fraction": frac,
166
+ "percentage": f"{frac * 100:.0f}%",
167
+ "config_overrides": {},
168
+ }
169
+
170
+ if axis == "data":
171
+ point["description"] = f"Train on {frac * 100:.0f}% of dataset"
172
+ point["config_overrides"]["data_fraction"] = frac
173
+
174
+ elif axis == "compute":
175
+ max_epochs = hyperparams.get("n_estimators", hyperparams.get("epochs", 100))
176
+ scaled_epochs = max(1, int(max_epochs * frac))
177
+ point["description"] = f"Train for {scaled_epochs} epochs ({frac * 100:.0f}%)"
178
+ point["config_overrides"]["n_estimators"] = scaled_epochs
179
+
180
+ elif axis == "params":
181
+ n_estimators = hyperparams.get("n_estimators", 100)
182
+ max_depth = hyperparams.get("max_depth", 6)
183
+ point["description"] = f"Model at {frac * 100:.0f}% capacity"
184
+ point["config_overrides"]["n_estimators"] = max(1, int(n_estimators * frac))
185
+ point["config_overrides"]["max_depth"] = max(1, int(max_depth * frac))
186
+
187
+ points.append(point)
188
+
189
+ return points
190
+
191
+
192
+ # --- Verdict ---
193
+
194
+
195
def compute_verdict(
    observed: list[dict],
    predictions: list[dict],
    primary_metric: str,
) -> dict:
    """Decide whether scaling to the full dataset looks worthwhile.

    Args:
        observed: Actual runs as {fraction, metric_value} dicts.
        predictions: Extrapolated predictions from extrapolate().
        primary_metric: Primary metric name (kept for interface
            stability; not used in the decision itself).

    Returns:
        Dict with a verdict label, predicted/relative gains, and a
        human-readable reason.
    """
    if not observed or not predictions:
        return {"verdict": "insufficient_data", "reason": "Not enough data points"}

    # The baseline is the largest fraction actually run.
    baseline = sorted(observed, key=lambda r: r.get("fraction", 0))[-1]
    base_frac = baseline.get("fraction", 0)
    base_value = baseline.get("metric_value", 0)

    # Locate the 100%-scale prediction.
    full_scale = next(
        (p for p in predictions if abs(p["scale"] - 1.0) < 0.01), None
    )
    if full_scale is None or full_scale["predicted_value"] is None:
        return {"verdict": "no_prediction", "reason": "Cannot predict full-scale performance"}

    gain = full_scale["predicted_value"] - base_value
    rel = abs(gain / base_value) if base_value != 0 else 0

    shared = {
        "predicted_gain": round(gain, 6),
        "relative_gain": round(rel, 6),
    }

    if rel < 0.005:  # under half a percent: not worth the compute
        return {
            "verdict": "diminishing_returns",
            **shared,
            "reason": (
                f"Full-scale gains only {gain:+.4f} ({rel:.1%}) "
                f"over {base_frac:.0%} data. Consider feature engineering instead."
            ),
        }
    if rel < 0.02:  # under two percent: borderline
        return {
            "verdict": "marginal_gains",
            **shared,
            "reason": (
                f"Full-scale gains {gain:+.4f} ({rel:.1%}). "
                f"Worth running if compute is cheap."
            ),
        }
    return {
        "verdict": "worth_scaling",
        **shared,
        "reason": (
            f"Full-scale gains {gain:+.4f} ({rel:.1%}). "
            f"Significant improvement expected — proceed with full-scale training."
        ),
    }
262
+
263
+
264
+ # --- Analysis ---
265
+
266
+
267
def analyze_scaling(
    scale_results: list[dict],
    primary_metric: str,
) -> dict:
    """Turn raw scale-point measurements into a full scaling report.

    Args:
        scale_results: Dicts with fraction, metric_value (and optionally std).
        primary_metric: Name of the primary metric.

    Returns:
        Report dict with the power-law fit, predictions at 100%, 150%,
        and 200% scale, and a go/no-go verdict.
    """
    if not scale_results:
        return {"error": "No scaling results to analyze"}

    fit = fit_power_law(
        [entry["fraction"] for entry in scale_results],
        [entry["metric_value"] for entry in scale_results],
    )
    forecast = extrapolate(fit, [1.0, 1.5, 2.0])

    return {
        "analyzed_at": datetime.now(timezone.utc).isoformat(),
        "primary_metric": primary_metric,
        "scale_points": scale_results,
        "power_law_fit": fit,
        "predictions": forecast,
        "verdict": compute_verdict(scale_results, forecast, primary_metric),
    }
303
+
304
+
305
+ # --- Report Formatting ---
306
+
307
+
308
def save_scaling_report(report: dict, output_dir: str = "experiments/scaling") -> Path:
    """Write the report as YAML into output_dir and return the file path."""
    target = Path(output_dir)
    target.mkdir(parents=True, exist_ok=True)

    # One report file per UTC day: scale-YYYY-MM-DD.yaml.
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    destination = target / f"scale-{stamp}.yaml"

    with open(destination, "w") as handle:
        yaml.dump(report, handle, default_flow_style=False, sort_keys=False)

    return destination
320
+
321
+
322
def format_scaling_report(report: dict) -> str:
    """Render a scaling analysis report as markdown text."""
    if "error" in report:
        return f"ERROR: {report['error']}"

    metric_name = report.get("primary_metric", "metric")
    law = report.get("power_law_fit", {})

    out = [
        "# Scaling Analysis",
        "",
        f"*Analyzed {report.get('analyzed_at', 'N/A')[:19]}*",
        "",
    ]

    # Table of measured scale points; columns depend on whether any
    # point carries a std estimate.
    rows = report.get("scale_points", [])
    if rows:
        out += ["## Observed Scale Points", ""]
        if any("std" in row for row in rows):
            out += [
                f"| Data % | {metric_name} (mean±std) |",
                "|--------|---------------------|",
            ]
            for row in rows:
                spread = row.get("std", 0)
                out.append(f"| {row['fraction'] * 100:.0f}% | {row['metric_value']:.4f} ± {spread:.4f} |")
        else:
            out += [f"| Data % | {metric_name} |", "|--------|---------|"]
            for row in rows:
                out.append(f"| {row['fraction'] * 100:.0f}% | {row['metric_value']:.4f} |")
        out.append("")

    # Fitted curve, only when the fit succeeded.
    if law and "error" not in law:
        out += [
            "## Power Law Fit",
            "",
            f"**{metric_name} = {law['a']:.4f} × n^{law['b']:.4f}** (R²={law['r_squared']:.4f})",
            "",
        ]

    # Extrapolated values (non-positive scales are skipped).
    forecast = report.get("predictions", [])
    if forecast:
        out += ["## Predictions", ""]
        for entry in forecast:
            if entry["predicted_value"] is not None:
                out.append(f"- **{entry['scale'] * 100:.0f}% data** → {metric_name} = {entry['predicted_value']:.4f}")
        out.append("")

    # Final recommendation.
    verdict = report.get("verdict", {})
    if verdict:
        label = verdict.get("verdict", "?")
        headline = {
            "diminishing_returns": "DIMINISHING RETURNS",
            "marginal_gains": "MARGINAL GAINS",
            "worth_scaling": "WORTH SCALING",
        }.get(label, label.upper())
        out += ["## Verdict", "", f"**{headline}**", "", verdict.get("reason", ""), ""]

    return "\n".join(out)
392
+
393
+
394
def format_ascii_plot(
    scale_results: list[dict],
    predictions: list[dict],
    metric: str,
    width: int = 50,
    height: int = 15,
) -> str:
    """Render observed and predicted points as an ASCII scatter plot."""
    markers = [(r["fraction"], r["metric_value"], "o") for r in scale_results]
    markers += [
        (p["scale"], p["predicted_value"], "*")
        for p in predictions
        if p["predicted_value"] is not None
    ]

    if not markers:
        return "(no data to plot)"

    xs = [m[0] for m in markers]
    ys = [m[1] for m in markers]
    lo_x, hi_x = min(xs), max(xs)
    lo_y, hi_y = min(ys), max(ys)

    # Avoid zero-width ranges so normalization below never divides by 0.
    if hi_x == lo_x:
        hi_x = lo_x + 1
    if hi_y == lo_y:
        hi_y = lo_y + 0.01

    canvas = [[" "] * width for _ in range(height)]

    for x, y, glyph in markers:
        col = int((x - lo_x) / (hi_x - lo_x) * (width - 1))
        row = height - 1 - int((y - lo_y) / (hi_y - lo_y) * (height - 1))
        col = max(0, min(width - 1, col))
        row = max(0, min(height - 1, row))
        canvas[row][col] = glyph

    out = [f"  {metric} vs Scale (o=observed, *=predicted)", ""]
    out.append(f"  {hi_y:.3f} |")
    out += [f"         |{''.join(row)}|" for row in canvas]
    out.append(f"  {lo_y:.3f} |{'_' * width}|")
    out.append(f"         {lo_x:.0%}{' ' * (width - 8)}{hi_x:.0%}")

    return "\n".join(out)
439
+
440
+
441
def main() -> None:
    """CLI entry point: plan a scaling study or analyze finished results."""
    parser = argparse.ArgumentParser(
        description="Scaling law estimator",
    )
    parser.add_argument(
        "--axis", choices=list(SCALE_AXES), default="data",
        help="Scaling axis (default: data)",
    )
    parser.add_argument(
        "--points", type=int, default=len(DEFAULT_SCALE_POINTS),
        help=f"Number of scale points (default: {len(DEFAULT_SCALE_POINTS)})",
    )
    parser.add_argument(
        "--analyze",
        help="Analyze existing scaling results YAML",
    )
    parser.add_argument(
        "--config", default="config.yaml",
        help="Path to config.yaml",
    )
    parser.add_argument(
        "--log", default=DEFAULT_LOG_PATH,
        help="Path to experiment log",
    )
    parser.add_argument(
        "--plot", action="store_true",
        help="Include ASCII plot",
    )
    parser.add_argument(
        "--json", action="store_true",
        help="Output raw JSON instead of formatted report",
    )
    args = parser.parse_args()

    if args.analyze:
        # Analysis mode: fit and judge an existing results file.
        with open(args.analyze) as handle:
            raw = yaml.safe_load(handle)
        cfg = load_config(args.config)
        metric = cfg.get("evaluation", {}).get("primary_metric", "accuracy")
        report = analyze_scaling(raw.get("scale_points", raw), metric)
    else:
        # Planning mode: emit the experiment specs the agent should run.
        cfg = load_config(args.config)
        metric = cfg.get("evaluation", {}).get("primary_metric", "accuracy")
        plan = generate_scale_points(args.axis, DEFAULT_SCALE_POINTS[:args.points], cfg)
        report = {
            "action": "plan",
            "axis": args.axis,
            "primary_metric": metric,
            "scale_points": plan,
            "message": f"Run {len(plan)} experiments at scale points: {', '.join(p['percentage'] for p in plan)}",
        }

    if "error" not in report:
        saved_to = save_scaling_report(report)
        print(f"Saved to {saved_to}", file=sys.stderr)

    if args.json:
        print(json.dumps(report, indent=2, default=str))
        return

    if report.get("action") == "plan":
        body = ["# Scaling Plan", "", f"**Axis:** {report['axis']}", ""]
        for spec in report["scale_points"]:
            body.append(f"- {spec['percentage']}: {spec['description']}")
        body += ["", report["message"]]
        print("\n".join(body))
    else:
        rendered = format_scaling_report(report)
        if args.plot:
            rendered += "\n\n" + format_ascii_plot(
                report.get("scale_points", []),
                report.get("predictions", []),
                metric,
            )
        print(rendered)
521
+
522
if __name__ == "__main__":
    # Script entry point: python scripts/scaling_estimator.py ...
    main()