claude-turing 3.4.0 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +9 -2
  3. package/commands/annotate.md +23 -0
  4. package/commands/archive.md +23 -0
  5. package/commands/cite.md +23 -0
  6. package/commands/flashback.md +22 -0
  7. package/commands/present.md +23 -0
  8. package/commands/replay.md +23 -0
  9. package/commands/search.md +22 -0
  10. package/commands/template.md +22 -0
  11. package/commands/trend.md +21 -0
  12. package/commands/turing.md +14 -0
  13. package/package.json +1 -1
  14. package/src/install.js +1 -0
  15. package/src/verify.js +7 -0
  16. package/templates/scripts/__pycache__/experiment_annotations.cpython-314.pyc +0 -0
  17. package/templates/scripts/__pycache__/experiment_archive.cpython-314.pyc +0 -0
  18. package/templates/scripts/__pycache__/experiment_replay.cpython-314.pyc +0 -0
  19. package/templates/scripts/__pycache__/experiment_search.cpython-314.pyc +0 -0
  20. package/templates/scripts/__pycache__/experiment_templates.cpython-314.pyc +0 -0
  21. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  22. package/templates/scripts/__pycache__/session_flashback.cpython-314.pyc +0 -0
  23. package/templates/scripts/__pycache__/trend_analysis.cpython-314.pyc +0 -0
  24. package/templates/scripts/citation_manager.py +436 -0
  25. package/templates/scripts/experiment_annotations.py +392 -0
  26. package/templates/scripts/experiment_archive.py +534 -0
  27. package/templates/scripts/experiment_replay.py +592 -0
  28. package/templates/scripts/experiment_search.py +451 -0
  29. package/templates/scripts/experiment_templates.py +501 -0
  30. package/templates/scripts/generate_changelog.py +464 -0
  31. package/templates/scripts/generate_figures.py +597 -0
  32. package/templates/scripts/scaffold.py +12 -0
  33. package/templates/scripts/session_flashback.py +461 -0
  34. package/templates/scripts/trend_analysis.py +503 -0
@@ -0,0 +1,503 @@
1
+ #!/usr/bin/env python3
2
+ """Long-term experiment trend analysis for the autoresearch pipeline.
3
+
4
+ Computes improvement velocity, family ROI, diminishing returns detection,
5
+ and phase transition detection across experiment history. Answers "is this
6
+ research direction still productive?" and "how many more experiments before
7
+ the next meaningful gain?"
8
+
9
+ Usage:
10
+ python scripts/trend_analysis.py [--config config.yaml] [--log experiments/log.jsonl]
11
+ python scripts/trend_analysis.py --window 10 --json
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import json
18
+ import math
19
+ import sys
20
+ from datetime import datetime, timezone
21
+ from pathlib import Path
22
+
23
+ import yaml
24
+
25
+ from scripts.turing_io import load_config, load_experiments
26
+
27
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
28
+ DEFAULT_WINDOW = 5
29
+
30
+
31
# --- Improvement Velocity ---


def compute_improvement_trajectory(
    experiments: list[dict],
    metric: str,
    lower_is_better: bool,
) -> list[dict]:
    """Build the best-so-far trajectory of *metric* across kept experiments.

    Experiments that are not "kept", that lack the metric, or whose value
    cannot be coerced to float are skipped. Each trajectory entry records
    the experiment id, its position among the kept experiments, the raw
    metric value, the running best, and whether this experiment improved
    on the previous best.
    """
    points: list[dict] = []
    running_best = math.inf if lower_is_better else -math.inf

    for record in experiments:
        if record.get("status") != "kept":
            continue
        raw = record.get("metrics", {}).get(metric)
        if raw is None:
            continue
        try:
            value = float(raw)
        except (TypeError, ValueError):
            continue

        # Direction of "better" depends on the metric's polarity.
        if lower_is_better:
            is_improvement = value < running_best
        else:
            is_improvement = value > running_best
        if is_improvement:
            running_best = value

        points.append({
            "experiment_id": record.get("experiment_id", "?"),
            # len(points) before append == count of entries so far.
            "index": len(points),
            "timestamp": record.get("timestamp", ""),
            "value": value,
            "best_so_far": running_best,
            "improved": is_improvement,
        })

    return points
76
+
77
+
78
def compute_velocity(
    trajectory: list[dict],
    window: int = DEFAULT_WINDOW,
) -> list[dict]:
    """Summarize improvement velocity over sliding windows of the trajectory.

    Each summary covers `window` consecutive trajectory points and reports
    the change in best-so-far across the window plus the per-experiment
    velocity (change divided by window size). Returns an empty list when
    the trajectory has fewer than two points (or fewer than `window`).
    """
    if len(trajectory) < 2:
        return []

    summaries: list[dict] = []
    # range() is empty when the trajectory is shorter than the window.
    for start in range(len(trajectory) - window + 1):
        chunk = trajectory[start : start + window]
        first_best = chunk[0]["best_so_far"]
        last_best = chunk[-1]["best_so_far"]
        gain = last_best - first_best

        summaries.append({
            "window_start": chunk[0]["experiment_id"],
            "window_end": chunk[-1]["experiment_id"],
            "start_index": start,
            "end_index": start + window - 1,
            "start_best": first_best,
            "end_best": last_best,
            "improvement": round(gain, 6),
            "velocity": round(gain / window, 6),
            "improvements_in_window": sum(1 for entry in chunk if entry.get("improved")),
        })

    return summaries
111
+
112
+
113
# --- Family ROI ---


def compute_family_roi(
    experiments: list[dict],
    metric: str,
    lower_is_better: bool,
) -> list[dict]:
    """Rank experiment families by return on investment for *metric*.

    A family's improvement is measured from its first kept metric value to
    its best kept metric value; ROI divides that by the total number of
    experiments the family consumed. Families are returned sorted by ROI,
    highest first.
    """
    by_family: dict[str, list[dict]] = {}
    for record in experiments:
        by_family.setdefault(record.get("family") or "untagged", []).append(record)

    summaries = []
    for name, members in sorted(by_family.items()):
        kept_members = [m for m in members if m.get("status") == "kept"]

        # Numeric metric values from kept experiments, in log order.
        metric_vals: list[float] = []
        for member in kept_members:
            raw = member.get("metrics", {}).get(metric)
            if raw is None:
                continue
            try:
                metric_vals.append(float(raw))
            except (TypeError, ValueError):
                continue

        if len(metric_vals) < 2:
            gain = 0.0
        elif lower_is_better:
            gain = metric_vals[0] - min(metric_vals)
        else:
            gain = max(metric_vals) - metric_vals[0]

        spent = len(members)
        roi = gain / spent if spent > 0 and gain != 0 else 0.0
        per_unit = spent / gain if gain > 0 else math.inf

        summaries.append({
            "family": name,
            "total_experiments": spent,
            "kept": len(kept_members),
            "keep_rate": round(len(kept_members) / spent, 3) if spent > 0 else 0,
            "improvement": round(gain, 6),
            "roi": round(roi, 6),
            "experiments_per_unit_gain": round(per_unit, 2) if per_unit != math.inf else None,
        })

    # Stable sort keeps alphabetical order among ROI ties.
    summaries.sort(key=lambda entry: entry["roi"], reverse=True)
    return summaries
169
+
170
+
171
# --- Diminishing Returns Detection ---


def detect_diminishing_returns(
    trajectory: list[dict],
    lower_is_better: bool,
) -> dict:
    """Check whether the improvement trajectory is flattening out.

    Fits ``best_so_far ~ a * ln(index + 1) + b`` by least squares, then
    inverts the fit to estimate how many experiments a further 0.5% gain
    would take. "Detected" means the improvement over the last few points
    is under 25% of the early improvement, given at least 6 points.
    """
    if len(trajectory) < 3:
        return {"detected": False, "reason": "Too few data points"}

    # (x, y) pairs: 1-based position vs running best.
    xs = [point["index"] + 1 for point in trajectory]
    ys = [point["best_so_far"] for point in trajectory]
    ln_xs = [math.log(x) for x in xs]
    n = len(xs)

    # Closed-form least squares for y = a*ln(x) + b.
    sum_lnx = sum(ln_xs)
    sum_y = sum(ys)
    sum_lnx2 = sum(lx ** 2 for lx in ln_xs)
    sum_lnx_y = sum(lx * y for lx, y in zip(ln_xs, ys))

    denom = n * sum_lnx2 - sum_lnx ** 2
    if abs(denom) < 1e-12:
        return {"detected": False, "reason": "Degenerate fit"}

    a = (n * sum_lnx_y - sum_lnx * sum_y) / denom
    b = (sum_y - a * sum_lnx) / n

    # Goodness of fit (coefficient of determination).
    y_mean = sum_y / n
    ss_tot = sum((y - y_mean) ** 2 for y in ys)
    ss_res = sum((y - (a * lx + b)) ** 2 for y, lx in zip(ys, ln_xs))
    r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0

    best_now = ys[-1]
    n_now = xs[-1]

    # A 0.5% move in the "better" direction.
    if lower_is_better:
        target = best_now * 0.995
    else:
        target = best_now * 1.005

    # Invert the fit: target = a*ln(n) + b  =>  n = exp((target - b) / a).
    predicted_n = None
    additional = None
    if abs(a) > 1e-12:
        try:
            predicted_n = int(math.exp((target - b) / a))
            additional = max(0, predicted_n - n_now)
        except (OverflowError, ValueError):
            # exp() can overflow for near-flat fits; leave prediction unset.
            pass

    # Compare late-stage improvement against the early improvement.
    recent = abs(ys[-1] - ys[max(0, len(ys) - 4)]) if len(ys) >= 4 else 0
    early = abs(ys[min(3, len(ys) - 1)] - ys[0]) if len(ys) >= 2 else 0
    ratio = recent / early if early > 0 else 0

    return {
        "detected": ratio < 0.25 and len(trajectory) >= 6,
        "fit": {
            "a": round(a, 6),
            "b": round(b, 6),
            "r_squared": round(r_squared, 4),
        },
        "current_best": round(best_now, 6),
        "target_0_5_pct": round(target, 6),
        "predicted_experiments_for_target": predicted_n,
        "additional_experiments_needed": additional,
        "recent_vs_early_ratio": round(ratio, 4),
    }
248
+
249
+
250
# --- Phase Transition Detection ---


def detect_phase_transitions(
    trajectory: list[dict],
    threshold_factor: float = 3.0,
) -> list[dict]:
    """Find step-change improvements that dwarf the typical increment.

    A step change is any move in best_so_far whose magnitude exceeds
    ``threshold_factor`` times the median non-zero step magnitude.
    Returns an empty list when there are too few points or steps.
    """
    if len(trajectory) < 3:
        return []

    # Per-step movement of the running best.
    steps = []
    for pos, (prev, curr) in enumerate(zip(trajectory, trajectory[1:]), start=1):
        move = curr["best_so_far"] - prev["best_so_far"]
        steps.append({
            "index": pos,
            "experiment_id": curr["experiment_id"],
            "delta": move,
            "abs_delta": abs(move),
        })

    # Only steps that actually moved the best contribute to the median.
    nonzero = [s for s in steps if s["abs_delta"] > 0]
    if len(nonzero) < 2:
        return []

    magnitudes = sorted(s["abs_delta"] for s in nonzero)
    half = len(magnitudes) // 2
    if len(magnitudes) % 2 == 1:
        typical = magnitudes[half]
    else:
        typical = (magnitudes[half - 1] + magnitudes[half]) / 2

    if typical <= 0:
        return []

    cutoff = threshold_factor * typical
    return [
        {
            "experiment_id": s["experiment_id"],
            "index": s["index"],
            "delta": round(s["delta"], 6),
            "magnitude_vs_median": round(s["abs_delta"] / typical, 2),
            "type": "step_change",
        }
        for s in steps
        if s["abs_delta"] > cutoff
    ]
304
+
305
+
306
# --- Report ---


def save_trend_report(report: dict, output_dir: str = "experiments/trends") -> Path:
    """Write the trend analysis report to a timestamped YAML file.

    Args:
        report: Trend analysis result dict (as built by run_trend_analysis).
        output_dir: Directory for trend reports; created if missing.

    Returns:
        Path of the written YAML file.
    """
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)
    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d_%H%M%S")
    filepath = out_path / f"trend-{date_str}.yaml"
    # Explicit UTF-8: the default open() encoding is locale-dependent
    # (PEP 597), and YAML files are conventionally UTF-8.
    with open(filepath, "w", encoding="utf-8") as f:
        yaml.dump(report, f, default_flow_style=False, sort_keys=False)
    return filepath
318
+
319
+
320
def format_trend_report(report: dict) -> str:
    """Render the trend analysis dict as a human-readable markdown report."""
    if "error" in report:
        return f"ERROR: {report['error']}"

    out: list[str] = [
        "# Experiment Trend Analysis",
        "",
        f"*Generated {report.get('timestamp', '?')[:19]} UTC*",
        "",
    ]

    # --- Improvement velocity table (last 5 windows) ---
    velocities = report.get("velocity", [])
    if velocities:
        out.append("## Improvement Velocity")
        out.append("")
        out.append("| Window | Velocity | Improvements |")
        out.append("|--------|----------|--------------|")
        window_size = report.get("window", DEFAULT_WINDOW)
        for entry in velocities[-5:]:
            out.append(
                f"| {entry['window_start']}..{entry['window_end']} "
                f"| {entry['velocity']:+.6f}/exp "
                f"| {entry['improvements_in_window']}/{window_size} |"
            )
        newest = velocities[-1]
        strongest = max(velocities, key=lambda item: abs(item["velocity"]))
        out.append("")
        out.append(f"**Current velocity:** {newest['velocity']:+.6f}/experiment")
        out.append(
            f"**Peak velocity:** {strongest['velocity']:+.6f}/experiment "
            f"(window ending at {strongest['window_end']})"
        )
        out.append("")
    else:
        out += ["## Improvement Velocity", "", "Not enough data.", ""]

    # --- Family ROI table ---
    roi_rows = report.get("family_roi", [])
    if roi_rows:
        out += [
            "## Family ROI",
            "",
            "| Family | Experiments | Kept | Improvement | Exp/Unit Gain |",
            "|--------|-------------|------|-------------|---------------|",
        ]
        for row in roi_rows:
            if row["experiments_per_unit_gain"] is not None:
                cost = f"~{row['experiments_per_unit_gain']:.0f}"
            else:
                cost = "inf"
            out.append(
                f"| {row['family']} | {row['total_experiments']} "
                f"| {row['kept']} | {row['improvement']:+.6f} | {cost} |"
            )
        out.append("")

        # Families that consumed experiments without any metric gain.
        dead = [row for row in roi_rows if row["experiments_per_unit_gain"] is None]
        if dead:
            out.append(
                "**Zero-improvement families:** "
                + ", ".join(row["family"] for row in dead)
            )
            out.append("")

    # --- Diminishing returns summary ---
    dr = report.get("diminishing_returns", {})
    if dr:
        out += ["## Diminishing Returns", ""]
        if dr.get("detected"):
            out.append("**DETECTED** — recent improvements are <25% of early improvements.")
        else:
            out.append("Not detected (research is still productive).")

        fit = dr.get("fit", {})
        if fit.get("r_squared") is not None:
            out.append(f"- Log-curve fit R2: {fit['r_squared']:.4f}")
        out.append(f"- Current best: {dr.get('current_best', '?')}")
        out.append(f"- Target (+0.5%): {dr.get('target_0_5_pct', '?')}")

        remaining = dr.get("additional_experiments_needed")
        if remaining is not None:
            out.append(f"- Predicted experiments for +0.5%: **~{remaining}**")
        else:
            out.append("- Prediction unavailable (poor fit or degenerate data)")
        out.append("")

    # --- Phase transition table ---
    jumps = report.get("phase_transitions", [])
    if jumps:
        out += [
            "## Phase Transitions (Step-Changes)",
            "",
            "| Experiment | Delta | Magnitude vs Median |",
            "|------------|-------|---------------------|",
        ]
        for jump in jumps:
            out.append(
                f"| {jump['experiment_id']} | {jump['delta']:+.6f} | {jump['magnitude_vs_median']:.1f}x |"
            )
        out.append("")
    else:
        out += ["## Phase Transitions", "", "No step-changes detected.", ""]

    # --- One-line verdict ---
    out += ["---", ""]
    if dr.get("detected"):
        out.append("*Consider switching research direction or injecting novel hypotheses.*")
    elif velocities and velocities[-1]["velocity"] == 0:
        out.append("*Velocity is zero — the last window produced no improvement.*")
    else:
        out.append("*Research is still productive. Continue the current direction.*")

    return "\n".join(out)
432
+
433
+
434
def run_trend_analysis(
    config_path: str = "config.yaml",
    log_path: str = DEFAULT_LOG_PATH,
    window: int = DEFAULT_WINDOW,
) -> dict:
    """Run the full trend analysis over the experiment log.

    Args:
        config_path: Path to config.yaml (supplies the primary metric and
            its polarity).
        log_path: Path to the JSONL experiment log.
        window: Sliding-window size for velocity computation.

    Returns:
        Trend analysis result dict, or an ``{"error": ...}`` dict when the
        log is empty or has fewer than two kept experiments.
    """
    cfg = load_config(config_path)
    evaluation = cfg.get("evaluation", {})
    metric_name = evaluation.get("primary_metric", "accuracy")
    minimize = evaluation.get("lower_is_better", False)

    records = load_experiments(log_path)
    if not records:
        return {"error": "No experiments found", "log_path": log_path}

    trajectory = compute_improvement_trajectory(records, metric_name, minimize)
    if len(trajectory) < 2:
        return {"error": "Need at least 2 kept experiments for trend analysis"}

    return {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "metric": metric_name,
        "lower_is_better": minimize,
        "total_experiments": len(records),
        "kept_experiments": len(trajectory),
        "window": window,
        "velocity": compute_velocity(trajectory, window),
        "family_roi": compute_family_roi(records, metric_name, minimize),
        "diminishing_returns": detect_diminishing_returns(trajectory, minimize),
        "phase_transitions": detect_phase_transitions(trajectory),
    }
479
+
480
+
481
def main() -> None:
    """CLI entry point: run the analysis, save the report, print a summary."""
    parser = argparse.ArgumentParser(description="Long-term experiment trend analysis")
    parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
    parser.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
    parser.add_argument("--window", type=int, default=DEFAULT_WINDOW, help="Sliding window size")
    parser.add_argument("--json", action="store_true", help="Output raw JSON")
    opts = parser.parse_args()

    result = run_trend_analysis(opts.config, opts.log, opts.window)

    # Persist only successful analyses; error dicts are still printed below.
    if "error" not in result:
        saved_to = save_trend_report(result)
        print(f"Saved to {saved_to}", file=sys.stderr)

    if opts.json:
        print(json.dumps(result, indent=2, default=str))
    else:
        print(format_trend_report(result))


if __name__ == "__main__":
    main()