claude-turing 3.4.0 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +9 -2
  3. package/commands/annotate.md +23 -0
  4. package/commands/archive.md +23 -0
  5. package/commands/cite.md +23 -0
  6. package/commands/flashback.md +22 -0
  7. package/commands/present.md +23 -0
  8. package/commands/replay.md +23 -0
  9. package/commands/search.md +22 -0
  10. package/commands/template.md +22 -0
  11. package/commands/trend.md +21 -0
  12. package/commands/turing.md +14 -0
  13. package/package.json +1 -1
  14. package/src/install.js +1 -0
  15. package/src/verify.js +7 -0
  16. package/templates/scripts/__pycache__/experiment_annotations.cpython-314.pyc +0 -0
  17. package/templates/scripts/__pycache__/experiment_archive.cpython-314.pyc +0 -0
  18. package/templates/scripts/__pycache__/experiment_replay.cpython-314.pyc +0 -0
  19. package/templates/scripts/__pycache__/experiment_search.cpython-314.pyc +0 -0
  20. package/templates/scripts/__pycache__/experiment_templates.cpython-314.pyc +0 -0
  21. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  22. package/templates/scripts/__pycache__/session_flashback.cpython-314.pyc +0 -0
  23. package/templates/scripts/__pycache__/trend_analysis.cpython-314.pyc +0 -0
  24. package/templates/scripts/citation_manager.py +436 -0
  25. package/templates/scripts/experiment_annotations.py +392 -0
  26. package/templates/scripts/experiment_archive.py +534 -0
  27. package/templates/scripts/experiment_replay.py +592 -0
  28. package/templates/scripts/experiment_search.py +451 -0
  29. package/templates/scripts/experiment_templates.py +501 -0
  30. package/templates/scripts/generate_changelog.py +464 -0
  31. package/templates/scripts/generate_figures.py +597 -0
  32. package/templates/scripts/scaffold.py +12 -0
  33. package/templates/scripts/session_flashback.py +461 -0
  34. package/templates/scripts/trend_analysis.py +503 -0
@@ -0,0 +1,592 @@
+#!/usr/bin/env python3
+"""Re-run historical experiments with current infrastructure.
+
+Read an old experiment's config from log.jsonl, plan a replay with
+current code, data, and preprocessing, then compare original vs
+replayed metrics. Answers the question: "would this old experiment
+perform better/worse with today's pipeline?"
+
+Usage:
+    python scripts/experiment_replay.py exp-042
+    python scripts/experiment_replay.py exp-042 --with-current-data
+    python scripts/experiment_replay.py exp-042 --with-current-preprocessing
+    python scripts/experiment_replay.py exp-042 --dry-run
+    python scripts/experiment_replay.py --list
+    python scripts/experiment_replay.py --json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import yaml
+
+from scripts.turing_io import load_config, load_experiments
+
+DEFAULT_LOG_PATH = "experiments/log.jsonl"
+DEFAULT_REPLAY_DIR = "experiments/replays"
+
+
+# --- Replay Planning ---
+
+
+def find_experiment(experiments: list[dict], experiment_id: str) -> dict | None:
+    """Find an experiment by ID in the log."""
+    for exp in experiments:
+        if exp.get("experiment_id") == experiment_id:
+            return exp
+    return None
+
+
+def plan_replay(
+    original: dict,
+    config: dict,
+    with_current_data: bool = False,
+    with_current_preprocessing: bool = False,
+) -> dict:
+    """Plan a replay of an original experiment.
+
+    Determines what changes between original and current infrastructure,
+    and constructs a replay configuration.
+
+    Args:
+        original: Original experiment dict from log.
+        config: Current project config.
+        with_current_data: Use current data instead of original data path.
+        with_current_preprocessing: Use current preprocessing pipeline.
+
+    Returns:
+        Replay plan dict with config, changes, and warnings.
+    """
+    original_config = original.get("config", {})
+    replay_config = dict(original_config)
+    changes = []
+    warnings = []
+
+    # Data source
+    if with_current_data:
+        current_data = config.get("data", {}).get("path", "")
+        original_data = original_config.get("data_path", "") or original_config.get("data", {}).get("path", "")
+        if current_data and current_data != original_data:
+            replay_config["data_path"] = current_data
+            if isinstance(replay_config.get("data"), dict):
+                replay_config["data"]["path"] = current_data
+            changes.append({
+                "field": "data_path",
+                "original": original_data,
+                "replay": current_data,
+                "reason": "Using current data (--with-current-data)",
+            })
+        elif not current_data:
+            warnings.append("No data path in current config — using original data path")
+
+    # Preprocessing
+    if with_current_preprocessing:
+        current_preproc = config.get("preprocessing", {})
+        original_preproc = original_config.get("preprocessing", {})
+        if current_preproc and current_preproc != original_preproc:
+            replay_config["preprocessing"] = current_preproc
+            changes.append({
+                "field": "preprocessing",
+                "original": original_preproc,
+                "replay": current_preproc,
+                "reason": "Using current preprocessing (--with-current-preprocessing)",
+            })
+
+    # Check for missing dependencies or features
+    model_type = original_config.get("model_type", "")
+    if model_type:
+        # Check if model type still exists in current codebase
+        train_path = Path("train.py")
+        if train_path.exists():
+            train_content = train_path.read_text()
+            if model_type not in train_content:
+                warnings.append(
+                    f"Model type '{model_type}' not found in current train.py — "
+                    f"replay may fail"
+                )
+
+    # Seed handling — use same seed for reproducibility
+    seed = original_config.get("seed", original.get("seed"))
+    if seed is not None:
+        replay_config["seed"] = seed
+    else:
+        replay_config["seed"] = 42
+        warnings.append("No seed in original experiment — defaulting to 42")
+
+    return {
+        "original_id": original.get("experiment_id"),
+        "original_timestamp": original.get("timestamp"),
+        "original_metrics": original.get("metrics", {}),
+        "replay_config": replay_config,
+        "changes": changes,
+        "warnings": warnings,
+        "with_current_data": with_current_data,
+        "with_current_preprocessing": with_current_preprocessing,
+    }
+
+
+# --- Replay Execution ---
+
+
+def execute_replay(
+    plan: dict,
+    timeout: int = 600,
+) -> dict:
+    """Execute a replay by running train.py with the replay config.
+
+    Args:
+        plan: Replay plan from plan_replay.
+        timeout: Max seconds for training.
+
+    Returns:
+        Execution result with replay metrics.
+    """
+    replay_config = plan.get("replay_config", {})
+    started_at = datetime.now(timezone.utc).isoformat()
+
+    # Write temporary config
+    tmp_config = Path("experiments/replays/.replay-config.yaml")
+    tmp_config.parent.mkdir(parents=True, exist_ok=True)
+    with open(tmp_config, "w") as f:
+        yaml.dump(replay_config, f, default_flow_style=False, sort_keys=False)
+
+    # Run training
+    cmd = ["python", "train.py", "--config", str(tmp_config)]
+    seed = replay_config.get("seed")
+    if seed is not None:
+        cmd.extend(["--seed", str(seed)])
+
+    try:
+        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
+    except subprocess.TimeoutExpired:
+        return {
+            "status": "timeout",
+            "started_at": started_at,
+            "error": f"Training exceeded {timeout}s timeout",
+        }
+    except FileNotFoundError:
+        return {
+            "status": "error",
+            "started_at": started_at,
+            "error": "train.py not found",
+        }
+
+    completed_at = datetime.now(timezone.utc).isoformat()
+
+    if proc.returncode != 0:
+        error_snippet = (proc.stderr + proc.stdout)[-500:]
+        return {
+            "status": "failed",
+            "started_at": started_at,
+            "completed_at": completed_at,
+            "error": _classify_error(proc.stderr + proc.stdout),
+            "stderr_tail": error_snippet,
+        }
+
+    # Parse metrics from stdout
+    metrics = _parse_metrics(proc.stdout)
+
+    # Clean up temp config
+    try:
+        tmp_config.unlink()
+    except OSError:
+        pass
+
+    return {
+        "status": "completed",
+        "started_at": started_at,
+        "completed_at": completed_at,
+        "metrics": metrics,
+    }
+
+
+def _parse_metrics(stdout: str) -> dict:
+    """Parse metrics from training output."""
+    metrics = {}
+    in_block = False
+    for line in stdout.splitlines():
+        line = line.strip()
+        if line == "---":
+            if in_block:
+                break
+            in_block = True
+            continue
+        if in_block and ":" in line:
+            key, value = line.split(":", 1)
+            try:
+                metrics[key.strip()] = float(value.strip())
+            except ValueError:
+                metrics[key.strip()] = value.strip()
+    return metrics
+
+
+def _classify_error(output: str) -> str:
+    """Classify error from output text."""
+    output_lower = output.lower()
+    if "cuda out of memory" in output_lower or "memoryerror" in output_lower:
+        return "oom"
+    if "nan" in output_lower and "loss" in output_lower:
+        return "nan_loss"
+    if "modulenotfounderror" in output_lower or "importerror" in output_lower:
+        return "import_error"
+    if "filenotfounderror" in output_lower:
+        return "file_not_found"
+    return "unknown"
+
+
+# --- Comparison ---
+
+
+def compare_metrics(
+    original_metrics: dict,
+    replay_metrics: dict,
+    primary_metric: str = "accuracy",
+    lower_is_better: bool = False,
+) -> dict:
+    """Compare original vs replayed metrics.
+
+    Args:
+        original_metrics: Metrics from the original experiment.
+        replay_metrics: Metrics from the replay.
+        primary_metric: Primary metric name.
+        lower_is_better: Whether lower values are better.
+
+    Returns:
+        Comparison dict with deltas and verdict.
+    """
+    comparisons = {}
+    all_metrics = set(list(original_metrics.keys()) + list(replay_metrics.keys()))
+
+    for metric in sorted(all_metrics):
+        orig = original_metrics.get(metric)
+        replay = replay_metrics.get(metric)
+
+        entry: dict = {"original": orig, "replay": replay}
+
+        if orig is not None and replay is not None:
+            try:
+                orig_f = float(orig)
+                replay_f = float(replay)
+                delta = replay_f - orig_f
+                pct = (delta / abs(orig_f) * 100) if orig_f != 0 else 0
+                entry["delta"] = round(delta, 6)
+                entry["delta_pct"] = round(pct, 2)
+
+                lib = lower_is_better if metric == primary_metric else (
+                    metric in {"loss", "mse", "rmse", "mae", "error_rate",
+                               "train_seconds", "latency", "latency_ms"}
+                )
+                if lib:
+                    entry["improved"] = delta < 0
+                else:
+                    entry["improved"] = delta > 0
+            except (ValueError, TypeError):
+                pass
+
+        comparisons[metric] = entry
+
+    # Overall verdict
+    primary = comparisons.get(primary_metric, {})
+    if primary.get("improved") is True:
+        verdict = "improved"
+    elif primary.get("improved") is False:
+        verdict = "regressed"
+    else:
+        verdict = "inconclusive"
+
+    return {
+        "primary_metric": primary_metric,
+        "verdict": verdict,
+        "comparisons": comparisons,
+    }
+
+
+# --- Report ---
+
+
+def format_replay_report(report: dict) -> str:
+    """Format replay result as a readable markdown report."""
+    if "error" in report:
+        return f"ERROR: {report['error']}"
+
+    lines = [
+        "# Experiment Replay",
+        "",
+        f"*{report.get('timestamp', '?')[:19]} UTC*",
+        "",
+    ]
+
+    plan = report.get("plan", {})
+    lines.extend([
+        "## Original Experiment",
+        "",
+        f"- **ID:** {plan.get('original_id', '?')}",
+        f"- **Timestamp:** {plan.get('original_timestamp', '?')[:19]}",
+    ])
+
+    orig_metrics = plan.get("original_metrics", {})
+    if orig_metrics:
+        lines.append("- **Metrics:** " + ", ".join(
+            f"{k}={v:.4f}" if isinstance(v, float) else f"{k}={v}"
+            for k, v in orig_metrics.items()
+        ))
+    lines.append("")
+
+    # Changes
+    changes = plan.get("changes", [])
+    if changes:
+        lines.extend(["## Changes from Original", ""])
+        for ch in changes:
+            lines.append(f"- **{ch['field']}**: {ch['reason']}")
+        lines.append("")
+
+    # Warnings
+    warnings = plan.get("warnings", [])
+    if warnings:
+        lines.extend(["## Warnings", ""])
+        for w in warnings:
+            lines.append(f"- {w}")
+        lines.append("")
+
+    # Execution result
+    execution = report.get("execution", {})
+    status = execution.get("status", "not_run")
+    lines.extend([
+        "## Replay Result",
+        "",
+        f"**Status:** {status}",
+    ])
+
+    if status == "completed":
+        # Comparison
+        comparison = report.get("comparison", {})
+        verdict = comparison.get("verdict", "?")
+        primary = comparison.get("primary_metric", "?")
+        lines.extend([
+            f"**Verdict:** {verdict} (primary: {primary})",
+            "",
+            "| Metric | Original | Replay | Delta | Change |",
+            "|--------|----------|--------|-------|--------|",
+        ])
+
+        for metric, data in comparison.get("comparisons", {}).items():
+            orig = data.get("original")
+            replay = data.get("replay")
+            orig_str = f"{orig:.4f}" if isinstance(orig, float) else str(orig or "—")
+            replay_str = f"{replay:.4f}" if isinstance(replay, float) else str(replay or "—")
+            delta = data.get("delta_pct")
+            delta_str = f"{delta:+.2f}%" if delta is not None else "—"
+            improved = data.get("improved")
+            if improved is True:
+                change = "improved"
+            elif improved is False:
+                change = "regressed"
+            else:
+                change = "—"
+            lines.append(f"| {metric} | {orig_str} | {replay_str} | {delta_str} | {change} |")
+    elif status in ("failed", "timeout", "error"):
+        lines.append(f"**Error:** {execution.get('error', 'unknown')}")
+
+    lines.extend(["", "---"])
+    return "\n".join(lines)
+
+
+def save_replay_report(report: dict, replay_dir: str = DEFAULT_REPLAY_DIR) -> Path:
+    """Save replay report to YAML."""
+    p = Path(replay_dir)
+    p.mkdir(parents=True, exist_ok=True)
+    exp_id = report.get("plan", {}).get("original_id", "unknown")
+    ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
+    out = p / f"{exp_id}-replay-{ts}.yaml"
+    with open(out, "w") as f:
+        yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+    return out
+
+
+def list_replays(replay_dir: str = DEFAULT_REPLAY_DIR) -> list[dict]:
+    """List all saved replay reports."""
+    p = Path(replay_dir)
+    if not p.exists():
+        return []
+
+    replays = []
+    for path in sorted(p.glob("*-replay-*.yaml")):
+        try:
+            with open(path) as f:
+                data = yaml.safe_load(f)
+            if not isinstance(data, dict):
+                continue
+            plan = data.get("plan", {})
+            execution = data.get("execution", {})
+            comparison = data.get("comparison", {})
+            replays.append({
+                "file": path.name,
+                "original_id": plan.get("original_id"),
+                "timestamp": data.get("timestamp", ""),
+                "status": execution.get("status", "?"),
+                "verdict": comparison.get("verdict", "?"),
+            })
+        except (yaml.YAMLError, OSError):
+            continue
+
+    return replays
+
+
+# --- Orchestration ---
+
+
+def run_replay(
+    experiment_id: str | None = None,
+    with_current_data: bool = False,
+    with_current_preprocessing: bool = False,
+    dry_run: bool = False,
+    list_mode: bool = False,
+    timeout: int = 600,
+    log_path: str = DEFAULT_LOG_PATH,
+    config_path: str = "config.yaml",
+    replay_dir: str = DEFAULT_REPLAY_DIR,
+) -> dict:
+    """Run experiment replay workflow.
+
+    Args:
+        experiment_id: Experiment to replay.
+        with_current_data: Use current data.
+        with_current_preprocessing: Use current preprocessing.
+        dry_run: Plan only, don't execute.
+        list_mode: List previous replays.
+        timeout: Training timeout in seconds.
+        log_path: Path to experiment log.
+        config_path: Path to config.yaml.
+        replay_dir: Directory for replay reports.
+
+    Returns:
+        Replay result dict.
+    """
+    timestamp = datetime.now(timezone.utc).isoformat()
+
+    if list_mode:
+        replays = list_replays(replay_dir)
+        return {
+            "timestamp": timestamp,
+            "action": "list",
+            "count": len(replays),
+            "replays": replays,
+        }
+
+    if not experiment_id:
+        return {"error": "Experiment ID required. Use --list to see past replays."}
+
+    config = load_config(config_path)
+    experiments = load_experiments(log_path)
+
+    if not experiments:
+        return {"timestamp": timestamp, "error": "No experiments found"}
+
+    original = find_experiment(experiments, experiment_id)
+    if original is None:
+        return {"timestamp": timestamp, "error": f"Experiment '{experiment_id}' not found"}
+
+    eval_cfg = config.get("evaluation", {})
+    primary_metric = eval_cfg.get("primary_metric", "accuracy")
+    lower_is_better = eval_cfg.get("lower_is_better", False)
+
+    # Plan
+    plan = plan_replay(original, config, with_current_data, with_current_preprocessing)
+
+    report: dict = {
+        "timestamp": timestamp,
+        "plan": plan,
+    }
+
+    if dry_run:
+        report["execution"] = {"status": "dry_run"}
+        saved = save_replay_report(report, replay_dir)
+        report["saved_to"] = str(saved)
+        return report
+
+    # Execute
+    execution = execute_replay(plan, timeout=timeout)
+    report["execution"] = execution
+
+    # Compare if completed
+    if execution.get("status") == "completed":
+        comparison = compare_metrics(
+            plan.get("original_metrics", {}),
+            execution.get("metrics", {}),
+            primary_metric=primary_metric,
+            lower_is_better=lower_is_better,
+        )
+        report["comparison"] = comparison
+
+    # Save
+    saved = save_replay_report(report, replay_dir)
+    report["saved_to"] = str(saved)
+
+    return report
+
+
+def main() -> None:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(description="Re-run historical experiments")
+    parser.add_argument("experiment_id", nargs="?", default=None,
+                        help="Experiment ID to replay")
+    parser.add_argument("--with-current-data", action="store_true",
+                        help="Use current data instead of original")
+    parser.add_argument("--with-current-preprocessing", action="store_true",
+                        help="Use current preprocessing pipeline")
+    parser.add_argument("--dry-run", action="store_true",
+                        help="Plan replay without executing")
+    parser.add_argument("--list", dest="list_mode", action="store_true",
+                        help="List previous replays")
+    parser.add_argument("--timeout", type=int, default=600,
+                        help="Training timeout in seconds (default: 600)")
+    parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
+    parser.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
+    parser.add_argument("--replay-dir", default=DEFAULT_REPLAY_DIR,
+                        help="Directory for replay reports")
+    parser.add_argument("--json", action="store_true", help="Output raw JSON")
+    args = parser.parse_args()
+
+    report = run_replay(
+        experiment_id=args.experiment_id,
+        with_current_data=args.with_current_data,
+        with_current_preprocessing=args.with_current_preprocessing,
+        dry_run=args.dry_run,
+        list_mode=args.list_mode,
+        timeout=args.timeout,
+        log_path=args.log,
+        config_path=args.config,
+        replay_dir=args.replay_dir,
+    )
+
+    if args.json:
+        print(json.dumps(report, indent=2, default=str))
+    else:
+        if "error" in report:
+            print(f"ERROR: {report['error']}", file=sys.stderr)
+            sys.exit(1)
+
+        if report.get("action") == "list":
+            replays = report.get("replays", [])
+            if not replays:
+                print("No replays found.")
+            else:
+                print("# Experiment Replays")
+                print()
+                print("| Original | Date | Status | Verdict |")
+                print("|----------|------|--------|---------|")
+                for r in replays:
+                    print(f"| {r['original_id']} | {r['timestamp'][:10]} "
+                          f"| {r['status']} | {r['verdict']} |")
+        else:
+            print(format_replay_report(report))
+
+
+if __name__ == "__main__":
+    main()
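
The new script reads only a handful of fields from each experiments/log.jsonl record: experiment_id, timestamp, config (with optional seed, data_path, model_type, data.path, and preprocessing keys), and metrics. As a rough sketch of the record shape it plans and compares against, assuming field names taken from the script above and invented illustrative values (this snippet is not part of the package):

```python
# Illustrative sketch only — not package content. One experiments/log.jsonl record
# with the fields experiment_replay.py looks up; values here are made up.
import json

record = {
    "experiment_id": "exp-042",                    # matched by find_experiment()
    "timestamp": "2025-01-15T10:30:00+00:00",      # shown in the replay report
    "config": {                                    # copied into the replay config
        "model_type": "baseline",                  # checked against train.py
        "seed": 7,                                 # reused for reproducibility
        "data_path": "data/train.csv",             # swapped by --with-current-data
        "preprocessing": {"normalize": True},      # swapped by --with-current-preprocessing
    },
    "metrics": {"accuracy": 0.87, "loss": 0.41},   # compared against replay metrics
}

# log.jsonl stores one such record per line:
print(json.dumps(record))
```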