claude-turing 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +67 -3
  3. package/commands/explore.md +107 -0
  4. package/commands/reproduce.md +48 -0
  5. package/commands/seed.md +47 -0
  6. package/commands/suggest.md +68 -4
  7. package/commands/turing.md +6 -0
  8. package/package.json +1 -1
  9. package/src/claude-md.js +1 -0
  10. package/src/install.js +2 -2
  11. package/src/verify.js +3 -0
  12. package/templates/config.yaml +10 -0
  13. package/templates/program.md +5 -0
  14. package/templates/requirements.txt +4 -0
  15. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  16. package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
  17. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  18. package/templates/scripts/__pycache__/reproduce_experiment.cpython-314.pyc +0 -0
  19. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  20. package/templates/scripts/__pycache__/seed_runner.cpython-314.pyc +0 -0
  21. package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
  22. package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
  23. package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
  24. package/templates/scripts/generate_brief.py +85 -3
  25. package/templates/scripts/generate_model_card.py +25 -0
  26. package/templates/scripts/leaderboard.py +10 -0
  27. package/templates/scripts/manage_hypotheses.py +2 -2
  28. package/templates/scripts/reproduce_experiment.py +548 -0
  29. package/templates/scripts/scaffold.py +5 -0
  30. package/templates/scripts/seed_runner.py +414 -0
  31. package/templates/scripts/show_metrics.py +17 -0
  32. package/templates/scripts/treequest_suggest.py +520 -0
  33. package/templates/scripts/turing_io.py +36 -0
  34. package/templates/scripts/update_state.py +13 -0
@@ -25,6 +25,7 @@ import yaml
25
25
 
26
26
  from scripts.cost_frontier import compute_pareto_frontier, load_cost_data, _format_seconds
27
27
  from scripts.turing_io import load_config, load_experiments, load_hypotheses
28
+ from scripts.seed_runner import CV_THRESHOLD
28
29
 
29
30
 
30
31
  def compute_campaign_summary(experiments: list[dict]) -> dict:
@@ -211,6 +212,40 @@ def detect_environment_drift(experiments: list[dict]) -> list[str]:
211
212
  return warnings
212
213
 
213
214
 
215
+ def load_seed_studies(seed_dir: str = "experiments/seed_studies") -> list[dict]:
216
+ """Load all seed study results from YAML files."""
217
+ path = Path(seed_dir)
218
+ if not path.exists():
219
+ return []
220
+ studies = []
221
+ for f in sorted(path.glob("*-seeds.yaml")):
222
+ try:
223
+ with open(f) as fh:
224
+ study = yaml.safe_load(fh)
225
+ if study and isinstance(study, dict):
226
+ studies.append(study)
227
+ except (yaml.YAMLError, OSError):
228
+ continue
229
+ return studies
230
+
231
+
232
+ def load_reproductions(repro_dir: str = "experiments/reproductions") -> list[dict]:
233
+ """Load all reproduction reports from YAML files."""
234
+ path = Path(repro_dir)
235
+ if not path.exists():
236
+ return []
237
+ reports = []
238
+ for f in sorted(path.glob("*-repro.yaml")):
239
+ try:
240
+ with open(f) as fh:
241
+ report = yaml.safe_load(fh)
242
+ if report and isinstance(report, dict):
243
+ reports.append(report)
244
+ except (yaml.YAMLError, OSError):
245
+ continue
246
+ return reports
247
+
248
+
214
249
  def format_brief(
215
250
  campaign: dict,
216
251
  best: dict | None,
@@ -223,6 +258,8 @@ def format_brief(
223
258
  env_warnings: list[str] | None = None,
224
259
  cost_data: list | None = None,
225
260
  cost_frontier: list | None = None,
261
+ seed_studies: list[dict] | None = None,
262
+ reproductions: list[dict] | None = None,
226
263
  ) -> str:
227
264
  """Format the research briefing as markdown."""
228
265
  direction = "lower" if lower_is_better else "higher"
@@ -286,7 +323,8 @@ def format_brief(
286
323
  lines.append(f"**{len(queued)} queued:**")
287
324
  for h in queued:
288
325
  priority_marker = " (HIGH)" if h.get("priority") == "high" else ""
289
- source_marker = " [human]" if h.get("source") == "human" else ""
326
+ source = h.get("source", "")
327
+ source_marker = f" [{source}]" if source in ("human", "treequest", "literature") else ""
290
328
  lines.append(f"- {h['id']}: {h.get('description', '?')}{priority_marker}{source_marker}")
291
329
  else:
292
330
  lines.append("No queued hypotheses. Use `/turing:try` to inject ideas.")
@@ -360,6 +398,44 @@ def format_brief(
360
398
  f"The {pct:.1f}% improvement costs {ratio:.0f}x more compute.",
361
399
  ])
362
400
 
401
+ # Seed studies
402
+ if seed_studies:
403
+ lines.extend(["", "## Seed Studies", ""])
404
+ for study in seed_studies:
405
+ exp_id = study.get("experiment_id", "?")
406
+ sensitive = study.get("seed_sensitive", False)
407
+ status = "SEED-SENSITIVE" if sensitive else "STABLE"
408
+ lines.append(
409
+ f"- **{exp_id}:** {study.get('metric', metric)} = "
410
+ f"{study.get('mean', 0):.4f} +/- {study.get('std', 0):.4f} "
411
+ f"(CV={study.get('cv_percent', 0):.1f}%) — **{status}**"
412
+ )
413
+ if sensitive:
414
+ lines.append(
415
+ f" - 95% CI: [{study['ci_95'][0]:.4f}, {study['ci_95'][1]:.4f}] "
416
+ f"over {len(study.get('seeds_run', []))} seeds"
417
+ )
418
+ if any(s.get("seed_sensitive") for s in seed_studies):
419
+ lines.extend(["", "*Some results are seed-sensitive. Report distributions, not point estimates.*"])
420
+
421
+ # Reproduction reports
422
+ if reproductions:
423
+ lines.extend(["", "## Reproducibility", ""])
424
+ verdict_markers = {
425
+ "reproducible": "PASS",
426
+ "approximately_reproducible": "PASS (approx)",
427
+ "not_reproducible": "FAIL",
428
+ "environment_changed": "WARN (env)",
429
+ }
430
+ for report in reproductions:
431
+ exp_id = report.get("experiment_id", "?")
432
+ verdict = report.get("verdict", "unknown")
433
+ marker = verdict_markers.get(verdict, verdict)
434
+ lines.append(f"- **{exp_id}:** {marker} — {report.get('reason', 'N/A')}")
435
+ failed = [r for r in reproductions if r.get("verdict") in ("not_reproducible", "environment_changed")]
436
+ if failed:
437
+ lines.extend(["", f"*{len(failed)} experiment(s) failed reproducibility checks.*"])
438
+
363
439
  lines.extend([
364
440
  "",
365
441
  "## Recommendations",
@@ -387,9 +463,9 @@ def format_brief(
387
463
 
388
464
  # Check if hypotheses are exhausted
389
465
  if not queued:
390
- lines.append("- No hypotheses queued — inject ideas with `/turing:try`")
466
+ lines.append("- No hypotheses queued — inject ideas with `/turing:try` or explore with `/turing:explore`")
391
467
 
392
- lines.extend(["", "---", "", "*Use `/turing:try` to inject hypotheses. Use `/turing:train` to execute.*"])
468
+ lines.extend(["", "---", "", "*Use `/turing:try` to inject hypotheses, `/turing:explore` for tree search, `/turing:train` to execute.*"])
393
469
 
394
470
  return "\n".join(lines)
395
471
 
@@ -419,11 +495,17 @@ def generate_brief(
419
495
  cost_records = load_cost_data(log_path, metric)
420
496
  pareto = compute_pareto_frontier(cost_records, lower_is_better) if cost_records else []
421
497
 
498
+ # Load seed studies and reproduction reports
499
+ seed_studies = load_seed_studies()
500
+ reproductions = load_reproductions()
501
+
422
502
  return format_brief(
423
503
  campaign, best, trajectory, model_types, hypotheses,
424
504
  metric, lower_is_better, failures, env_warnings,
425
505
  cost_data=cost_records if cost_records else None,
426
506
  cost_frontier=pareto if cost_records else None,
507
+ seed_studies=seed_studies if seed_studies else None,
508
+ reproductions=reproductions if reproductions else None,
427
509
  )
428
510
 
429
511
 
@@ -243,6 +243,31 @@ def generate_card(
243
243
  else:
244
244
  lines.append("No experiments completed yet.")
245
245
 
246
+ # --- Seed Study ---
247
+ if best:
248
+ seed_study_path = Path("experiments/seed_studies") / f"{best.get('experiment_id', 'unknown')}-seeds.yaml"
249
+ if seed_study_path.exists():
250
+ import yaml
251
+ with open(seed_study_path) as f:
252
+ seed_study = yaml.safe_load(f) or {}
253
+ if seed_study and "mean" in seed_study:
254
+ sensitive = seed_study.get("seed_sensitive", False)
255
+ status = "SEED-SENSITIVE" if sensitive else "STABLE"
256
+ lines.extend([
257
+ "",
258
+ "### Seed Study",
259
+ "",
260
+ f"- **Status:** {status}",
261
+ f"- **{metric}:** {seed_study['mean']:.4f} +/- {seed_study.get('std', 0):.4f}",
262
+ ])
263
+ if "ci_95" in seed_study:
264
+ ci = seed_study["ci_95"]
265
+ lines.append(f"- **95% CI:** [{ci[0]:.4f}, {ci[1]:.4f}]")
266
+ lines.append(f"- **CV:** {seed_study.get('cv_percent', 0):.2f}%")
267
+ lines.append(f"- **Seeds tested:** {len(seed_study.get('seeds_run', []))}")
268
+ if sensitive:
269
+ lines.append("- *Result varies significantly across seeds. Report distribution, not point estimate.*")
270
+
246
271
  # --- Training History ---
247
272
  lines.extend([
248
273
  "",
@@ -503,6 +503,16 @@ def main() -> None:
503
503
  print()
504
504
  print(footer)
505
505
 
506
+ # Show seed study status for #1 if available
507
+ if ranked and args.fmt not in ("csv",):
508
+ from scripts.turing_io import load_seed_study
509
+ best_id = ranked[0].get("experiment_id")
510
+ if best_id:
511
+ study = load_seed_study(best_id)
512
+ if study and "mean" in study:
513
+ sensitive = "SEED-SENSITIVE" if study.get("seed_sensitive") else "STABLE"
514
+ print(f"\n Seed study: {metric}={study['mean']:.4f}±{study.get('std',0):.4f} ({sensitive})")
515
+
506
516
 
507
517
  if __name__ == "__main__":
508
518
  main()
@@ -277,7 +277,7 @@ def get_next_hypothesis(queue_path: str) -> dict | None:
277
277
  return None
278
278
 
279
279
  priority_order = {"high": 0, "medium": 1, "low": 2}
280
- source_order = {"human": 0, "literature": 1, "taxonomy": 2, "agent": 3}
280
+ source_order = {"human": 0, "literature": 1, "treequest": 2, "taxonomy": 3, "agent": 4}
281
281
 
282
282
  queued.sort(key=lambda h: (
283
283
  priority_order.get(h.get("priority", "medium"), 1),
@@ -376,7 +376,7 @@ def main() -> None:
376
376
  add_parser.add_argument("description", nargs="?", default=None, help="What to try and why")
377
377
  add_parser.add_argument("--archetype", default=None, help="Expand from archetype (e.g., model_comparison)")
378
378
  add_parser.add_argument("--priority", default="high", choices=sorted(VALID_PRIORITIES))
379
- add_parser.add_argument("--source", default="human", choices=["human", "agent", "literature", "taxonomy"])
379
+ add_parser.add_argument("--source", default="human", choices=["human", "agent", "literature", "treequest", "taxonomy"])
380
380
  add_parser.add_argument("--parent", default=None, help="Parent experiment ID")
381
381
  add_parser.add_argument("--parent-hyp", default=None, help="Parent hypothesis ID")
382
382
  add_parser.add_argument("--family", default=None, help="Experiment family (e.g., optimizer-sweep)")