claude-turing 1.1.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +67 -3
- package/commands/explore.md +107 -0
- package/commands/reproduce.md +48 -0
- package/commands/seed.md +47 -0
- package/commands/suggest.md +68 -4
- package/commands/turing.md +6 -0
- package/package.json +1 -1
- package/src/claude-md.js +1 -0
- package/src/install.js +2 -2
- package/src/verify.js +3 -0
- package/templates/config.yaml +10 -0
- package/templates/program.md +5 -0
- package/templates/requirements.txt +4 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/reproduce_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/seed_runner.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
- package/templates/scripts/generate_brief.py +85 -3
- package/templates/scripts/generate_model_card.py +25 -0
- package/templates/scripts/leaderboard.py +10 -0
- package/templates/scripts/manage_hypotheses.py +2 -2
- package/templates/scripts/reproduce_experiment.py +548 -0
- package/templates/scripts/scaffold.py +5 -0
- package/templates/scripts/seed_runner.py +414 -0
- package/templates/scripts/show_metrics.py +17 -0
- package/templates/scripts/treequest_suggest.py +520 -0
- package/templates/scripts/turing_io.py +36 -0
- package/templates/scripts/update_state.py +13 -0
|
@@ -25,6 +25,7 @@ import yaml
|
|
|
25
25
|
|
|
26
26
|
from scripts.cost_frontier import compute_pareto_frontier, load_cost_data, _format_seconds
|
|
27
27
|
from scripts.turing_io import load_config, load_experiments, load_hypotheses
|
|
28
|
+
from scripts.seed_runner import CV_THRESHOLD
|
|
28
29
|
|
|
29
30
|
|
|
30
31
|
def compute_campaign_summary(experiments: list[dict]) -> dict:
|
|
@@ -211,6 +212,40 @@ def detect_environment_drift(experiments: list[dict]) -> list[str]:
|
|
|
211
212
|
return warnings
|
|
212
213
|
|
|
213
214
|
|
|
215
|
+
def load_seed_studies(seed_dir: str = "experiments/seed_studies") -> list[dict]:
|
|
216
|
+
"""Load all seed study results from YAML files."""
|
|
217
|
+
path = Path(seed_dir)
|
|
218
|
+
if not path.exists():
|
|
219
|
+
return []
|
|
220
|
+
studies = []
|
|
221
|
+
for f in sorted(path.glob("*-seeds.yaml")):
|
|
222
|
+
try:
|
|
223
|
+
with open(f) as fh:
|
|
224
|
+
study = yaml.safe_load(fh)
|
|
225
|
+
if study and isinstance(study, dict):
|
|
226
|
+
studies.append(study)
|
|
227
|
+
except (yaml.YAMLError, OSError):
|
|
228
|
+
continue
|
|
229
|
+
return studies
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def load_reproductions(repro_dir: str = "experiments/reproductions") -> list[dict]:
|
|
233
|
+
"""Load all reproduction reports from YAML files."""
|
|
234
|
+
path = Path(repro_dir)
|
|
235
|
+
if not path.exists():
|
|
236
|
+
return []
|
|
237
|
+
reports = []
|
|
238
|
+
for f in sorted(path.glob("*-repro.yaml")):
|
|
239
|
+
try:
|
|
240
|
+
with open(f) as fh:
|
|
241
|
+
report = yaml.safe_load(fh)
|
|
242
|
+
if report and isinstance(report, dict):
|
|
243
|
+
reports.append(report)
|
|
244
|
+
except (yaml.YAMLError, OSError):
|
|
245
|
+
continue
|
|
246
|
+
return reports
|
|
247
|
+
|
|
248
|
+
|
|
214
249
|
def format_brief(
|
|
215
250
|
campaign: dict,
|
|
216
251
|
best: dict | None,
|
|
@@ -223,6 +258,8 @@ def format_brief(
|
|
|
223
258
|
env_warnings: list[str] | None = None,
|
|
224
259
|
cost_data: list | None = None,
|
|
225
260
|
cost_frontier: list | None = None,
|
|
261
|
+
seed_studies: list[dict] | None = None,
|
|
262
|
+
reproductions: list[dict] | None = None,
|
|
226
263
|
) -> str:
|
|
227
264
|
"""Format the research briefing as markdown."""
|
|
228
265
|
direction = "lower" if lower_is_better else "higher"
|
|
@@ -286,7 +323,8 @@ def format_brief(
|
|
|
286
323
|
lines.append(f"**{len(queued)} queued:**")
|
|
287
324
|
for h in queued:
|
|
288
325
|
priority_marker = " (HIGH)" if h.get("priority") == "high" else ""
|
|
289
|
-
|
|
326
|
+
source = h.get("source", "")
|
|
327
|
+
source_marker = f" [{source}]" if source in ("human", "treequest", "literature") else ""
|
|
290
328
|
lines.append(f"- {h['id']}: {h.get('description', '?')}{priority_marker}{source_marker}")
|
|
291
329
|
else:
|
|
292
330
|
lines.append("No queued hypotheses. Use `/turing:try` to inject ideas.")
|
|
@@ -360,6 +398,44 @@ def format_brief(
|
|
|
360
398
|
f"The {pct:.1f}% improvement costs {ratio:.0f}x more compute.",
|
|
361
399
|
])
|
|
362
400
|
|
|
401
|
+
# Seed studies
|
|
402
|
+
if seed_studies:
|
|
403
|
+
lines.extend(["", "## Seed Studies", ""])
|
|
404
|
+
for study in seed_studies:
|
|
405
|
+
exp_id = study.get("experiment_id", "?")
|
|
406
|
+
sensitive = study.get("seed_sensitive", False)
|
|
407
|
+
status = "SEED-SENSITIVE" if sensitive else "STABLE"
|
|
408
|
+
lines.append(
|
|
409
|
+
f"- **{exp_id}:** {study.get('metric', metric)} = "
|
|
410
|
+
f"{study.get('mean', 0):.4f} +/- {study.get('std', 0):.4f} "
|
|
411
|
+
f"(CV={study.get('cv_percent', 0):.1f}%) — **{status}**"
|
|
412
|
+
)
|
|
413
|
+
if sensitive:
|
|
414
|
+
lines.append(
|
|
415
|
+
f" - 95% CI: [{study['ci_95'][0]:.4f}, {study['ci_95'][1]:.4f}] "
|
|
416
|
+
f"over {len(study.get('seeds_run', []))} seeds"
|
|
417
|
+
)
|
|
418
|
+
if any(s.get("seed_sensitive") for s in seed_studies):
|
|
419
|
+
lines.extend(["", "*Some results are seed-sensitive. Report distributions, not point estimates.*"])
|
|
420
|
+
|
|
421
|
+
# Reproduction reports
|
|
422
|
+
if reproductions:
|
|
423
|
+
lines.extend(["", "## Reproducibility", ""])
|
|
424
|
+
verdict_markers = {
|
|
425
|
+
"reproducible": "PASS",
|
|
426
|
+
"approximately_reproducible": "PASS (approx)",
|
|
427
|
+
"not_reproducible": "FAIL",
|
|
428
|
+
"environment_changed": "WARN (env)",
|
|
429
|
+
}
|
|
430
|
+
for report in reproductions:
|
|
431
|
+
exp_id = report.get("experiment_id", "?")
|
|
432
|
+
verdict = report.get("verdict", "unknown")
|
|
433
|
+
marker = verdict_markers.get(verdict, verdict)
|
|
434
|
+
lines.append(f"- **{exp_id}:** {marker} — {report.get('reason', 'N/A')}")
|
|
435
|
+
failed = [r for r in reproductions if r.get("verdict") in ("not_reproducible", "environment_changed")]
|
|
436
|
+
if failed:
|
|
437
|
+
lines.extend(["", f"*{len(failed)} experiment(s) failed reproducibility checks.*"])
|
|
438
|
+
|
|
363
439
|
lines.extend([
|
|
364
440
|
"",
|
|
365
441
|
"## Recommendations",
|
|
@@ -387,9 +463,9 @@ def format_brief(
|
|
|
387
463
|
|
|
388
464
|
# Check if hypotheses are exhausted
|
|
389
465
|
if not queued:
|
|
390
|
-
lines.append("- No hypotheses queued — inject ideas with `/turing:try`")
|
|
466
|
+
lines.append("- No hypotheses queued — inject ideas with `/turing:try` or explore with `/turing:explore`")
|
|
391
467
|
|
|
392
|
-
lines.extend(["", "---", "", "*Use `/turing:try` to inject hypotheses
|
|
468
|
+
lines.extend(["", "---", "", "*Use `/turing:try` to inject hypotheses, `/turing:explore` for tree search, `/turing:train` to execute.*"])
|
|
393
469
|
|
|
394
470
|
return "\n".join(lines)
|
|
395
471
|
|
|
@@ -419,11 +495,17 @@ def generate_brief(
|
|
|
419
495
|
cost_records = load_cost_data(log_path, metric)
|
|
420
496
|
pareto = compute_pareto_frontier(cost_records, lower_is_better) if cost_records else []
|
|
421
497
|
|
|
498
|
+
# Load seed studies and reproduction reports
|
|
499
|
+
seed_studies = load_seed_studies()
|
|
500
|
+
reproductions = load_reproductions()
|
|
501
|
+
|
|
422
502
|
return format_brief(
|
|
423
503
|
campaign, best, trajectory, model_types, hypotheses,
|
|
424
504
|
metric, lower_is_better, failures, env_warnings,
|
|
425
505
|
cost_data=cost_records if cost_records else None,
|
|
426
506
|
cost_frontier=pareto if cost_records else None,
|
|
507
|
+
seed_studies=seed_studies if seed_studies else None,
|
|
508
|
+
reproductions=reproductions if reproductions else None,
|
|
427
509
|
)
|
|
428
510
|
|
|
429
511
|
|
|
@@ -243,6 +243,31 @@ def generate_card(
|
|
|
243
243
|
else:
|
|
244
244
|
lines.append("No experiments completed yet.")
|
|
245
245
|
|
|
246
|
+
# --- Seed Study ---
|
|
247
|
+
if best:
|
|
248
|
+
seed_study_path = Path("experiments/seed_studies") / f"{best.get('experiment_id', 'unknown')}-seeds.yaml"
|
|
249
|
+
if seed_study_path.exists():
|
|
250
|
+
import yaml
|
|
251
|
+
with open(seed_study_path) as f:
|
|
252
|
+
seed_study = yaml.safe_load(f) or {}
|
|
253
|
+
if seed_study and "mean" in seed_study:
|
|
254
|
+
sensitive = seed_study.get("seed_sensitive", False)
|
|
255
|
+
status = "SEED-SENSITIVE" if sensitive else "STABLE"
|
|
256
|
+
lines.extend([
|
|
257
|
+
"",
|
|
258
|
+
"### Seed Study",
|
|
259
|
+
"",
|
|
260
|
+
f"- **Status:** {status}",
|
|
261
|
+
f"- **{metric}:** {seed_study['mean']:.4f} +/- {seed_study.get('std', 0):.4f}",
|
|
262
|
+
])
|
|
263
|
+
if "ci_95" in seed_study:
|
|
264
|
+
ci = seed_study["ci_95"]
|
|
265
|
+
lines.append(f"- **95% CI:** [{ci[0]:.4f}, {ci[1]:.4f}]")
|
|
266
|
+
lines.append(f"- **CV:** {seed_study.get('cv_percent', 0):.2f}%")
|
|
267
|
+
lines.append(f"- **Seeds tested:** {len(seed_study.get('seeds_run', []))}")
|
|
268
|
+
if sensitive:
|
|
269
|
+
lines.append("- *Result varies significantly across seeds. Report distribution, not point estimate.*")
|
|
270
|
+
|
|
246
271
|
# --- Training History ---
|
|
247
272
|
lines.extend([
|
|
248
273
|
"",
|
|
@@ -503,6 +503,16 @@ def main() -> None:
|
|
|
503
503
|
print()
|
|
504
504
|
print(footer)
|
|
505
505
|
|
|
506
|
+
# Show seed study status for #1 if available
|
|
507
|
+
if ranked and args.fmt not in ("csv",):
|
|
508
|
+
from scripts.turing_io import load_seed_study
|
|
509
|
+
best_id = ranked[0].get("experiment_id")
|
|
510
|
+
if best_id:
|
|
511
|
+
study = load_seed_study(best_id)
|
|
512
|
+
if study and "mean" in study:
|
|
513
|
+
sensitive = "SEED-SENSITIVE" if study.get("seed_sensitive") else "STABLE"
|
|
514
|
+
print(f"\n Seed study: {metric}={study['mean']:.4f}±{study.get('std',0):.4f} ({sensitive})")
|
|
515
|
+
|
|
506
516
|
|
|
507
517
|
if __name__ == "__main__":
|
|
508
518
|
main()
|
|
@@ -277,7 +277,7 @@ def get_next_hypothesis(queue_path: str) -> dict | None:
|
|
|
277
277
|
return None
|
|
278
278
|
|
|
279
279
|
priority_order = {"high": 0, "medium": 1, "low": 2}
|
|
280
|
-
source_order = {"human": 0, "literature": 1, "taxonomy": 2, "agent": 3}
|
|
280
|
+
source_order = {"human": 0, "literature": 1, "treequest": 2, "taxonomy": 3, "agent": 4}
|
|
281
281
|
|
|
282
282
|
queued.sort(key=lambda h: (
|
|
283
283
|
priority_order.get(h.get("priority", "medium"), 1),
|
|
@@ -376,7 +376,7 @@ def main() -> None:
|
|
|
376
376
|
add_parser.add_argument("description", nargs="?", default=None, help="What to try and why")
|
|
377
377
|
add_parser.add_argument("--archetype", default=None, help="Expand from archetype (e.g., model_comparison)")
|
|
378
378
|
add_parser.add_argument("--priority", default="high", choices=sorted(VALID_PRIORITIES))
|
|
379
|
-
add_parser.add_argument("--source", default="human", choices=["human", "agent", "literature", "taxonomy"])
|
|
379
|
+
add_parser.add_argument("--source", default="human", choices=["human", "agent", "literature", "treequest", "taxonomy"])
|
|
380
380
|
add_parser.add_argument("--parent", default=None, help="Parent experiment ID")
|
|
381
381
|
add_parser.add_argument("--parent-hyp", default=None, help="Parent hypothesis ID")
|
|
382
382
|
add_parser.add_argument("--family", default=None, help="Experiment family (e.g., optimizer-sweep)")
|