claude-turing 1.1.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +67 -3
- package/commands/explore.md +107 -0
- package/commands/reproduce.md +48 -0
- package/commands/seed.md +47 -0
- package/commands/suggest.md +68 -4
- package/commands/turing.md +6 -0
- package/package.json +1 -1
- package/src/claude-md.js +1 -0
- package/src/install.js +2 -2
- package/src/verify.js +3 -0
- package/templates/config.yaml +10 -0
- package/templates/program.md +5 -0
- package/templates/requirements.txt +4 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/reproduce_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/seed_runner.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
- package/templates/scripts/generate_brief.py +85 -3
- package/templates/scripts/generate_model_card.py +25 -0
- package/templates/scripts/leaderboard.py +10 -0
- package/templates/scripts/manage_hypotheses.py +2 -2
- package/templates/scripts/reproduce_experiment.py +548 -0
- package/templates/scripts/scaffold.py +5 -0
- package/templates/scripts/seed_runner.py +414 -0
- package/templates/scripts/show_metrics.py +17 -0
- package/templates/scripts/treequest_suggest.py +520 -0
- package/templates/scripts/turing_io.py +36 -0
- package/templates/scripts/update_state.py +13 -0
|
@@ -25,6 +25,7 @@ import yaml
|
|
|
25
25
|
|
|
26
26
|
from scripts.cost_frontier import compute_pareto_frontier, load_cost_data, _format_seconds
|
|
27
27
|
from scripts.turing_io import load_config, load_experiments, load_hypotheses
|
|
28
|
+
from scripts.seed_runner import CV_THRESHOLD
|
|
28
29
|
|
|
29
30
|
|
|
30
31
|
def compute_campaign_summary(experiments: list[dict]) -> dict:
|
|
@@ -211,6 +212,40 @@ def detect_environment_drift(experiments: list[dict]) -> list[str]:
|
|
|
211
212
|
return warnings
|
|
212
213
|
|
|
213
214
|
|
|
215
|
+
def load_seed_studies(seed_dir: str = "experiments/seed_studies") -> list[dict]:
|
|
216
|
+
"""Load all seed study results from YAML files."""
|
|
217
|
+
path = Path(seed_dir)
|
|
218
|
+
if not path.exists():
|
|
219
|
+
return []
|
|
220
|
+
studies = []
|
|
221
|
+
for f in sorted(path.glob("*-seeds.yaml")):
|
|
222
|
+
try:
|
|
223
|
+
with open(f) as fh:
|
|
224
|
+
study = yaml.safe_load(fh)
|
|
225
|
+
if study and isinstance(study, dict):
|
|
226
|
+
studies.append(study)
|
|
227
|
+
except (yaml.YAMLError, OSError):
|
|
228
|
+
continue
|
|
229
|
+
return studies
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def load_reproductions(repro_dir: str = "experiments/reproductions") -> list[dict]:
|
|
233
|
+
"""Load all reproduction reports from YAML files."""
|
|
234
|
+
path = Path(repro_dir)
|
|
235
|
+
if not path.exists():
|
|
236
|
+
return []
|
|
237
|
+
reports = []
|
|
238
|
+
for f in sorted(path.glob("*-repro.yaml")):
|
|
239
|
+
try:
|
|
240
|
+
with open(f) as fh:
|
|
241
|
+
report = yaml.safe_load(fh)
|
|
242
|
+
if report and isinstance(report, dict):
|
|
243
|
+
reports.append(report)
|
|
244
|
+
except (yaml.YAMLError, OSError):
|
|
245
|
+
continue
|
|
246
|
+
return reports
|
|
247
|
+
|
|
248
|
+
|
|
214
249
|
def format_brief(
|
|
215
250
|
campaign: dict,
|
|
216
251
|
best: dict | None,
|
|
@@ -223,6 +258,8 @@ def format_brief(
|
|
|
223
258
|
env_warnings: list[str] | None = None,
|
|
224
259
|
cost_data: list | None = None,
|
|
225
260
|
cost_frontier: list | None = None,
|
|
261
|
+
seed_studies: list[dict] | None = None,
|
|
262
|
+
reproductions: list[dict] | None = None,
|
|
226
263
|
) -> str:
|
|
227
264
|
"""Format the research briefing as markdown."""
|
|
228
265
|
direction = "lower" if lower_is_better else "higher"
|
|
@@ -286,7 +323,8 @@ def format_brief(
|
|
|
286
323
|
lines.append(f"**{len(queued)} queued:**")
|
|
287
324
|
for h in queued:
|
|
288
325
|
priority_marker = " (HIGH)" if h.get("priority") == "high" else ""
|
|
289
|
-
|
|
326
|
+
source = h.get("source", "")
|
|
327
|
+
source_marker = f" [{source}]" if source in ("human", "treequest", "literature") else ""
|
|
290
328
|
lines.append(f"- {h['id']}: {h.get('description', '?')}{priority_marker}{source_marker}")
|
|
291
329
|
else:
|
|
292
330
|
lines.append("No queued hypotheses. Use `/turing:try` to inject ideas.")
|
|
@@ -360,6 +398,44 @@ def format_brief(
|
|
|
360
398
|
f"The {pct:.1f}% improvement costs {ratio:.0f}x more compute.",
|
|
361
399
|
])
|
|
362
400
|
|
|
401
|
+
# Seed studies
|
|
402
|
+
if seed_studies:
|
|
403
|
+
lines.extend(["", "## Seed Studies", ""])
|
|
404
|
+
for study in seed_studies:
|
|
405
|
+
exp_id = study.get("experiment_id", "?")
|
|
406
|
+
sensitive = study.get("seed_sensitive", False)
|
|
407
|
+
status = "SEED-SENSITIVE" if sensitive else "STABLE"
|
|
408
|
+
lines.append(
|
|
409
|
+
f"- **{exp_id}:** {study.get('metric', metric)} = "
|
|
410
|
+
f"{study.get('mean', 0):.4f} +/- {study.get('std', 0):.4f} "
|
|
411
|
+
f"(CV={study.get('cv_percent', 0):.1f}%) — **{status}**"
|
|
412
|
+
)
|
|
413
|
+
if sensitive:
|
|
414
|
+
lines.append(
|
|
415
|
+
f" - 95% CI: [{study['ci_95'][0]:.4f}, {study['ci_95'][1]:.4f}] "
|
|
416
|
+
f"over {len(study.get('seeds_run', []))} seeds"
|
|
417
|
+
)
|
|
418
|
+
if any(s.get("seed_sensitive") for s in seed_studies):
|
|
419
|
+
lines.extend(["", "*Some results are seed-sensitive. Report distributions, not point estimates.*"])
|
|
420
|
+
|
|
421
|
+
# Reproduction reports
|
|
422
|
+
if reproductions:
|
|
423
|
+
lines.extend(["", "## Reproducibility", ""])
|
|
424
|
+
verdict_markers = {
|
|
425
|
+
"reproducible": "PASS",
|
|
426
|
+
"approximately_reproducible": "PASS (approx)",
|
|
427
|
+
"not_reproducible": "FAIL",
|
|
428
|
+
"environment_changed": "WARN (env)",
|
|
429
|
+
}
|
|
430
|
+
for report in reproductions:
|
|
431
|
+
exp_id = report.get("experiment_id", "?")
|
|
432
|
+
verdict = report.get("verdict", "unknown")
|
|
433
|
+
marker = verdict_markers.get(verdict, verdict)
|
|
434
|
+
lines.append(f"- **{exp_id}:** {marker} — {report.get('reason', 'N/A')}")
|
|
435
|
+
failed = [r for r in reproductions if r.get("verdict") in ("not_reproducible", "environment_changed")]
|
|
436
|
+
if failed:
|
|
437
|
+
lines.extend(["", f"*{len(failed)} experiment(s) failed reproducibility checks.*"])
|
|
438
|
+
|
|
363
439
|
lines.extend([
|
|
364
440
|
"",
|
|
365
441
|
"## Recommendations",
|
|
@@ -387,9 +463,9 @@ def format_brief(
|
|
|
387
463
|
|
|
388
464
|
# Check if hypotheses are exhausted
|
|
389
465
|
if not queued:
|
|
390
|
-
lines.append("- No hypotheses queued — inject ideas with `/turing:try`")
|
|
466
|
+
lines.append("- No hypotheses queued — inject ideas with `/turing:try` or explore with `/turing:explore`")
|
|
391
467
|
|
|
392
|
-
lines.extend(["", "---", "", "*Use `/turing:try` to inject hypotheses
|
|
468
|
+
lines.extend(["", "---", "", "*Use `/turing:try` to inject hypotheses, `/turing:explore` for tree search, `/turing:train` to execute.*"])
|
|
393
469
|
|
|
394
470
|
return "\n".join(lines)
|
|
395
471
|
|
|
@@ -419,11 +495,17 @@ def generate_brief(
|
|
|
419
495
|
cost_records = load_cost_data(log_path, metric)
|
|
420
496
|
pareto = compute_pareto_frontier(cost_records, lower_is_better) if cost_records else []
|
|
421
497
|
|
|
498
|
+
# Load seed studies and reproduction reports
|
|
499
|
+
seed_studies = load_seed_studies()
|
|
500
|
+
reproductions = load_reproductions()
|
|
501
|
+
|
|
422
502
|
return format_brief(
|
|
423
503
|
campaign, best, trajectory, model_types, hypotheses,
|
|
424
504
|
metric, lower_is_better, failures, env_warnings,
|
|
425
505
|
cost_data=cost_records if cost_records else None,
|
|
426
506
|
cost_frontier=pareto if cost_records else None,
|
|
507
|
+
seed_studies=seed_studies if seed_studies else None,
|
|
508
|
+
reproductions=reproductions if reproductions else None,
|
|
427
509
|
)
|
|
428
510
|
|
|
429
511
|
|
|
@@ -243,6 +243,31 @@ def generate_card(
|
|
|
243
243
|
else:
|
|
244
244
|
lines.append("No experiments completed yet.")
|
|
245
245
|
|
|
246
|
+
# --- Seed Study ---
|
|
247
|
+
if best:
|
|
248
|
+
seed_study_path = Path("experiments/seed_studies") / f"{best.get('experiment_id', 'unknown')}-seeds.yaml"
|
|
249
|
+
if seed_study_path.exists():
|
|
250
|
+
import yaml
|
|
251
|
+
with open(seed_study_path) as f:
|
|
252
|
+
seed_study = yaml.safe_load(f) or {}
|
|
253
|
+
if seed_study and "mean" in seed_study:
|
|
254
|
+
sensitive = seed_study.get("seed_sensitive", False)
|
|
255
|
+
status = "SEED-SENSITIVE" if sensitive else "STABLE"
|
|
256
|
+
lines.extend([
|
|
257
|
+
"",
|
|
258
|
+
"### Seed Study",
|
|
259
|
+
"",
|
|
260
|
+
f"- **Status:** {status}",
|
|
261
|
+
f"- **{metric}:** {seed_study['mean']:.4f} +/- {seed_study.get('std', 0):.4f}",
|
|
262
|
+
])
|
|
263
|
+
if "ci_95" in seed_study:
|
|
264
|
+
ci = seed_study["ci_95"]
|
|
265
|
+
lines.append(f"- **95% CI:** [{ci[0]:.4f}, {ci[1]:.4f}]")
|
|
266
|
+
lines.append(f"- **CV:** {seed_study.get('cv_percent', 0):.2f}%")
|
|
267
|
+
lines.append(f"- **Seeds tested:** {len(seed_study.get('seeds_run', []))}")
|
|
268
|
+
if sensitive:
|
|
269
|
+
lines.append("- *Result varies significantly across seeds. Report distribution, not point estimate.*")
|
|
270
|
+
|
|
246
271
|
# --- Training History ---
|
|
247
272
|
lines.extend([
|
|
248
273
|
"",
|
|
@@ -503,6 +503,16 @@ def main() -> None:
|
|
|
503
503
|
print()
|
|
504
504
|
print(footer)
|
|
505
505
|
|
|
506
|
+
# Show seed study status for #1 if available
|
|
507
|
+
if ranked and args.fmt not in ("csv",):
|
|
508
|
+
from scripts.turing_io import load_seed_study
|
|
509
|
+
best_id = ranked[0].get("experiment_id")
|
|
510
|
+
if best_id:
|
|
511
|
+
study = load_seed_study(best_id)
|
|
512
|
+
if study and "mean" in study:
|
|
513
|
+
sensitive = "SEED-SENSITIVE" if study.get("seed_sensitive") else "STABLE"
|
|
514
|
+
print(f"\n Seed study: {metric}={study['mean']:.4f}±{study.get('std',0):.4f} ({sensitive})")
|
|
515
|
+
|
|
506
516
|
|
|
507
517
|
if __name__ == "__main__":
|
|
508
518
|
main()
|
|
@@ -277,7 +277,7 @@ def get_next_hypothesis(queue_path: str) -> dict | None:
|
|
|
277
277
|
return None
|
|
278
278
|
|
|
279
279
|
priority_order = {"high": 0, "medium": 1, "low": 2}
|
|
280
|
-
source_order = {"human": 0, "literature": 1, "taxonomy": 2, "agent": 3}
|
|
280
|
+
source_order = {"human": 0, "literature": 1, "treequest": 2, "taxonomy": 3, "agent": 4}
|
|
281
281
|
|
|
282
282
|
queued.sort(key=lambda h: (
|
|
283
283
|
priority_order.get(h.get("priority", "medium"), 1),
|
|
@@ -376,7 +376,7 @@ def main() -> None:
|
|
|
376
376
|
add_parser.add_argument("description", nargs="?", default=None, help="What to try and why")
|
|
377
377
|
add_parser.add_argument("--archetype", default=None, help="Expand from archetype (e.g., model_comparison)")
|
|
378
378
|
add_parser.add_argument("--priority", default="high", choices=sorted(VALID_PRIORITIES))
|
|
379
|
-
add_parser.add_argument("--source", default="human", choices=["human", "agent", "literature", "taxonomy"])
|
|
379
|
+
add_parser.add_argument("--source", default="human", choices=["human", "agent", "literature", "treequest", "taxonomy"])
|
|
380
380
|
add_parser.add_argument("--parent", default=None, help="Parent experiment ID")
|
|
381
381
|
add_parser.add_argument("--parent-hyp", default=None, help="Parent hypothesis ID")
|
|
382
382
|
add_parser.add_argument("--family", default=None, help="Experiment family (e.g., optimizer-sweep)")
|