claude-turing 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. package/.claude-plugin/plugin.json +34 -0
  2. package/LICENSE +21 -0
  3. package/README.md +457 -0
  4. package/agents/ml-evaluator.md +43 -0
  5. package/agents/ml-researcher.md +74 -0
  6. package/bin/cli.js +46 -0
  7. package/bin/turing-init.sh +57 -0
  8. package/commands/brief.md +83 -0
  9. package/commands/compare.md +24 -0
  10. package/commands/design.md +97 -0
  11. package/commands/init.md +123 -0
  12. package/commands/logbook.md +51 -0
  13. package/commands/mode.md +43 -0
  14. package/commands/poster.md +89 -0
  15. package/commands/preflight.md +75 -0
  16. package/commands/report.md +97 -0
  17. package/commands/rules/loop-protocol.md +91 -0
  18. package/commands/status.md +24 -0
  19. package/commands/suggest.md +95 -0
  20. package/commands/sweep.md +45 -0
  21. package/commands/train.md +66 -0
  22. package/commands/try.md +63 -0
  23. package/commands/turing.md +54 -0
  24. package/commands/validate.md +34 -0
  25. package/config/defaults.yaml +45 -0
  26. package/config/experiment_archetypes.yaml +127 -0
  27. package/config/lifecycle.toml +31 -0
  28. package/config/novelty_aliases.yaml +107 -0
  29. package/config/relationships.toml +125 -0
  30. package/config/state.toml +24 -0
  31. package/config/task_taxonomy.yaml +110 -0
  32. package/config/taxonomy.toml +37 -0
  33. package/package.json +54 -0
  34. package/src/claude-md.js +55 -0
  35. package/src/install.js +107 -0
  36. package/src/paths.js +20 -0
  37. package/src/postinstall.js +22 -0
  38. package/src/verify.js +109 -0
  39. package/templates/MEMORY.md +36 -0
  40. package/templates/README.md +93 -0
  41. package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
  42. package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
  43. package/templates/config.yaml +48 -0
  44. package/templates/evaluate.py +237 -0
  45. package/templates/features/__init__.py +0 -0
  46. package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
  47. package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
  48. package/templates/features/featurizers.py +138 -0
  49. package/templates/prepare.py +171 -0
  50. package/templates/program.md +216 -0
  51. package/templates/pyproject.toml +8 -0
  52. package/templates/requirements.txt +8 -0
  53. package/templates/scripts/__init__.py +0 -0
  54. package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  55. package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
  56. package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
  57. package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
  58. package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
  59. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  60. package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
  61. package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
  62. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  63. package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
  64. package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
  65. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  66. package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
  67. package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
  68. package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
  69. package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
  70. package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
  71. package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
  72. package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
  73. package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
  74. package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
  75. package/templates/scripts/check_convergence.py +230 -0
  76. package/templates/scripts/compare_runs.py +124 -0
  77. package/templates/scripts/critique_hypothesis.py +350 -0
  78. package/templates/scripts/experiment_index.py +288 -0
  79. package/templates/scripts/generate_brief.py +389 -0
  80. package/templates/scripts/generate_logbook.py +423 -0
  81. package/templates/scripts/log_experiment.py +243 -0
  82. package/templates/scripts/manage_hypotheses.py +543 -0
  83. package/templates/scripts/novelty_guard.py +343 -0
  84. package/templates/scripts/parse_metrics.py +139 -0
  85. package/templates/scripts/post-train-hook.sh +74 -0
  86. package/templates/scripts/preflight.py +549 -0
  87. package/templates/scripts/scaffold.py +409 -0
  88. package/templates/scripts/show_environment.py +92 -0
  89. package/templates/scripts/show_experiment_tree.py +144 -0
  90. package/templates/scripts/show_families.py +133 -0
  91. package/templates/scripts/show_metrics.py +157 -0
  92. package/templates/scripts/statistical_compare.py +259 -0
  93. package/templates/scripts/stop-hook.sh +34 -0
  94. package/templates/scripts/suggest_next.py +301 -0
  95. package/templates/scripts/sweep.py +276 -0
  96. package/templates/scripts/synthesize_decision.py +300 -0
  97. package/templates/scripts/turing_io.py +76 -0
  98. package/templates/scripts/update_state.py +296 -0
  99. package/templates/scripts/validate_stability.py +167 -0
  100. package/templates/scripts/verify_placeholders.py +119 -0
  101. package/templates/sweep_config.yaml +14 -0
  102. package/templates/tests/__init__.py +0 -0
  103. package/templates/tests/conftest.py +91 -0
  104. package/templates/train.py +240 -0
package/bin/cli.js ADDED
@@ -0,0 +1,46 @@
+ #!/usr/bin/env node
+ import { createRequire } from "module";
+ const require = createRequire(import.meta.url);
+ const { Command } = require("commander");
+ const pkg = require("../package.json");
+
+ const program = new Command();
+
+ program
+ .name("claude-turing")
+ .description(pkg.description)
+ .version(pkg.version);
+
+ program
+ .command("install")
+ .description("Install turing commands and agents to Claude Code")
+ .option("--global", "Install globally (~/.claude/)")
+ .option("--project", "Install for current project (.claude/)")
+ .action(async (opts) => {
+ const { install } = await import("../src/install.js");
+ await install(opts);
+ });
+
+ program
+ .command("verify")
+ .description("Verify turing installation is complete")
+ .option("--scope <scope>", "Check a specific scope (global|project)")
+ .action(async (opts) => {
+ const { verify } = await import("../src/verify.js");
+ await verify(opts);
+ });
+
+ program
+ .command("init [name] [dir]")
+ .description("Scaffold ML project (CLI mode, non-Claude-Code usage)")
+ .action(async (name, dir) => {
+ const { execSync } = await import("child_process");
+ const { dirname, join } = await import("path");
+ const { fileURLToPath } = await import("url");
+ const __dirname = dirname(fileURLToPath(import.meta.url));
+ const script = join(__dirname, "turing-init.sh");
+ const args = [name, dir].filter(Boolean).join(" ");
+ execSync(`bash "${script}" ${args}`, { stdio: "inherit" });
+ });
+
+ program.parse();
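Taken together, the three subcommands above form the package's npm CLI. A usage sketch, assuming the package exposes a `claude-turing` bin as the commander `.name()` call suggests (the exact bin mapping lives in `package.json`, not shown here; the project name and directory below are placeholders):

```bash
# Install the turing commands and agents into the current project's .claude/ directory
npx claude-turing install --project

# Confirm the installation for that scope is complete
npx claude-turing verify --scope project

# Scaffold an ML project without Claude Code; delegates to bin/turing-init.sh
npx claude-turing init churn ml/churn
```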
package/bin/turing-init.sh ADDED
@@ -0,0 +1,57 @@
+ #!/usr/bin/env bash
+ # Turing CLI init script.
+ # Thin wrapper around scripts/scaffold.py — the unified scaffolding
+ # implementation that both CLI and Claude Code use.
+ #
+ # Usage:
+ # turing-init [project_name] [ml_dir]
+ # turing-init --interactive
+ #
+ # For Claude Code usage, use /turing:init instead.
+
+ set -euo pipefail
+
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ PLUGIN_DIR="$(dirname "$SCRIPT_DIR")"
+ TEMPLATES_DIR="${PLUGIN_DIR}/templates"
+ SCAFFOLD_SCRIPT="${TEMPLATES_DIR}/scripts/scaffold.py"
+
+ # Check scaffold script exists
+ if [[ ! -f "$SCAFFOLD_SCRIPT" ]]; then
+ echo "Error: scaffold.py not found at ${SCAFFOLD_SCRIPT}" >&2
+ echo "Ensure the Turing plugin is installed correctly." >&2
+ exit 1
+ fi
+
+ echo "╔══════════════════════════════════════════╗"
+ echo "║ Turing ML Research Harness ║"
+ echo "║ Autonomous Experiment Infrastructure ║"
+ echo "╚══════════════════════════════════════════╝"
+ echo ""
+
+ # If no args or --interactive, run interactive mode
+ if [[ $# -eq 0 ]] || [[ "${1:-}" == "--interactive" ]]; then
+ python3 "$SCAFFOLD_SCRIPT" --interactive --templates-dir "$TEMPLATES_DIR" --no-venv
+ echo ""
+ echo " To set up the virtual environment:"
+ echo " cd <ml_dir> && python3 -m venv .venv && source .venv/bin/activate && pip install -r requirements.txt"
+ exit 0
+ fi
+
+ # Positional args mode: turing-init <project_name> [ml_dir]
+ PROJECT_NAME="${1:-my-ml-project}"
+ ML_DIR="${2:-ml/${PROJECT_NAME}}"
+
+ python3 "$SCAFFOLD_SCRIPT" \
+ --project-name "$PROJECT_NAME" \
+ --target-metric "accuracy" \
+ --metric-direction "higher" \
+ --task-description "ML task for ${PROJECT_NAME}" \
+ --ml-dir "$ML_DIR" \
+ --data-source "data/training.csv" \
+ --templates-dir "$TEMPLATES_DIR" \
+ --no-venv --no-hooks
+
+ echo ""
+ echo "Note: Run with --interactive for full setup (metric, data source, etc.)"
+ echo "Or use Claude Code: /turing:init"
package/commands/brief.md ADDED
@@ -0,0 +1,83 @@
+ ---
+ name: brief
+ description: Generate a structured research intelligence report from experiment history — what's been learned, what's promising, what's exhausted, and what the human should consider next. Use --deep for literature-grounded suggestions.
+ disable-model-invocation: true
+ argument-hint: "[--deep]"
+ allowed-tools: Read, Bash(python scripts/*:*, source .venv/bin/activate:*), Grep, Glob, WebSearch, WebFetch
+ ---
+
+ Generate a research briefing that a human can read in 2 minutes and immediately decide what to inject next.
+
+ ## Steps
+
+ 1. **Generate the briefing:**
+ ```bash
+ source .venv/bin/activate && python scripts/generate_brief.py
+ ```
+
+ 2. **Self-critique the briefing** before presenting. Review the generated output and check:
+ - **Recommendations specificity:** Are they concrete enough to act on? "Try a different model" is bad. "Try LightGBM with leaf-wise growth because exp-004 showed depth sensitivity" is good. If vague, rewrite them with specific model/hyperparameter suggestions grounded in the experiment data.
+ - **Exhausted directions coverage:** Cross-reference the "Model Types Explored" section against `experiments/log.jsonl`. Are there discarded experiments missing from the summary? If so, add them.
+ - **Convergence estimate grounding:** If the briefing says "close to convergence" or "further improvement possible", verify against the actual metric trajectory. Is the claim supported by the numbers?
+ - **Metric accuracy:** Spot-check that the "Current Best" metrics match the actual log. Run `python scripts/show_metrics.py --last 1` if uncertain.
+
+ If any section fails the check, regenerate just that section. Max 1 revision round — don't over-polish.
+
+ 3. **Present the output** to the user. The briefing has 6 sections:
+ - **Campaign Summary** — total experiments, keep rate, timespan
+ - **Current Best** — model type, metrics, experiment ID, configuration
+ - **Improvement Trajectory** — metric over time, rate of improvement
+ - **Model Types Explored** — which approaches have been tried and their hit rates
+ - **Hypothesis Queue** — pending and completed hypotheses
+ - **Recommendations** — data-driven next steps
+
+ 4. **If `$ARGUMENTS` contains `--deep`:** run the Literature-Grounded Suggestions step below.
+
+ 5. **Prompt for action:**
+ - "Want to inject a hypothesis? Use `/turing:try <idea>`"
+ - "Want to continue training? Use `/turing:train`"
+ - "Want literature-backed suggestions? Use `/turing:brief --deep`"
+
+ ## Literature-Grounded Suggestions (--deep flag)
+
+ When `--deep` is requested, add a 7th section: **Literature-Grounded Suggestions**.
+
+ ### Steps:
+
+ 1. **Read context:** Read `config.yaml` and the briefing output to understand:
+ - What task type this is (tabular classification, time series, etc.)
+ - Which model families have been exhausted (from "Model Types Explored")
+ - Where improvement has plateaued (from "Improvement Trajectory")
+ - What failure patterns keep recurring
+
+ 2. **Search literature** with `WebSearch` for techniques that address the specific stagnation:
+ - If plateaued: "improve [task type] accuracy beyond [current metric] 2024"
+ - If overfitting: "regularization techniques [model family] [task type]"
+ - If all models tried: "state of the art [task type] benchmark 2024 2025"
+
+ 3. **Distill 3-5 suggestions** from the literature, each with:
+ - **Technique:** specific and actionable
+ - **Source:** paper or article URL
+ - **Why now:** how it addresses the specific stagnation point
+ - **Impact estimate:** high/medium/low
+ - **Complexity:** low/medium/high
+
+ 4. **Queue suggestions** as hypotheses:
+ ```bash
+ source .venv/bin/activate && python scripts/manage_hypotheses.py add "<technique>: <rationale> (source: <citation>)" --priority medium --source literature
+ ```
+
+ 5. **Format as a section** appended to the briefing.
+
+ ## Saving Briefs
+
+ ```bash
+ mkdir -p briefs && python scripts/generate_brief.py > briefs/brief-$(date +%Y-%m-%d).md
+ ```
+
+ ## When to Use
+
+ - After a training session completes or converges
+ - Before injecting new hypotheses (to understand what's already been tried)
+ - When returning to a project after time away
+ - **With `--deep`:** when the agent seems stuck and you want evidence-based direction
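As a worked example of the queueing call in step 4 of the --deep flow above, a filled-in invocation might look like this (the technique, rationale, and citation are purely illustrative placeholders):

```bash
# Queue one literature-derived hypothesis; replace the text and source with real findings
source .venv/bin/activate && python scripts/manage_hypotheses.py add \
  "LightGBM dart boosting: reduces overfitting reported on similar tabular benchmarks (source: <paper-url>)" \
  --priority medium --source literature
```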
package/commands/compare.md ADDED
@@ -0,0 +1,24 @@
+ ---
+ name: compare
+ description: Compare two ML experiment runs side-by-side — metrics, configuration deltas, and a verdict on which approach is more promising.
+ disable-model-invocation: true
+ argument-hint: "<exp-id-1> <exp-id-2>"
+ allowed-tools: Read, Bash(*), Grep, Glob
+ ---
+
+ Compare two ML experiment runs side-by-side to understand what changed and why one performed better.
+
+ ## Steps
+
+ 1. **Run comparison:**
+ ```bash
+ source .venv/bin/activate && python scripts/compare_runs.py $0 $1
+ ```
+
+ 2. **Analyze the delta:**
+ - **Metric differences:** all configured metrics for both runs
+ - **Configuration delta:** what changed (model type, hyperparameters, features)
+ - **Causal analysis:** which changes likely caused the metric difference
+ - **Verdict:** which approach is more promising for future experiments
+
+ 3. **If either ID is missing:** report the error and suggest `/turing:status` to see available experiment IDs.
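A concrete invocation sketch for step 1, using hypothetical experiment IDs (real IDs come from `/turing:status` or the experiment log):

```bash
# Compare an early baseline against a later candidate run
source .venv/bin/activate && python scripts/compare_runs.py exp-004 exp-007
```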
package/commands/design.md ADDED
@@ -0,0 +1,97 @@
+ ---
+ name: design
+ description: Generate a structured experiment design for a hypothesis. Reads experiment history, searches literature for methodology, produces a scored design document at experiments/designs/.
+ disable-model-invocation: true
+ argument-hint: "<hypothesis-id or description>"
+ allowed-tools: Read, Write, Bash(python scripts/*:*, source .venv/bin/activate:*, mkdir:*), Grep, Glob, WebSearch, WebFetch
+ ---
+
+ Front-load the thinking before the coding. Given a hypothesis, produce a structured experiment design grounded in methodology from the literature.
+
+ ## Steps
+
+ ### 1. Load Context
+
+ If `$ARGUMENTS` matches `hyp-NNN`, load the hypothesis:
+ ```bash
+ source .venv/bin/activate && python scripts/manage_hypotheses.py show $ARGUMENTS
+ ```
+
+ If freeform text, use it directly as the hypothesis description.
+
+ Read the current config and experiment state:
+ ```bash
+ cat config.yaml
+ ```
+ ```bash
+ source .venv/bin/activate && python scripts/show_metrics.py --last 10 2>/dev/null || echo "No experiments yet"
+ ```
+ ```bash
+ cat experiment_state.yaml 2>/dev/null || echo "No experiment state yet"
+ ```
+
+ ### 2. Search for Methodology
+
+ Use `WebSearch` to find 2-3 papers or articles describing how to implement the proposed change effectively. Target:
+ - The specific technique in the hypothesis (e.g., "LightGBM dart boosting implementation best practices")
+ - Common pitfalls for this type of change
+ - Benchmark results showing expected improvement range
+
+ Use `WebFetch` on the most relevant results to extract specific methodology details: hyperparameter recommendations, training procedures, evaluation approaches.
+
+ ### 3. Write the Design Document
+
+ Create `experiments/designs/<hyp-id>-design.md` (or `experiments/designs/adhoc-<date>-design.md` for freeform hypotheses):
+
+ ```bash
+ mkdir -p experiments/designs
+ ```
+
+ Write with this structure:
+
+ ```markdown
+ # Experiment Design: <hypothesis summary>
+
+ ## Hypothesis
+ <full description>
+
+ ## Objective
+ <what we're testing, stated as a falsifiable claim>
+
+ ## Method
+ <specific changes, grounded in literature findings>
+
+ ## Literature Support
+ - <source 1>: <what it says about this approach>
+ - <source 2>: <relevant finding>
+
+ ## Implementation Plan
+ ### Changes to train.py
+ <concrete code changes needed>
+
+ ### Changes to config.yaml (if any)
+ <hyperparameter values to set, with rationale from literature>
+
+ ## Expected Outcome
+ - **Success:** <metric > threshold, specific number>
+ - **Failure:** <what would disprove the hypothesis>
+
+ ## Risks
+ <specific pitfalls from literature, not generic "might not work">
+
+ ## Estimated Runs
+ <how many iterations>
+ ```
+
+ ### 4. Self-Critique
+
+ Review the design:
+ - Is the implementation plan specific enough for the researcher agent to execute without ambiguity?
+ - Does the expected outcome have a concrete metric threshold?
+ - Are risks actionable?
+
+ Score each dimension 1-10 (feasibility, novelty, clarity). If any < 7, revise that section. Max 2 revision rounds.
+
+ ### 5. Report
+
+ Display the design summary with scores and file location. The researcher agent can read the design during `/turing:train`.
package/commands/init.md ADDED
@@ -0,0 +1,123 @@
+ ---
+ name: init
+ description: Initialize a new ML project with the Turing autoresearch harness. Scaffolds the full experiment infrastructure — immutable evaluation pipeline, agent-editable training code, structured logging, convergence detection hooks, and a Python virtual environment. Use --plan to generate a research plan.
+ disable-model-invocation: true
+ argument-hint: "[project_name] [--plan]"
+ allowed-tools: Read, Write, Edit, Bash(*), Grep, Glob, WebSearch, WebFetch
+ ---
+
+ Scaffold a new ML project with the Turing autoresearch harness. This creates the separation between the measurement apparatus (READ-ONLY) and the hypothesis space (AGENT-EDITABLE) that makes autonomous experimentation trustworthy.
+
+ ## Interactive Setup
+
+ Ask the user for the following (or accept from `$ARGUMENTS` if provided as JSON):
+
+ 1. **Project name** (`{{PROJECT_NAME}}`): Name of the ML project (e.g., "sentiment", "churn", "fraud-detection")
+ 2. **Target metric** (`{{TARGET_METRIC}}`): Primary metric to optimize (e.g., "accuracy", "f1", "mae", "mse", "auc")
+ 3. **Metric direction**: Is lower better (mae, mse, loss) or higher better (accuracy, f1, auc)?
+ 4. **Task description** (`{{TASK_DESCRIPTION}}`): What the model does (e.g., "Predict customer churn from usage data")
+ 5. **ML directory** (`{{ML_DIR}}`): Where ML files go relative to project root (e.g., "ml/sentiment")
+ 6. **Data source** (`{{DATA_SOURCE}}`): Where training data comes from (e.g., "data/reviews.csv")
+
+ ## Scaffolding
+
+ Once you have all 6 values, delegate to the unified scaffolding script:
+
+ ```bash
+ python3 <templates_dir>/scripts/scaffold.py \
+ --project-name "<project_name>" \
+ --target-metric "<target_metric>" \
+ --metric-direction "<metric_direction>" \
+ --task-description "<task_description>" \
+ --ml-dir "<ml_dir>" \
+ --data-source "<data_source>" \
+ --templates-dir "<templates_dir>"
+ ```
+
+ The scaffold script handles everything in a single atomic operation:
+ - Copies all template files with placeholder substitution
+ - Creates data/, experiments/, models/ directories
+ - Sets up agent memory at `.claude/agent-memory/ml-researcher/MEMORY.md`
+ - Configures Claude Code hooks in `.claude/settings.local.json`
+ - Creates Python virtual environment and installs requirements
+ - Verifies all placeholders were replaced (fails loudly if any remain)
+
+ ## Locating Templates
+
+ Find the templates directory using Glob:
+ ```
+ ~/.claude/plugins/*/templates/
+ ```
+ Or check if installed via npm by looking for `node_modules/claude-turing/templates/`.
+
+ ## After Scaffolding
+
+ Report what was created:
+ - The separation: READ-ONLY (`prepare.py`, `evaluate.py`) vs AGENT-EDITABLE (`train.py`)
+ - Next steps: add data to the configured data source path, run `python prepare.py`, then `/turing:train`
+ - The taste-leverage loop: `/turing:try` to inject hypotheses, `/turing:brief` for intelligence reports
+
+ ## Research Plan Generation (--plan flag)
+
+ If `$ARGUMENTS` contains `--plan`, generate a research plan AFTER scaffolding. This gives the agent strategic direction for its first 5-10 experiments rather than ad-hoc exploration.
+
+ ### Steps:
+
+ 1. **Read the task context** from the just-created `config.yaml`: task description, model type, target metric, data source.
+
+ 2. **Search literature** with `WebSearch` for the task domain:
+ - "state of the art <task description> machine learning 2024 2025"
+ - "best model <target metric> <data type> benchmark"
+ - "<task description> common approaches survey"
+
+ Use `WebFetch` on top 2-3 results to extract: dominant model families, typical metric ranges, known challenges.
+
+ 3. **Generate `RESEARCH_PLAN.md`** in the ML project directory with this structure:
+
+ ```markdown
+ # Research Plan: <task description>
+
+ Generated: <date>
+
+ ## Task Summary
+ <one paragraph describing the task, data, and success criteria>
+
+ ## Model Families to Explore
+ Ordered by expected relevance based on literature:
+ 1. **<family 1>** — <why, with citation>
+ 2. **<family 2>** — <why, with citation>
+ 3. **<family 3>** — <why, with citation>
+
+ ## Evaluation Strategy
+ - Primary metric: <metric> (<higher/lower> is better)
+ - Multi-run recommendation: <yes/no, based on expected variance>
+ - Baseline target: <realistic first-pass metric from literature>
+
+ ## Search Budget
+ - <N> experiments per model family before moving on
+ - Total budget: <N> experiments before first convergence check
+
+ ## Success Criteria
+ - Target metric: <value from literature benchmarks>
+ - Convergence: <patience> consecutive non-improvements
+
+ ## Known Challenges
+ - <challenge 1 from literature, e.g., "class imbalance common in this domain">
+ - <challenge 2>
+
+ ## Sources
+ - <citation 1>
+ - <citation 2>
+ ```
+
+ 4. **Self-critique the plan** (one round):
+ - Are the model families ordered by evidence strength?
+ - Is the budget realistic?
+ - Are the success criteria grounded in benchmark data?
+ Revise if any section is vague or unsupported.
+
+ 5. **Report:** "Research plan generated at `<ml_dir>/RESEARCH_PLAN.md`. The agent will read this during `/turing:train` for strategic direction."
+
+ ### Integration
+
+ The agent's `program.md` OBSERVE step reads `RESEARCH_PLAN.md` (if it exists) for strategic direction. The plan is advisory — the agent can deviate but should note why in `experiment_state.yaml`.
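As a worked example of the scaffolding call above, here is how the command might look with the sample answers from the Interactive Setup section filled in (the plugin templates path is an assumption; locate the real one with the Glob pattern under Locating Templates):

```bash
# Example scaffold invocation; paths and values are illustrative, not defaults
python3 ~/.claude/plugins/claude-turing/templates/scripts/scaffold.py \
  --project-name "churn" \
  --target-metric "auc" \
  --metric-direction "higher" \
  --task-description "Predict customer churn from usage data" \
  --ml-dir "ml/churn" \
  --data-source "data/churn.csv" \
  --templates-dir ~/.claude/plugins/claude-turing/templates
```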
package/commands/logbook.md ADDED
@@ -0,0 +1,51 @@
+ ---
+ name: logbook
+ description: Generate a research logbook showing the full experiment narrative — hypotheses proposed, experiments run, decisions made, and progress over time. Outputs HTML (with interactive chart) or markdown.
+ disable-model-invocation: true
+ argument-hint: "[--since YYYY-MM-DD] [--format html|markdown] [--output path]"
+ allowed-tools: Read, Bash(python scripts/*:*, source .venv/bin/activate:*, mkdir:*), Grep, Glob
+ ---
+
+ Generate a research logbook that captures the full narrative of the experiment campaign.
+
+ ## Steps
+
+ 1. **Generate the logbook:**
+ ```bash
+ source .venv/bin/activate && python scripts/generate_logbook.py
+ ```
+
+ **With options from `$ARGUMENTS`:**
+ - `--since 2026-03-15` — only include events after this date
+ - `--format markdown` — output as markdown instead of HTML
+ - `--output logbook.html` — write to file instead of stdout
+
+ **Common usage:**
+ ```bash
+ # HTML logbook with interactive trajectory chart
+ source .venv/bin/activate && python scripts/generate_logbook.py --output logbook.html
+
+ # Markdown for embedding in docs or READMEs
+ source .venv/bin/activate && python scripts/generate_logbook.py --format markdown --output logbook.md
+
+ # Last week's activity
+ source .venv/bin/activate && python scripts/generate_logbook.py --since 2026-03-24 --output logbook.html
+ ```
+
+ 2. **Present the result:**
+ - If HTML: tell the user to open the file in their browser. The logbook includes an interactive Chart.js trajectory visualization.
+ - If markdown: display inline or note the output file location.
+
+ ## What the Logbook Contains
+
+ - **Campaign summary:** total experiments, keep rate, best metric, hypothesis count
+ - **Improvement trajectory:** interactive line chart showing metric progression and best-so-far envelope
+ - **Experiment log:** every experiment with ID, description, metric value, status (kept/discarded), date
+ - **Hypothesis queue:** every hypothesis with source (human/agent/literature), status, priority
+
+ ## When to Use
+
+ - To share progress with collaborators
+ - Before and after meetings to show what was tried
+ - To archive a completed research campaign
+ - To track progress over a specific time period
package/commands/mode.md ADDED
@@ -0,0 +1,43 @@
+ ---
+ name: mode
+ description: Set the research strategy mode — explore (try new things), exploit (refine what works), or replicate (verify results). Drives novelty guard policy and agent behavior.
+ disable-model-invocation: true
+ argument-hint: "<explore|exploit|replicate>"
+ ---
+
+ Set the research mode for the current project. The mode determines how the novelty guard filters proposed experiments and how the agent prioritizes its work.
+
+ ## Modes
+
+ | Mode | Novelty Guard Policy | Agent Behavior |
+ |------|---------------------|----------------|
+ | **explore** | Allow novel ideas, block repeats and follow-ups | Try fundamentally different approaches |
+ | **exploit** | Allow follow-ups and known successes, block repeats | Refine the current best configuration |
+ | **replicate** | Allow duplicate runs, block novel ideas | Re-run best experiments with different seeds |
+
+ ## Steps
+
+ 1. **Parse mode** from `$ARGUMENTS`. Must be one of: `explore`, `exploit`, `replicate`.
+
+ 2. **Update experiment state:**
+ ```bash
+ source .venv/bin/activate
+ python -c "
+ import yaml
+ from pathlib import Path
+ path = Path('experiment_state.yaml')
+ state = yaml.safe_load(path.read_text()) if path.exists() else {}
+ state['research_mode'] = '$ARGUMENTS'
+ path.write_text(yaml.dump(state, default_flow_style=False))
+ print(f'Research mode set to: $ARGUMENTS')
+ "
+ ```
+
+ 3. **Confirm** with guidance:
+ - `explore`: "The agent will prioritize novel ideas and avoid follow-ups. Best when the current approach feels exhausted."
+ - `exploit`: "The agent will refine the current best. Best when you have a promising direction."
+ - `replicate`: "The agent will re-run experiments for statistical verification. Best before declaring a winner."
+
+ ## Default
+
+ The default mode is `exploit` (refine what works). Change to `explore` when plateauing, `replicate` before final decisions.
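Because the mode is persisted as a plain key in `experiment_state.yaml` by the snippet above, checking the active mode afterwards is a one-liner, for example:

```bash
# Show the currently configured research mode; fall back to the documented default if unset
grep research_mode experiment_state.yaml || echo "research_mode: exploit (default)"
```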
package/commands/poster.md ADDED
@@ -0,0 +1,89 @@
+ ---
+ name: poster
+ description: Generate a single-page HTML research poster summarizing the experiment campaign — best result, trajectory, key findings, and methodology. Adapted from posterskill's self-contained HTML architecture.
+ disable-model-invocation: true
+ argument-hint: "[title override]"
+ allowed-tools: Read, Write, Edit, Bash(python scripts/*:*, source .venv/bin/activate:*, mkdir:*, open:*), Grep, Glob
+ ---
+
+ Generate a research poster summarizing the experiment campaign as a single self-contained HTML file. Adapted from [posterskill](https://github.com/ethanweber/posterskill)'s architecture — no build step, works when opened as `file://`.
+
+ ## Steps
+
+ ### 1. Gather Data
+
+ Read the experiment history and project context:
+
+ ```bash
+ cat config.yaml
+ source .venv/bin/activate && python scripts/generate_brief.py
+ source .venv/bin/activate && python scripts/show_metrics.py --last 20
+ cat experiment_state.yaml 2>/dev/null || true
+ cat RESEARCH_PLAN.md 2>/dev/null || true
+ ```
+
+ From this, extract:
+ - **Title:** from config task description (or `$ARGUMENTS` override)
+ - **Best result:** metric name, value, experiment ID
+ - **Improvement trajectory:** metric values over experiments
+ - **Key findings:** what model families worked, what didn't, what was surprising
+ - **Methodology:** the experiment loop, evaluation strategy, convergence criteria
+ - **Campaign stats:** total experiments, keep rate, time span
+
+ ### 2. Generate the Poster HTML
+
+ Create `poster/index.html` — a self-contained HTML file with:
+
+ ```bash
+ mkdir -p poster
+ ```
+
+ **Structure the poster with these cards:**
+
+ | Card | Content |
+ |------|---------|
+ | **Header** | Title, "Autonomous ML Research Campaign", date range, best metric badge |
+ | **Objective** | Task description and success criteria from config |
+ | **Methodology** | The autoresearch loop: hypothesize → train → evaluate → decide. Mention immutable evaluation, git-disciplined rollback |
+ | **Trajectory** | Chart.js line chart of metric progression (embed data inline) |
+ | **Best Configuration** | Model type, hyperparameters, metric values from best experiment |
+ | **Key Findings** | 3-5 bullet points: what worked, what didn't, surprises |
+ | **Explored Approaches** | Table of model families tried with keep rates |
+ | **Campaign Stats** | Total experiments, keep rate, human vs agent hypotheses, convergence |
+
+ **Design principles (from posterskill):**
+ - Single self-contained HTML file, CDN dependencies only (Chart.js, Google Fonts)
+ - Print-optimized CSS (`@media print`, `@page` with poster dimensions)
+ - Card-based layout with colored top borders
+ - Clean typography (system fonts or Nunito from Google Fonts)
+ - Data embedded directly in the HTML as JSON — no external file dependencies
+
+ **Poster dimensions:** Default A1 landscape (841mm x 594mm). The user can print to PDF from their browser.
+
+ ### 3. Self-Critique
+
+ Review the generated poster:
+ - Does the trajectory chart render correctly with the embedded data?
+ - Are the key findings specific and data-grounded (not generic)?
+ - Is the best configuration complete (model type + all relevant hyperparameters)?
+ - Would a collaborator understand the campaign from this single page?
+
+ Fix any issues found.
+
+ ### 4. Present
+
+ ```
+ Research poster generated at poster/index.html
+
+ Open in your browser to view. Print to PDF for sharing.
+ Best result: <metric>=<value> (<experiment_id>)
+ Campaign: <N> experiments, <keep_rate>% keep rate
+ ```
+
+ Suggest: "Open `poster/index.html` in your browser. Use Ctrl+P / Cmd+P to save as PDF."
+
+ ## Integration
+
+ - The poster reads from the same data sources as `/turing:brief` and `/turing:logbook`
+ - For a more detailed view, use `/turing:logbook` (full experiment-by-experiment narrative)
+ - For a quick summary, use `/turing:brief` (text-only intelligence report)