claude-turing 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +34 -0
- package/LICENSE +21 -0
- package/README.md +457 -0
- package/agents/ml-evaluator.md +43 -0
- package/agents/ml-researcher.md +74 -0
- package/bin/cli.js +46 -0
- package/bin/turing-init.sh +57 -0
- package/commands/brief.md +83 -0
- package/commands/compare.md +24 -0
- package/commands/design.md +97 -0
- package/commands/init.md +123 -0
- package/commands/logbook.md +51 -0
- package/commands/mode.md +43 -0
- package/commands/poster.md +89 -0
- package/commands/preflight.md +75 -0
- package/commands/report.md +97 -0
- package/commands/rules/loop-protocol.md +91 -0
- package/commands/status.md +24 -0
- package/commands/suggest.md +95 -0
- package/commands/sweep.md +45 -0
- package/commands/train.md +66 -0
- package/commands/try.md +63 -0
- package/commands/turing.md +54 -0
- package/commands/validate.md +34 -0
- package/config/defaults.yaml +45 -0
- package/config/experiment_archetypes.yaml +127 -0
- package/config/lifecycle.toml +31 -0
- package/config/novelty_aliases.yaml +107 -0
- package/config/relationships.toml +125 -0
- package/config/state.toml +24 -0
- package/config/task_taxonomy.yaml +110 -0
- package/config/taxonomy.toml +37 -0
- package/package.json +54 -0
- package/src/claude-md.js +55 -0
- package/src/install.js +107 -0
- package/src/paths.js +20 -0
- package/src/postinstall.js +22 -0
- package/src/verify.js +109 -0
- package/templates/MEMORY.md +36 -0
- package/templates/README.md +93 -0
- package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
- package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
- package/templates/config.yaml +48 -0
- package/templates/evaluate.py +237 -0
- package/templates/features/__init__.py +0 -0
- package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
- package/templates/features/featurizers.py +138 -0
- package/templates/prepare.py +171 -0
- package/templates/program.md +216 -0
- package/templates/pyproject.toml +8 -0
- package/templates/requirements.txt +8 -0
- package/templates/scripts/__init__.py +0 -0
- package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
- package/templates/scripts/check_convergence.py +230 -0
- package/templates/scripts/compare_runs.py +124 -0
- package/templates/scripts/critique_hypothesis.py +350 -0
- package/templates/scripts/experiment_index.py +288 -0
- package/templates/scripts/generate_brief.py +389 -0
- package/templates/scripts/generate_logbook.py +423 -0
- package/templates/scripts/log_experiment.py +243 -0
- package/templates/scripts/manage_hypotheses.py +543 -0
- package/templates/scripts/novelty_guard.py +343 -0
- package/templates/scripts/parse_metrics.py +139 -0
- package/templates/scripts/post-train-hook.sh +74 -0
- package/templates/scripts/preflight.py +549 -0
- package/templates/scripts/scaffold.py +409 -0
- package/templates/scripts/show_environment.py +92 -0
- package/templates/scripts/show_experiment_tree.py +144 -0
- package/templates/scripts/show_families.py +133 -0
- package/templates/scripts/show_metrics.py +157 -0
- package/templates/scripts/statistical_compare.py +259 -0
- package/templates/scripts/stop-hook.sh +34 -0
- package/templates/scripts/suggest_next.py +301 -0
- package/templates/scripts/sweep.py +276 -0
- package/templates/scripts/synthesize_decision.py +300 -0
- package/templates/scripts/turing_io.py +76 -0
- package/templates/scripts/update_state.py +296 -0
- package/templates/scripts/validate_stability.py +167 -0
- package/templates/scripts/verify_placeholders.py +119 -0
- package/templates/sweep_config.yaml +14 -0
- package/templates/tests/__init__.py +0 -0
- package/templates/tests/conftest.py +91 -0
- package/templates/train.py +240 -0
package/bin/cli.js
ADDED
@@ -0,0 +1,46 @@
+#!/usr/bin/env node
+import { createRequire } from "module";
+const require = createRequire(import.meta.url);
+const { Command } = require("commander");
+const pkg = require("../package.json");
+
+const program = new Command();
+
+program
+  .name("claude-turing")
+  .description(pkg.description)
+  .version(pkg.version);
+
+program
+  .command("install")
+  .description("Install turing commands and agents to Claude Code")
+  .option("--global", "Install globally (~/.claude/)")
+  .option("--project", "Install for current project (.claude/)")
+  .action(async (opts) => {
+    const { install } = await import("../src/install.js");
+    await install(opts);
+  });
+
+program
+  .command("verify")
+  .description("Verify turing installation is complete")
+  .option("--scope <scope>", "Check a specific scope (global|project)")
+  .action(async (opts) => {
+    const { verify } = await import("../src/verify.js");
+    await verify(opts);
+  });
+
+program
+  .command("init [name] [dir]")
+  .description("Scaffold ML project (CLI mode, non-Claude-Code usage)")
+  .action(async (name, dir) => {
+    const { execSync } = await import("child_process");
+    const { dirname, join } = await import("path");
+    const { fileURLToPath } = await import("url");
+    const __dirname = dirname(fileURLToPath(import.meta.url));
+    const script = join(__dirname, "turing-init.sh");
+    const args = [name, dir].filter(Boolean).join(" ");
+    execSync(`bash "${script}" ${args}`, { stdio: "inherit" });
+  });
+
+program.parse();
package/bin/turing-init.sh
ADDED
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+# Turing CLI init script.
+# Thin wrapper around scripts/scaffold.py — the unified scaffolding
+# implementation that both CLI and Claude Code use.
+#
+# Usage:
+#   turing-init [project_name] [ml_dir]
+#   turing-init --interactive
+#
+# For Claude Code usage, use /turing:init instead.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PLUGIN_DIR="$(dirname "$SCRIPT_DIR")"
+TEMPLATES_DIR="${PLUGIN_DIR}/templates"
+SCAFFOLD_SCRIPT="${TEMPLATES_DIR}/scripts/scaffold.py"
+
+# Check scaffold script exists
+if [[ ! -f "$SCAFFOLD_SCRIPT" ]]; then
+  echo "Error: scaffold.py not found at ${SCAFFOLD_SCRIPT}" >&2
+  echo "Ensure the Turing plugin is installed correctly." >&2
+  exit 1
+fi
+
+echo "╔══════════════════════════════════════════╗"
+echo "║   Turing ML Research Harness             ║"
+echo "║   Autonomous Experiment Infrastructure   ║"
+echo "╚══════════════════════════════════════════╝"
+echo ""
+
+# If no args or --interactive, run interactive mode
+if [[ $# -eq 0 ]] || [[ "${1:-}" == "--interactive" ]]; then
+  python3 "$SCAFFOLD_SCRIPT" --interactive --templates-dir "$TEMPLATES_DIR" --no-venv
+  echo ""
+  echo "  To set up the virtual environment:"
+  echo "  cd <ml_dir> && python3 -m venv .venv && source .venv/bin/activate && pip install -r requirements.txt"
+  exit 0
+fi
+
+# Positional args mode: turing-init <project_name> [ml_dir]
+PROJECT_NAME="${1:-my-ml-project}"
+ML_DIR="${2:-ml/${PROJECT_NAME}}"
+
+python3 "$SCAFFOLD_SCRIPT" \
+  --project-name "$PROJECT_NAME" \
+  --target-metric "accuracy" \
+  --metric-direction "higher" \
+  --task-description "ML task for ${PROJECT_NAME}" \
+  --ml-dir "$ML_DIR" \
+  --data-source "data/training.csv" \
+  --templates-dir "$TEMPLATES_DIR" \
+  --no-venv --no-hooks
+
+echo ""
+echo "Note: Run with --interactive for full setup (metric, data source, etc.)"
+echo "Or use Claude Code: /turing:init"
package/commands/brief.md
ADDED
@@ -0,0 +1,83 @@
+---
+name: brief
+description: Generate a structured research intelligence report from experiment history — what's been learned, what's promising, what's exhausted, and what the human should consider next. Use --deep for literature-grounded suggestions.
+disable-model-invocation: true
+argument-hint: "[--deep]"
+allowed-tools: Read, Bash(python scripts/*:*, source .venv/bin/activate:*), Grep, Glob, WebSearch, WebFetch
+---
+
+Generate a research briefing that a human can read in 2 minutes and immediately decide what to inject next.
+
+## Steps
+
+1. **Generate the briefing:**
+   ```bash
+   source .venv/bin/activate && python scripts/generate_brief.py
+   ```
+
+2. **Self-critique the briefing** before presenting. Review the generated output and check:
+   - **Recommendations specificity:** Are they concrete enough to act on? "Try a different model" is bad. "Try LightGBM with leaf-wise growth because exp-004 showed depth sensitivity" is good. If vague, rewrite them with specific model/hyperparameter suggestions grounded in the experiment data.
+   - **Exhausted directions coverage:** Cross-reference the "Model Types Explored" section against `experiments/log.jsonl`. Are there discarded experiments missing from the summary? If so, add them.
+   - **Convergence estimate grounding:** If the briefing says "close to convergence" or "further improvement possible", verify against the actual metric trajectory. Is the claim supported by the numbers?
+   - **Metric accuracy:** Spot-check that the "Current Best" metrics match the actual log. Run `python scripts/show_metrics.py --last 1` if uncertain.
+
+   If any section fails the check, regenerate just that section. Max 1 revision round — don't over-polish.
+
+3. **Present the output** to the user. The briefing has 6 sections:
+   - **Campaign Summary** — total experiments, keep rate, timespan
+   - **Current Best** — model type, metrics, experiment ID, configuration
+   - **Improvement Trajectory** — metric over time, rate of improvement
+   - **Model Types Explored** — which approaches have been tried and their hit rates
+   - **Hypothesis Queue** — pending and completed hypotheses
+   - **Recommendations** — data-driven next steps
+
+4. **If `$ARGUMENTS` contains `--deep`:** run the Literature-Grounded Suggestions step below.
+
+5. **Prompt for action:**
+   - "Want to inject a hypothesis? Use `/turing:try <idea>`"
+   - "Want to continue training? Use `/turing:train`"
+   - "Want literature-backed suggestions? Use `/turing:brief --deep`"
+
+## Literature-Grounded Suggestions (--deep flag)
+
+When `--deep` is requested, add a 7th section: **Literature-Grounded Suggestions**.
+
+### Steps:
+
+1. **Read context:** Read `config.yaml` and the briefing output to understand:
+   - What task type this is (tabular classification, time series, etc.)
+   - Which model families have been exhausted (from "Model Types Explored")
+   - Where improvement has plateaued (from "Improvement Trajectory")
+   - What failure patterns keep recurring
+
+2. **Search literature** with `WebSearch` for techniques that address the specific stagnation:
+   - If plateaued: "improve [task type] accuracy beyond [current metric] 2024"
+   - If overfitting: "regularization techniques [model family] [task type]"
+   - If all models tried: "state of the art [task type] benchmark 2024 2025"
+
+3. **Distill 3-5 suggestions** from the literature, each with:
+   - **Technique:** specific and actionable
+   - **Source:** paper or article URL
+   - **Why now:** how it addresses the specific stagnation point
+   - **Impact estimate:** high/medium/low
+   - **Complexity:** low/medium/high
+
+4. **Queue suggestions** as hypotheses:
+   ```bash
+   source .venv/bin/activate && python scripts/manage_hypotheses.py add "<technique>: <rationale> (source: <citation>)" --priority medium --source literature
+   ```
+
+5. **Format as a section** appended to the briefing.
+
+## Saving Briefs
+
+```bash
+mkdir -p briefs && python scripts/generate_brief.py > briefs/brief-$(date +%Y-%m-%d).md
+```
+
+## When to Use
+
+- After a training session completes or converges
+- Before injecting new hypotheses (to understand what's already been tried)
+- When returning to a project after time away
+- **With `--deep`:** when the agent seems stuck and you want evidence-based direction
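The briefing's Campaign Summary and Current Best figures come from `experiments/log.jsonl`. A minimal sketch of that derivation, assuming each log line is a JSON object with `status`, `metrics`, and `experiment_id` fields (the shipped `generate_brief.py` is listed in the manifest above but not shown in this excerpt):

```python
# Sketch: derive campaign stats from experiments/log.jsonl.
# The field names (status, metrics, experiment_id) are assumptions about
# the log schema, not confirmed by the files shown in this diff.
import json
from pathlib import Path

def campaign_summary(log_path="experiments/log.jsonl",
                     metric="accuracy", higher_is_better=True):
    runs = [json.loads(line)
            for line in Path(log_path).read_text().splitlines() if line.strip()]
    if not runs:
        return None
    kept = [r for r in runs if r.get("status") == "kept"]
    scored = [r for r in runs if metric in r.get("metrics", {})]
    best = (max if higher_is_better else min)(
        scored, key=lambda r: r["metrics"][metric], default=None)
    return {
        "total_experiments": len(runs),
        "keep_rate": len(kept) / len(runs),
        "best": None if best is None else {
            "experiment_id": best.get("experiment_id"),
            metric: best["metrics"][metric],
        },
    }
```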
package/commands/compare.md
ADDED
@@ -0,0 +1,24 @@
+---
+name: compare
+description: Compare two ML experiment runs side-by-side — metrics, configuration deltas, and a verdict on which approach is more promising.
+disable-model-invocation: true
+argument-hint: "<exp-id-1> <exp-id-2>"
+allowed-tools: Read, Bash(*), Grep, Glob
+---
+
+Compare two ML experiment runs side-by-side to understand what changed and why one performed better.
+
+## Steps
+
+1. **Run comparison:**
+   ```bash
+   source .venv/bin/activate && python scripts/compare_runs.py $1 $2
+   ```
+
+2. **Analyze the delta:**
+   - **Metric differences:** all configured metrics for both runs
+   - **Configuration delta:** what changed (model type, hyperparameters, features)
+   - **Causal analysis:** which changes likely caused the metric difference
+   - **Verdict:** which approach is more promising for future experiments
+
+3. **If either ID is missing:** report the error and suggest `/turing:status` to see available experiment IDs.
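A minimal sketch of the "configuration delta" that step 2 asks for, assuming each run record carries a flat `config` dict; the shipped `compare_runs.py` is not shown in this excerpt:

```python
# Sketch: flat config diff between two experiment records (schema assumed).
def config_delta(cfg_a: dict, cfg_b: dict) -> dict:
    """Return {key: (value_in_a, value_in_b)} for every differing key."""
    keys = set(cfg_a) | set(cfg_b)
    return {k: (cfg_a.get(k), cfg_b.get(k))
            for k in sorted(keys) if cfg_a.get(k) != cfg_b.get(k)}

# Example: only the changed hyperparameters surface in the verdict.
delta = config_delta(
    {"model": "xgboost", "max_depth": 6, "lr": 0.1},
    {"model": "lightgbm", "max_depth": 6, "lr": 0.05},
)
# {"lr": (0.1, 0.05), "model": ("xgboost", "lightgbm")}
```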
package/commands/design.md
ADDED
@@ -0,0 +1,97 @@
+---
+name: design
+description: Generate a structured experiment design for a hypothesis. Reads experiment history, searches literature for methodology, produces a scored design document at experiments/designs/.
+disable-model-invocation: true
+argument-hint: "<hypothesis-id or description>"
+allowed-tools: Read, Write, Bash(python scripts/*:*, source .venv/bin/activate:*, mkdir:*), Grep, Glob, WebSearch, WebFetch
+---
+
+Front-load the thinking before the coding. Given a hypothesis, produce a structured experiment design grounded in methodology from the literature.
+
+## Steps
+
+### 1. Load Context
+
+If `$ARGUMENTS` matches `hyp-NNN`, load the hypothesis:
+```bash
+source .venv/bin/activate && python scripts/manage_hypotheses.py show $ARGUMENTS
+```
+
+If freeform text, use it directly as the hypothesis description.
+
+Read the current config and experiment state:
+```bash
+cat config.yaml
+```
+```bash
+source .venv/bin/activate && python scripts/show_metrics.py --last 10 2>/dev/null || echo "No experiments yet"
+```
+```bash
+cat experiment_state.yaml 2>/dev/null || echo "No experiment state yet"
+```
+
+### 2. Search for Methodology
+
+Use `WebSearch` to find 2-3 papers or articles describing how to implement the proposed change effectively. Target:
+- The specific technique in the hypothesis (e.g., "LightGBM dart boosting implementation best practices")
+- Common pitfalls for this type of change
+- Benchmark results showing expected improvement range
+
+Use `WebFetch` on the most relevant results to extract specific methodology details: hyperparameter recommendations, training procedures, evaluation approaches.
+
+### 3. Write the Design Document
+
+Create `experiments/designs/<hyp-id>-design.md` (or `experiments/designs/adhoc-<date>-design.md` for freeform hypotheses):
+
+```bash
+mkdir -p experiments/designs
+```
+
+Write with this structure:
+
+```markdown
+# Experiment Design: <hypothesis summary>
+
+## Hypothesis
+<full description>
+
+## Objective
+<what we're testing, stated as a falsifiable claim>
+
+## Method
+<specific changes, grounded in literature findings>
+
+## Literature Support
+- <source 1>: <what it says about this approach>
+- <source 2>: <relevant finding>
+
+## Implementation Plan
+### Changes to train.py
+<concrete code changes needed>
+
+### Changes to config.yaml (if any)
+<hyperparameter values to set, with rationale from literature>
+
+## Expected Outcome
+- **Success:** <metric > threshold, specific number>
+- **Failure:** <what would disprove the hypothesis>
+
+## Risks
+<specific pitfalls from literature, not generic "might not work">
+
+## Estimated Runs
+<how many iterations>
+```
+
+### 4. Self-Critique
+
+Review the design:
+- Is the implementation plan specific enough for the researcher agent to execute without ambiguity?
+- Does the expected outcome have a concrete metric threshold?
+- Are risks actionable?
+
+Score each dimension 1-10 (feasibility, novelty, clarity). If any < 7, revise that section. Max 2 revision rounds.
+
+### 5. Report
+
+Display the design summary with scores and file location. The researcher agent can read the design during `/turing:train`.
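Step 4's revision gate is simple enough to state as code. A sketch, where `score_design` and `revise` are hypothetical stand-ins for the model's own critique and rewrite passes:

```python
# Sketch of the design self-critique gate: revise any dimension scoring
# below 7, for at most 2 rounds. score_design/revise are hypothetical.
THRESHOLD, MAX_ROUNDS = 7, 2

def critique_loop(design, score_design, revise):
    for _ in range(MAX_ROUNDS):
        scores = score_design(design)  # e.g. {"feasibility": 8, "novelty": 6, "clarity": 9}
        weak = [dim for dim, s in scores.items() if s < THRESHOLD]
        if not weak:
            break
        design = revise(design, weak)  # rewrite only the weak sections
    return design
```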
package/commands/init.md
ADDED
@@ -0,0 +1,123 @@
+---
+name: init
+description: Initialize a new ML project with the Turing autoresearch harness. Scaffolds the full experiment infrastructure — immutable evaluation pipeline, agent-editable training code, structured logging, convergence detection hooks, and a Python virtual environment. Use --plan to generate a research plan.
+disable-model-invocation: true
+argument-hint: "[project_name] [--plan]"
+allowed-tools: Read, Write, Edit, Bash(*), Grep, Glob, WebSearch, WebFetch
+---
+
+Scaffold a new ML project with the Turing autoresearch harness. This creates the separation between the measurement apparatus (READ-ONLY) and the hypothesis space (AGENT-EDITABLE) that makes autonomous experimentation trustworthy.
+
+## Interactive Setup
+
+Ask the user for the following (or accept from `$ARGUMENTS` if provided as JSON):
+
+1. **Project name** (`{{PROJECT_NAME}}`): Name of the ML project (e.g., "sentiment", "churn", "fraud-detection")
+2. **Target metric** (`{{TARGET_METRIC}}`): Primary metric to optimize (e.g., "accuracy", "f1", "mae", "mse", "auc")
+3. **Metric direction**: Is lower better (mae, mse, loss) or higher better (accuracy, f1, auc)?
+4. **Task description** (`{{TASK_DESCRIPTION}}`): What the model does (e.g., "Predict customer churn from usage data")
+5. **ML directory** (`{{ML_DIR}}`): Where ML files go relative to project root (e.g., "ml/sentiment")
+6. **Data source** (`{{DATA_SOURCE}}`): Where training data comes from (e.g., "data/reviews.csv")
+
+## Scaffolding
+
+Once you have all 6 values, delegate to the unified scaffolding script:
+
+```bash
+python3 <templates_dir>/scripts/scaffold.py \
+  --project-name "<project_name>" \
+  --target-metric "<target_metric>" \
+  --metric-direction "<metric_direction>" \
+  --task-description "<task_description>" \
+  --ml-dir "<ml_dir>" \
+  --data-source "<data_source>" \
+  --templates-dir "<templates_dir>"
+```
+
+The scaffold script handles everything in a single atomic operation:
+- Copies all template files with placeholder substitution
+- Creates data/, experiments/, models/ directories
+- Sets up agent memory at `.claude/agent-memory/ml-researcher/MEMORY.md`
+- Configures Claude Code hooks in `.claude/settings.local.json`
+- Creates Python virtual environment and installs requirements
+- Verifies all placeholders were replaced (fails loudly if any remain)
+
+## Locating Templates
+
+Find the templates directory using Glob:
+```
+~/.claude/plugins/*/templates/
+```
+Or check if installed via npm by looking for `node_modules/claude-turing/templates/`.
+
+## After Scaffolding
+
+Report what was created:
+- The separation: READ-ONLY (`prepare.py`, `evaluate.py`) vs AGENT-EDITABLE (`train.py`)
+- Next steps: add data to the configured data source path, run `python prepare.py`, then `/turing:train`
+- The taste-leverage loop: `/turing:try` to inject hypotheses, `/turing:brief` for intelligence reports
+
+## Research Plan Generation (--plan flag)
+
+If `$ARGUMENTS` contains `--plan`, generate a research plan AFTER scaffolding. This gives the agent strategic direction for its first 5-10 experiments rather than ad-hoc exploration.
+
+### Steps:
+
+1. **Read the task context** from the just-created `config.yaml`: task description, model type, target metric, data source.
+
+2. **Search literature** with `WebSearch` for the task domain:
+   - "state of the art <task description> machine learning 2024 2025"
+   - "best model <target metric> <data type> benchmark"
+   - "<task description> common approaches survey"
+
+   Use `WebFetch` on top 2-3 results to extract: dominant model families, typical metric ranges, known challenges.
+
+3. **Generate `RESEARCH_PLAN.md`** in the ML project directory with this structure:
+
+   ```markdown
+   # Research Plan: <task description>
+
+   Generated: <date>
+
+   ## Task Summary
+   <one paragraph describing the task, data, and success criteria>
+
+   ## Model Families to Explore
+   Ordered by expected relevance based on literature:
+   1. **<family 1>** — <why, with citation>
+   2. **<family 2>** — <why, with citation>
+   3. **<family 3>** — <why, with citation>
+
+   ## Evaluation Strategy
+   - Primary metric: <metric> (<higher/lower> is better)
+   - Multi-run recommendation: <yes/no, based on expected variance>
+   - Baseline target: <realistic first-pass metric from literature>
+
+   ## Search Budget
+   - <N> experiments per model family before moving on
+   - Total budget: <N> experiments before first convergence check
+
+   ## Success Criteria
+   - Target metric: <value from literature benchmarks>
+   - Convergence: <patience> consecutive non-improvements
+
+   ## Known Challenges
+   - <challenge 1 from literature, e.g., "class imbalance common in this domain">
+   - <challenge 2>
+
+   ## Sources
+   - <citation 1>
+   - <citation 2>
+   ```
+
+4. **Self-critique the plan** (one round):
+   - Are the model families ordered by evidence strength?
+   - Is the budget realistic?
+   - Are the success criteria grounded in benchmark data?
+   Revise if any section is vague or unsupported.
+
+5. **Report:** "Research plan generated at `<ml_dir>/RESEARCH_PLAN.md`. The agent will read this during `/turing:train` for strategic direction."
+
+### Integration
+
+The agent's `program.md` OBSERVE step reads `RESEARCH_PLAN.md` (if it exists) for strategic direction. The plan is advisory — the agent can deviate but should note why in `experiment_state.yaml`.
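The scaffolding contract above (substitute `{{...}}` placeholders, then fail loudly if any remain) might look like the following sketch; the real logic lives in `scaffold.py` and `verify_placeholders.py`, which this excerpt does not show:

```python
# Sketch: placeholder substitution with loud verification, per the
# "fails loudly if any remain" contract described above.
import re
from pathlib import Path

PLACEHOLDER = re.compile(r"\{\{([A-Z_]+)\}\}")

def render_template(src: Path, dest: Path, values: dict) -> None:
    text = src.read_text()
    # Substitute known values; leave unknown placeholders in place.
    text = PLACEHOLDER.sub(lambda m: values.get(m.group(1), m.group(0)), text)
    leftover = PLACEHOLDER.findall(text)
    if leftover:
        raise SystemExit(f"{src}: unreplaced placeholders: {sorted(set(leftover))}")
    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_text(text)

# Usage with the six values collected during interactive setup, e.g.:
# render_template(Path("templates/config.yaml"), Path("ml/churn/config.yaml"),
#                 {"PROJECT_NAME": "churn", "TARGET_METRIC": "f1", ...})
```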
package/commands/logbook.md
ADDED
@@ -0,0 +1,51 @@
+---
+name: logbook
+description: Generate a research logbook showing the full experiment narrative — hypotheses proposed, experiments run, decisions made, and progress over time. Outputs HTML (with interactive chart) or markdown.
+disable-model-invocation: true
+argument-hint: "[--since YYYY-MM-DD] [--format html|markdown] [--output path]"
+allowed-tools: Read, Bash(python scripts/*:*, source .venv/bin/activate:*, mkdir:*), Grep, Glob
+---
+
+Generate a research logbook that captures the full narrative of the experiment campaign.
+
+## Steps
+
+1. **Generate the logbook:**
+   ```bash
+   source .venv/bin/activate && python scripts/generate_logbook.py
+   ```
+
+   **With options from `$ARGUMENTS`:**
+   - `--since 2026-03-15` — only include events after this date
+   - `--format markdown` — output as markdown instead of HTML
+   - `--output logbook.html` — write to file instead of stdout
+
+   **Common usage:**
+   ```bash
+   # HTML logbook with interactive trajectory chart
+   source .venv/bin/activate && python scripts/generate_logbook.py --output logbook.html
+
+   # Markdown for embedding in docs or READMEs
+   source .venv/bin/activate && python scripts/generate_logbook.py --format markdown --output logbook.md
+
+   # Last week's activity
+   source .venv/bin/activate && python scripts/generate_logbook.py --since 2026-03-24 --output logbook.html
+   ```
+
+2. **Present the result:**
+   - If HTML: tell the user to open the file in their browser. The logbook includes an interactive Chart.js trajectory visualization.
+   - If markdown: display inline or note the output file location.
+
+## What the Logbook Contains
+
+- **Campaign summary:** total experiments, keep rate, best metric, hypothesis count
+- **Improvement trajectory:** interactive line chart showing metric progression and best-so-far envelope
+- **Experiment log:** every experiment with ID, description, metric value, status (kept/discarded), date
+- **Hypothesis queue:** every hypothesis with source (human/agent/literature), status, priority
+
+## When to Use
+
+- To share progress with collaborators
+- Before and after meetings to show what was tried
+- To archive a completed research campaign
+- To track progress over a specific time period
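The `--since` option implies date filtering over the event log. A sketch assuming ISO-8601 `timestamp` fields in `experiments/log.jsonl`; the shipped `generate_logbook.py` is not shown here:

```python
# Sketch: filter logged events by a --since date (timestamp field assumed).
import json
from datetime import date, datetime
from pathlib import Path

def events_since(log_path: str, since: str) -> list[dict]:
    cutoff = date.fromisoformat(since)  # e.g., "2026-03-24"
    events = []
    for line in Path(log_path).read_text().splitlines():
        if not line.strip():
            continue
        event = json.loads(line)
        when = datetime.fromisoformat(event["timestamp"]).date()
        if when >= cutoff:
            events.append(event)
    return events
```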
package/commands/mode.md
ADDED
@@ -0,0 +1,43 @@
+---
+name: mode
+description: Set the research strategy mode — explore (try new things), exploit (refine what works), or replicate (verify results). Drives novelty guard policy and agent behavior.
+disable-model-invocation: true
+argument-hint: "<explore|exploit|replicate>"
+---
+
+Set the research mode for the current project. The mode determines how the novelty guard filters proposed experiments and how the agent prioritizes its work.
+
+## Modes
+
+| Mode | Novelty Guard Policy | Agent Behavior |
+|------|---------------------|----------------|
+| **explore** | Allow novel ideas, block repeats and follow-ups | Try fundamentally different approaches |
+| **exploit** | Allow follow-ups and known successes, block repeats | Refine the current best configuration |
+| **replicate** | Allow duplicate runs, block novel ideas | Re-run best experiments with different seeds |
+
+## Steps
+
+1. **Parse mode** from `$ARGUMENTS`. Must be one of: `explore`, `exploit`, `replicate`.
+
+2. **Update experiment state:**
+```bash
+source .venv/bin/activate
+python -c "
+import yaml
+from pathlib import Path
+path = Path('experiment_state.yaml')
+state = yaml.safe_load(path.read_text()) if path.exists() else {}
+state['research_mode'] = '$ARGUMENTS'
+path.write_text(yaml.dump(state, default_flow_style=False))
+print(f'Research mode set to: $ARGUMENTS')
+"
+```
+
+3. **Confirm** with guidance:
+   - `explore`: "The agent will prioritize novel ideas and avoid follow-ups. Best when the current approach feels exhausted."
+   - `exploit`: "The agent will refine the current best. Best when you have a promising direction."
+   - `replicate`: "The agent will re-run experiments for statistical verification. Best before declaring a winner."
+
+## Default
+
+The default mode is `exploit` (refine what works). Change to `explore` when plateauing, `replicate` before final decisions.
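The mode table maps directly onto a guard policy. A sketch of that mapping, where the proposal kinds (`novel`, `follow-up`, `repeat`) are assumed labels from the guard's own classification step; the shipped `novelty_guard.py` is not shown in this excerpt:

```python
# Sketch: per-mode novelty guard policy from the table above.
# A proposal's kind ("novel" | "follow-up" | "repeat") is assumed to come
# from the guard's classification step, which is not shown here.
ALLOWED_KINDS = {
    "explore":   {"novel"},       # block repeats and follow-ups
    "exploit":   {"follow-up"},   # refine known successes; block repeats
    "replicate": {"repeat"},      # allow duplicate runs; block novel ideas
}

def guard_allows(research_mode: str, proposal_kind: str) -> bool:
    # Fall back to the default mode's policy if the mode is unrecognized.
    return proposal_kind in ALLOWED_KINDS.get(research_mode, {"follow-up"})
```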
package/commands/poster.md
ADDED
@@ -0,0 +1,89 @@
+---
+name: poster
+description: Generate a single-page HTML research poster summarizing the experiment campaign — best result, trajectory, key findings, and methodology. Adapted from posterskill's self-contained HTML architecture.
+disable-model-invocation: true
+argument-hint: "[title override]"
+allowed-tools: Read, Write, Edit, Bash(python scripts/*:*, source .venv/bin/activate:*, mkdir:*, open:*), Grep, Glob
+---
+
+Generate a research poster summarizing the experiment campaign as a single self-contained HTML file. Adapted from [posterskill](https://github.com/ethanweber/posterskill)'s architecture — no build step, works when opened as `file://`.
+
+## Steps
+
+### 1. Gather Data
+
+Read the experiment history and project context:
+
+```bash
+cat config.yaml
+source .venv/bin/activate && python scripts/generate_brief.py
+source .venv/bin/activate && python scripts/show_metrics.py --last 20
+cat experiment_state.yaml 2>/dev/null || true
+cat RESEARCH_PLAN.md 2>/dev/null || true
+```
+
+From this, extract:
+- **Title:** from config task description (or `$ARGUMENTS` override)
+- **Best result:** metric name, value, experiment ID
+- **Improvement trajectory:** metric values over experiments
+- **Key findings:** what model families worked, what didn't, what was surprising
+- **Methodology:** the experiment loop, evaluation strategy, convergence criteria
+- **Campaign stats:** total experiments, keep rate, time span
+
+### 2. Generate the Poster HTML
+
+Create `poster/index.html` — a self-contained HTML file with:
+
+```bash
+mkdir -p poster
+```
+
+**Structure the poster with these cards:**
+
+| Card | Content |
+|------|---------|
+| **Header** | Title, "Autonomous ML Research Campaign", date range, best metric badge |
+| **Objective** | Task description and success criteria from config |
+| **Methodology** | The autoresearch loop: hypothesize → train → evaluate → decide. Mention immutable evaluation, git-disciplined rollback |
+| **Trajectory** | Chart.js line chart of metric progression (embed data inline) |
+| **Best Configuration** | Model type, hyperparameters, metric values from best experiment |
+| **Key Findings** | 3-5 bullet points: what worked, what didn't, surprises |
+| **Explored Approaches** | Table of model families tried with keep rates |
+| **Campaign Stats** | Total experiments, keep rate, human vs agent hypotheses, convergence |
+
+**Design principles (from posterskill):**
+- Single self-contained HTML file, CDN dependencies only (Chart.js, Google Fonts)
+- Print-optimized CSS (`@media print`, `@page` with poster dimensions)
+- Card-based layout with colored top borders
+- Clean typography (system fonts or Nunito from Google Fonts)
+- Data embedded directly in the HTML as JSON — no external file dependencies
+
+**Poster dimensions:** Default A1 landscape (841mm x 594mm). The user can print to PDF from their browser.
+
+### 3. Self-Critique
+
+Review the generated poster:
+- Does the trajectory chart render correctly with the embedded data?
+- Are the key findings specific and data-grounded (not generic)?
+- Is the best configuration complete (model type + all relevant hyperparameters)?
+- Would a collaborator understand the campaign from this single page?
+
+Fix any issues found.
+
+### 4. Present
+
+```
+Research poster generated at poster/index.html
+
+Open in your browser to view. Print to PDF for sharing.
+Best result: <metric>=<value> (<experiment_id>)
+Campaign: <N> experiments, <keep_rate>% keep rate
+```
+
+Suggest: "Open `poster/index.html` in your browser. Use Ctrl+P / Cmd+P to save as PDF."
+
+## Integration
+
+- The poster reads from the same data sources as `/turing:brief` and `/turing:logbook`
+- For a more detailed view, use `/turing:logbook` (full experiment-by-experiment narrative)
+- For a quick summary, use `/turing:brief` (text-only intelligence report)
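The "data embedded directly in the HTML as JSON" principle, sketched in Python; the markup, element ID, and CDN tag here are illustrative rather than the plugin's actual poster template:

```python
# Sketch: embed the trajectory inline so the poster works from file://
# (no data fetch; only a CDN script tag). Markup and IDs are illustrative.
import json

def trajectory_card(labels: list[str], values: list[float], metric: str) -> str:
    data = json.dumps({"labels": labels, "values": values})
    return f"""
<canvas id="trajectory"></canvas>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script>
  const d = {data};
  new Chart(document.getElementById("trajectory"), {{
    type: "line",
    data: {{ labels: d.labels,
             datasets: [{{ label: "{metric}", data: d.values }}] }},
  }});
</script>"""

# Usage: trajectory_card(["exp-001", "exp-002"], [0.81, 0.86], "accuracy")
```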